From 26dc5bbe75f5a7491f9bc99d89ed5a3d9c034686 Mon Sep 17 00:00:00 2001 From: hershd23 Date: Fri, 3 Nov 2023 04:25:13 -0400 Subject: [PATCH 1/8] Added custom function for extracting columns from unstructured data new file: ../evadb/functions/extract_columns.py --- evadb/functions/extract_columns.py | 166 +++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 evadb/functions/extract_columns.py diff --git a/evadb/functions/extract_columns.py b/evadb/functions/extract_columns.py new file mode 100644 index 0000000000..7fa6904dc4 --- /dev/null +++ b/evadb/functions/extract_columns.py @@ -0,0 +1,166 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from io import BytesIO + +import numpy as np +import pandas as pd +import json +from retry import retry + +from evadb.catalog.catalog_type import NdArrayType +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe +from evadb.utils.generic_utils import try_to_import_openai +from evadb.utils.logging_manager import logger + + +class ExtractColumnsFunction(AbstractFunction): + @property + def name(self) -> str: + return "EXTRACT_COLUMNS" + + def setup( + self, + model="gpt-3.5-turbo", + temperature: float = 0, + openai_api_key="" + ) -> None: + self.model = model + self.temperature = temperature + self.openai_api_key = openai_api_key + + @forward( + input_signatures=[ + PandasDataframe( + columns=["input_rows"], + column_types=[ + NdArrayType.STR, + ], + column_shapes=[(1,)], + ) + ], + output_signatures=[ + PandasDataframe( + columns=["response"], + column_types=[ + NdArrayType.STR, + ], + column_shapes=[(1,)], + ) + ], + ) + def forward(self, unstructured_df): + """ + NOTE (QUESTION) : Can we structure the inputs and outputs better + The circumvent issues surrounding the input being only one pandas dataframe and output columns being predefined + Will add all column types as a JSON and parse in the forward function + Provide only the file name from which the input will be read + Output in JSON which can be serialized and stored in the results column of the DF + """ + + try_to_import_openai() + import openai + + @retry(tries=6, delay=20) + def completion_with_backoff(**kwargs): + return openai.ChatCompletion.create(**kwargs) + + openai.api_key = self.openai_api_key + # If not found, try OS Environment Variable + if len(openai.api_key) == 0: + openai.api_key = os.environ.get("OPENAI_API_KEY", "") + assert ( + len(openai.api_key) != 0 + ), "Please set your OpenAI API key using SET OPENAI_API_KEY = 'sk-' or environment variable (OPENAI_API_KEY)" + + def generate_structured_data(unstructured_df: PandasDataframe): + results = [] + #column_types = json.loads(unstructured_df[unstructured_df.columns[0]]) + input_rows = unstructured_df[unstructured_df.columns[0]] + + column_types_dict = { + "columns": + [ + { + "name": "Issue Category", + "description": "The category of the issue", + "type": "One of (hardware, software)" + }, + { + "name": "Raw Issue String", + "description": "The raw issue string containing the exact input given by the user", + "type": "string" + }, + { + "name": "Issue Component", + "description": "The component that is causing the issue", + "type": "string" + }, + ] + } + + column_types = json.dumps(column_types_dict) + + base_prompt = """ + You are given a user query. Your task is to extract the following fields from the query and return the result in json format.\n + """ + + # TODO : Check if this is fine or if we need to add column types as string + """ + Not able to add serialized json as input to the column types. Adding a static column types list for now + """ + + for input_row in input_rows: + # TODO : Hardcoding some params for now, will revert later + params = { + "model": self.model, + "temperature": self.temperature, + "messages": [], + } + + def_sys_prompt_message = { + "role": "system", + "content": base_prompt + } + + params["messages"].append(def_sys_prompt_message) + params["messages"].extend( + [ + { + "role": "user", + "content": f"Here are the column types we need the data to be structured in : \n {column_types} \n", + }, + { + "role": "user", + "content": f"Here is the unstructured query which needs to be converted: {input_row}\n", + }, + ], + ) + + logger.info("Params {}".format(params)) + response = completion_with_backoff(**params) + + logger.info("Response {}".format(response)) + answer = response.choices[0].message.content + results.append(answer) + + + return results + + df = pd.DataFrame({"response": generate_structured_data(unstructured_df=unstructured_df)}) + return df From 38f52e3028c39d688ed571f15b8c6e60047f8d81 Mon Sep 17 00:00:00 2001 From: hershd23 Date: Fri, 3 Nov 2023 04:35:10 -0400 Subject: [PATCH 2/8] Adding notebook for structured data conversion new file: 20-structured-data.ipynb --- tutorials/20-structured-data.ipynb | 198 +++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 tutorials/20-structured-data.ipynb diff --git a/tutorials/20-structured-data.ipynb b/tutorials/20-structured-data.ipynb new file mode 100644 index 0000000000..8f4745bf2e --- /dev/null +++ b/tutorials/20-structured-data.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Import dependencies\n", + "import os\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install --quiet \"evadb[document,notebook]\"\n", + "import evadb\n", + "cursor = evadb.connect().cursor()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Set your OpenAI key as an environment variable\n", + "import os\n", + "#os.environ['OPENAI_API_KEY'] = ''\n", + "open_ai_key = os.environ.get(\"OPENAI_API_KEY\", \"\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# set up the extract columns UDF available at functions/extract_columns.py\n", + "cursor.query(\"\"\"CREATE FUNCTION IF NOT EXISTS ExtractColumns\n", + " IMPL '../evadb/functions/extract_columns.py';\n", + " \"\"\").execute()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []\n" + ] + } + ], + "source": [ + "# # delete the table if it already exists\n", + "cursor.query(\"\"\"DROP TABLE IF EXISTS InputUnstructured\n", + " \"\"\").execute()\n", + "\n", + "# create the table specifying the type of the prompt column\n", + "cursor.query(\"\"\"CREATE TABLE IF NOT EXISTS InputUnstructured (\n", + " input_rows TEXT)\n", + " \"\"\").execute()\n", + "\n", + "table = cursor.query(\"SELECT * FROM InputUnstructured;\").df()\n", + "print(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "input_rows_list = [\"The touch screen on my tablet stopped working for no reason.\",\n", + "# \"Why does my computer take so long to start up? It's been like this for weeks.\",\n", + "# \"My phone battery dies too quickly. I just bought it!\",\n", + " \"My headphones won't connect to my phone anymore, even though they used to work just fine.\",\n", + " \"The software update completely messed up my computer. Now nothing works properly.\"]\n", + "\n", + "for input_row in input_rows_list:\n", + " cursor.query(f\"\"\"INSERT INTO InputUnstructured (input_rows) VALUES (\"{input_row}\")\"\"\").execute()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " _row_id input_rows\n", + "0 1 The touch screen on my tablet stopped working ...\n", + "1 2 My headphones won't connect to my phone anymor...\n", + "2 3 The software update completely messed up my co...\n" + ] + } + ], + "source": [ + "table = cursor.query(\"SELECT * FROM InputUnstructured;\").df()\n", + "print(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"Issue Category\": \"hardware\",\n", + " \"Raw Issue String\": \"The touch screen on my tablet stopped working for no reason.\",\n", + " \"Issue Component\": \"touch screen\"\n", + "}\n", + "{\n", + " \"Issue Category\": \"hardware\",\n", + " \"Raw Issue String\": \"My headphones won't connect to my phone anymore, even though they used to work just fine.\",\n", + " \"Issue Component\": \"headphones\"\n", + "}\n", + "{\n", + " \"Issue Category\": \"software\",\n", + " \"Raw Issue String\": \"The software update completely messed up my computer. Now nothing works properly.\",\n", + " \"Issue Component\": \"computer\"\n", + "}\n" + ] + } + ], + "source": [ + "table = cursor.query(\"SELECT ExtractColumns(input_rows) FROM InputUnstructured;\").df()\n", + "\n", + "for _, row in table.iterrows():\n", + " print(row['response'])\n", + "#print(table.iloc[1]['response'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a77dd2607bced617fd45cf60585d69fe5623433b Mon Sep 17 00:00:00 2001 From: Hersh Dhillon Date: Mon, 27 Nov 2023 21:39:42 -0500 Subject: [PATCH 3/8] Added default env path as mentioned in the docs modified: .gitignore Added file to extract on column at a time new file: evadb/functions/extract_column.py Removed the previous implementation deleted: evadb/functions/extract_columns.py Updated the notebook modified: tutorials/20-structured-data.ipynb --- .gitignore | 1 + evadb/functions/extract_column.py | 116 ++++++++++++++++++++ evadb/functions/extract_columns.py | 166 ----------------------------- tutorials/20-structured-data.ipynb | 71 +++++++----- 4 files changed, 161 insertions(+), 193 deletions(-) create mode 100644 evadb/functions/extract_column.py delete mode 100644 evadb/functions/extract_columns.py diff --git a/.gitignore b/.gitignore index 1a091b9894..8d1232ff7d 100644 --- a/.gitignore +++ b/.gitignore @@ -101,6 +101,7 @@ env.bak/ venv.bak/ env38/ env_eva/ +evadb-venv/ # Spyder project settings .spyderproject diff --git a/evadb/functions/extract_column.py b/evadb/functions/extract_column.py new file mode 100644 index 0000000000..6a7c6122e4 --- /dev/null +++ b/evadb/functions/extract_column.py @@ -0,0 +1,116 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from io import BytesIO + +import numpy as np +import pandas as pd +import json +from retry import retry + +from evadb.catalog.catalog_type import NdArrayType +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.chatgpt import ChatGPT +from evadb.utils.generic_utils import try_to_import_openai +from evadb.utils.logging_manager import logger + + +class ExtractColumnFunction(ChatGPT): + @property + def name(self) -> str: + return "EXTRACT_COLUMN" + + def setup( + self, + model="gpt-3.5-turbo", + temperature: float = 0, + openai_api_key="" + ) -> None: + super(ExtractColumnFunction, self).setup(model, temperature, openai_api_key) + + @forward( + input_signatures=[ + PandasDataframe( + columns=[ + "field_name" + "description", + "data_type", + "input_rows" + ], + column_types=[ + NdArrayType.STR, + NdArrayType.STR, + NdArrayType.STR, + NdArrayType.STR, + ], + column_shapes=[ + (1,), + (1,), + (1,), + (1,), + ], + ) + ], + output_signatures=[ + PandasDataframe( + columns=["response"], + column_types=[ + NdArrayType.STR, + ], + column_shapes=[(1,)], + ) + ], + ) + def forward(self, unstructured_df): + """ + NOTE (QUESTION) : Can we structure the inputs and outputs better + The circumvent issues surrounding the input being only one pandas dataframe and output columns being predefined + Will add all column types as a JSON and parse in the forward function + Provide only the file name from which the input will be read + Output in JSON which can be serialized and stored in the results column of the DF + """ + field_name = unstructured_df.iloc[0, 0] + description = unstructured_df.iloc[0, 1] + data_type = unstructured_df.iloc[0, 2] + input_rows = unstructured_df[unstructured_df.columns[3]] + prompt = """ + You are given a user query. Your task is to extract the following fields from the query and return the result in string format. + IMPORTANT: RETURN ONLY THE EXTRACTED VALUE (one word or phrase). DO NOT RETURN THE FIELD NAME OR ANY OTHER INFORMATION. + """ + content = """ + Extract the following fields from the unstructured text below: + + Format of the field is given in the format + Field Name: Field Description: Field Type + {}: {}: {} + + The unstructured text is as follows: + """.format(field_name, description, data_type) + + print(prompt) + print(content) + + output_df = pd.DataFrame({"response": []}) + + for row in input_rows: + query = row + input_df = pd.DataFrame({"query": [query],"content": content, "prompt": prompt}) + print(query) + df = super(ExtractColumnFunction, self).forward(input_df) + output_df = pd.concat([output_df, df], ignore_index=True) + return output_df diff --git a/evadb/functions/extract_columns.py b/evadb/functions/extract_columns.py deleted file mode 100644 index 7fa6904dc4..0000000000 --- a/evadb/functions/extract_columns.py +++ /dev/null @@ -1,166 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2023 EvaDB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from io import BytesIO - -import numpy as np -import pandas as pd -import json -from retry import retry - -from evadb.catalog.catalog_type import NdArrayType -from evadb.functions.abstract.abstract_function import AbstractFunction -from evadb.functions.decorators.decorators import forward -from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe -from evadb.utils.generic_utils import try_to_import_openai -from evadb.utils.logging_manager import logger - - -class ExtractColumnsFunction(AbstractFunction): - @property - def name(self) -> str: - return "EXTRACT_COLUMNS" - - def setup( - self, - model="gpt-3.5-turbo", - temperature: float = 0, - openai_api_key="" - ) -> None: - self.model = model - self.temperature = temperature - self.openai_api_key = openai_api_key - - @forward( - input_signatures=[ - PandasDataframe( - columns=["input_rows"], - column_types=[ - NdArrayType.STR, - ], - column_shapes=[(1,)], - ) - ], - output_signatures=[ - PandasDataframe( - columns=["response"], - column_types=[ - NdArrayType.STR, - ], - column_shapes=[(1,)], - ) - ], - ) - def forward(self, unstructured_df): - """ - NOTE (QUESTION) : Can we structure the inputs and outputs better - The circumvent issues surrounding the input being only one pandas dataframe and output columns being predefined - Will add all column types as a JSON and parse in the forward function - Provide only the file name from which the input will be read - Output in JSON which can be serialized and stored in the results column of the DF - """ - - try_to_import_openai() - import openai - - @retry(tries=6, delay=20) - def completion_with_backoff(**kwargs): - return openai.ChatCompletion.create(**kwargs) - - openai.api_key = self.openai_api_key - # If not found, try OS Environment Variable - if len(openai.api_key) == 0: - openai.api_key = os.environ.get("OPENAI_API_KEY", "") - assert ( - len(openai.api_key) != 0 - ), "Please set your OpenAI API key using SET OPENAI_API_KEY = 'sk-' or environment variable (OPENAI_API_KEY)" - - def generate_structured_data(unstructured_df: PandasDataframe): - results = [] - #column_types = json.loads(unstructured_df[unstructured_df.columns[0]]) - input_rows = unstructured_df[unstructured_df.columns[0]] - - column_types_dict = { - "columns": - [ - { - "name": "Issue Category", - "description": "The category of the issue", - "type": "One of (hardware, software)" - }, - { - "name": "Raw Issue String", - "description": "The raw issue string containing the exact input given by the user", - "type": "string" - }, - { - "name": "Issue Component", - "description": "The component that is causing the issue", - "type": "string" - }, - ] - } - - column_types = json.dumps(column_types_dict) - - base_prompt = """ - You are given a user query. Your task is to extract the following fields from the query and return the result in json format.\n - """ - - # TODO : Check if this is fine or if we need to add column types as string - """ - Not able to add serialized json as input to the column types. Adding a static column types list for now - """ - - for input_row in input_rows: - # TODO : Hardcoding some params for now, will revert later - params = { - "model": self.model, - "temperature": self.temperature, - "messages": [], - } - - def_sys_prompt_message = { - "role": "system", - "content": base_prompt - } - - params["messages"].append(def_sys_prompt_message) - params["messages"].extend( - [ - { - "role": "user", - "content": f"Here are the column types we need the data to be structured in : \n {column_types} \n", - }, - { - "role": "user", - "content": f"Here is the unstructured query which needs to be converted: {input_row}\n", - }, - ], - ) - - logger.info("Params {}".format(params)) - response = completion_with_backoff(**params) - - logger.info("Response {}".format(response)) - answer = response.choices[0].message.content - results.append(answer) - - - return results - - df = pd.DataFrame({"response": generate_structured_data(unstructured_df=unstructured_df)}) - return df diff --git a/tutorials/20-structured-data.ipynb b/tutorials/20-structured-data.ipynb index 8f4745bf2e..7b3d9e89f4 100644 --- a/tutorials/20-structured-data.ipynb +++ b/tutorials/20-structured-data.ipynb @@ -22,6 +22,17 @@ "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/hershdhillon23/projects/evadb/evadb-venv/lib/python3.9/site-packages/urllib3/__init__.py:34: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", + " warnings.warn(\n", + "Downloading: \"http://ml.cs.tsinghua.edu.cn/~chenxi/pytorch-models/mnist-b07bb66b.pth\" to /Users/hershdhillon23/.cache/torch/hub/checkpoints/mnist-b07bb66b.pth\n", + "100%|██████████| 1.03M/1.03M [00:01<00:00, 697kB/s] \n", + "Downloading: \"https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth\" to /Users/hershdhillon23/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth\n" + ] } ], "source": [ @@ -32,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -44,30 +55,30 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 11, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set up the extract columns UDF available at functions/extract_columns.py\n", - "cursor.query(\"\"\"CREATE FUNCTION IF NOT EXISTS ExtractColumns\n", - " IMPL '../evadb/functions/extract_columns.py';\n", + "cursor.query(\"\"\"CREATE FUNCTION IF NOT EXISTS ExtractColumn\n", + " IMPL '../evadb/functions/extract_column.py';\n", " \"\"\").execute()" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 78, "metadata": {}, "outputs": [ { @@ -96,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -112,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 80, "metadata": {}, "outputs": [ { @@ -133,33 +144,39 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{\n", - " \"Issue Category\": \"hardware\",\n", - " \"Raw Issue String\": \"The touch screen on my tablet stopped working for no reason.\",\n", - " \"Issue Component\": \"touch screen\"\n", - "}\n", - "{\n", - " \"Issue Category\": \"hardware\",\n", - " \"Raw Issue String\": \"My headphones won't connect to my phone anymore, even though they used to work just fine.\",\n", - " \"Issue Component\": \"headphones\"\n", - "}\n", - "{\n", - " \"Issue Category\": \"software\",\n", - " \"Raw Issue String\": \"The software update completely messed up my computer. Now nothing works properly.\",\n", - " \"Issue Component\": \"computer\"\n", - "}\n" + "\n", + " You are given a user query. Your task is to extract the following fields from the query and return the result in string format.\n", + " IMPORTANT: RETURN ONLY THE EXTRACTED VALUE (one word or phrase). DO NOT RETURN THE FIELD NAME OR ANY OTHER INFORMATION.\n", + " \n", + "\n", + " Extract the following fields from the unstructured text below:\n", + " \n", + " Format of the field is given in the format\n", + " Field Name: Field Description: Field Type\n", + " Issue Component: The component that is causing the issue: string less than 2 words\n", + "\n", + " The unstructured text is as follows:\n", + " \n", + "The touch screen on my tablet stopped working for no reason.\n", + "My headphones won't connect to my phone anymore, even though they used to work just fine.\n", + "The software update completely messed up my computer. Now nothing works properly.\n", + "touch screen\n", + "headphones, connect, phone, work, fine\n", + "software update\n" ] } ], "source": [ - "table = cursor.query(\"SELECT ExtractColumns(input_rows) FROM InputUnstructured;\").df()\n", + "table = cursor.query(\n", + " \"\"\"SELECT ExtractColumn(\"Issue Component\",\"The component that is causing the issue\", \"string less than 2 words\", input_rows) FROM InputUnstructured;\"\"\"\n", + " ).df()\n", "\n", "for _, row in table.iterrows():\n", " print(row['response'])\n", @@ -190,7 +207,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.6" } }, "nbformat": 4, From 4d318f8da604b91e1ebb789abf3b1c6e37fedbb8 Mon Sep 17 00:00:00 2001 From: Hersh Dhillon Date: Wed, 29 Nov 2023 00:03:49 -0500 Subject: [PATCH 4/8] Linter checks --- evadb/functions/extract_column.py | 35 +++++++++--------------------- tutorials/20-structured-data.ipynb | 7 ------ 2 files changed, 10 insertions(+), 32 deletions(-) diff --git a/evadb/functions/extract_column.py b/evadb/functions/extract_column.py index 6a7c6122e4..8e3de297cf 100644 --- a/evadb/functions/extract_column.py +++ b/evadb/functions/extract_column.py @@ -13,21 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from io import BytesIO - -import numpy as np import pandas as pd -import json -from retry import retry from evadb.catalog.catalog_type import NdArrayType -from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.chatgpt import ChatGPT from evadb.functions.decorators.decorators import forward from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe -from evadb.functions.chatgpt import ChatGPT -from evadb.utils.generic_utils import try_to_import_openai -from evadb.utils.logging_manager import logger class ExtractColumnFunction(ChatGPT): @@ -36,22 +27,14 @@ def name(self) -> str: return "EXTRACT_COLUMN" def setup( - self, - model="gpt-3.5-turbo", - temperature: float = 0, - openai_api_key="" + self, model="gpt-3.5-turbo", temperature: float = 0, openai_api_key="" ) -> None: super(ExtractColumnFunction, self).setup(model, temperature, openai_api_key) @forward( input_signatures=[ PandasDataframe( - columns=[ - "field_name" - "description", - "data_type", - "input_rows" - ], + columns=["field_name" "description", "data_type", "input_rows"], column_types=[ NdArrayType.STR, NdArrayType.STR, @@ -79,7 +62,7 @@ def setup( def forward(self, unstructured_df): """ NOTE (QUESTION) : Can we structure the inputs and outputs better - The circumvent issues surrounding the input being only one pandas dataframe and output columns being predefined + The circumvent issues surrounding the input being only one pandas dataframe and output columns being predefined Will add all column types as a JSON and parse in the forward function Provide only the file name from which the input will be read Output in JSON which can be serialized and stored in the results column of the DF @@ -94,13 +77,13 @@ def forward(self, unstructured_df): """ content = """ Extract the following fields from the unstructured text below: - Format of the field is given in the format Field Name: Field Description: Field Type {}: {}: {} - The unstructured text is as follows: - """.format(field_name, description, data_type) + """.format( + field_name, description, data_type + ) print(prompt) print(content) @@ -109,7 +92,9 @@ def forward(self, unstructured_df): for row in input_rows: query = row - input_df = pd.DataFrame({"query": [query],"content": content, "prompt": prompt}) + input_df = pd.DataFrame( + {"query": [query], "content": content, "prompt": prompt} + ) print(query) df = super(ExtractColumnFunction, self).forward(input_df) output_df = pd.concat([output_df, df], ignore_index=True) diff --git a/tutorials/20-structured-data.ipynb b/tutorials/20-structured-data.ipynb index 7b3d9e89f4..c29557ca4c 100644 --- a/tutorials/20-structured-data.ipynb +++ b/tutorials/20-structured-data.ipynb @@ -182,13 +182,6 @@ " print(row['response'])\n", "#print(table.iloc[1]['response'])" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 4010d490204b0691e7c1ca289d0072a182f77b23 Mon Sep 17 00:00:00 2001 From: Hersh Dhillon Date: Thu, 30 Nov 2023 02:15:45 -0500 Subject: [PATCH 5/8] Adding test for extract_column modified: evadb/functions/extract_column.py new file: test/integration_tests/long/functions/test_extract_column.py modified: tutorials/20-structured-data.ipynb --- evadb/functions/extract_column.py | 6 +- .../long/functions/test_extract_column.py | 57 +++++++++ tutorials/20-structured-data.ipynb | 116 ++++-------------- 3 files changed, 81 insertions(+), 98 deletions(-) create mode 100644 test/integration_tests/long/functions/test_extract_column.py diff --git a/evadb/functions/extract_column.py b/evadb/functions/extract_column.py index 8e3de297cf..81bef1246f 100644 --- a/evadb/functions/extract_column.py +++ b/evadb/functions/extract_column.py @@ -34,7 +34,7 @@ def setup( @forward( input_signatures=[ PandasDataframe( - columns=["field_name" "description", "data_type", "input_rows"], + columns=["field_name", "description", "data_type", "input_rows"], column_types=[ NdArrayType.STR, NdArrayType.STR, @@ -85,9 +85,6 @@ def forward(self, unstructured_df): field_name, description, data_type ) - print(prompt) - print(content) - output_df = pd.DataFrame({"response": []}) for row in input_rows: @@ -95,7 +92,6 @@ def forward(self, unstructured_df): input_df = pd.DataFrame( {"query": [query], "content": content, "prompt": prompt} ) - print(query) df = super(ExtractColumnFunction, self).forward(input_df) output_df = pd.concat([output_df, df], ignore_index=True) return output_df diff --git a/test/integration_tests/long/functions/test_extract_column.py b/test/integration_tests/long/functions/test_extract_column.py new file mode 100644 index 0000000000..74a01198aa --- /dev/null +++ b/test/integration_tests/long/functions/test_extract_column.py @@ -0,0 +1,57 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +from test.markers import chatgpt_skip_marker +from test.util import get_evadb_for_testing + +import pandas as pd + +from evadb.server.command_handler import execute_query_fetch_all + + +class ExtractColumnTest(unittest.TestCase): + def setUp(self) -> None: + self.evadb = get_evadb_for_testing() + self.evadb.catalog().reset() + create_table_query = """CREATE TABLE IF NOT EXISTS InputUnstructured ( + input_rows TEXT) + """ + + execute_query_fetch_all(self.evadb, create_table_query) + + input_row = "My keyboard has stopped working" + + insert_query = f"""INSERT INTO InputUnstructured (input_rows) VALUES ("{input_row}")""" + execute_query_fetch_all(self.evadb, insert_query) + # Add actual API key here + os.environ["OPENAI_API_KEY"] = "sk-..." + + def tearDown(self) -> None: + execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS InputUnstructured;") + + @chatgpt_skip_marker + def test_extract_column_function(self): + function_name = "ExtractColumn" + execute_query_fetch_all(self.evadb, f"DROP FUNCTION IF EXISTS {function_name};") + + create_function_query = f"""CREATE FUNCTION IF NOT EXISTS {function_name} IMPL 'evadb/functions/extract_column.py';""" + + execute_query_fetch_all(self.evadb, create_function_query) + + extract_columns_query = f"SELECT {function_name}('Issue Component','The component that is causing the issue', 'string less than 2 words', input_rows) FROM InputUnstructured;" + output_batch = execute_query_fetch_all(self.evadb, extract_columns_query) + self.assertEqual(output_batch.columns, ["chatgpt.response"]) diff --git a/tutorials/20-structured-data.ipynb b/tutorials/20-structured-data.ipynb index c29557ca4c..5c8accee08 100644 --- a/tutorials/20-structured-data.ipynb +++ b/tutorials/20-structured-data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -13,28 +13,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/hershdhillon23/projects/evadb/evadb-venv/lib/python3.9/site-packages/urllib3/__init__.py:34: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", - " warnings.warn(\n", - "Downloading: \"http://ml.cs.tsinghua.edu.cn/~chenxi/pytorch-models/mnist-b07bb66b.pth\" to /Users/hershdhillon23/.cache/torch/hub/checkpoints/mnist-b07bb66b.pth\n", - "100%|██████████| 1.03M/1.03M [00:01<00:00, 697kB/s] \n", - "Downloading: \"https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth\" to /Users/hershdhillon23/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth\n" - ] - } - ], + "outputs": [], "source": [ "%pip install --quiet \"evadb[document,notebook]\"\n", "import evadb\n", @@ -43,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -55,20 +36,9 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# set up the extract columns UDF available at functions/extract_columns.py\n", "cursor.query(\"\"\"CREATE FUNCTION IF NOT EXISTS ExtractColumn\n", @@ -78,19 +48,9 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []\n" - ] - } - ], + "outputs": [], "source": [ "# # delete the table if it already exists\n", "cursor.query(\"\"\"DROP TABLE IF EXISTS InputUnstructured\n", @@ -107,15 +67,15 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "input_rows_list = [\"The touch screen on my tablet stopped working for no reason.\",\n", - "# \"Why does my computer take so long to start up? It's been like this for weeks.\",\n", - "# \"My phone battery dies too quickly. I just bought it!\",\n", + " \"Why does my computer take so long to start up? It's been like this for weeks.\",\n", + " \"My phone battery dies too quickly. I just bought it!\",\n", " \"My headphones won't connect to my phone anymore, even though they used to work just fine.\",\n", - " \"The software update completely messed up my computer. Now nothing works properly.\"]\n", + " \"The software update completely messed up my computer. Now nothing works properly.\"]\n", "\n", "for input_row in input_rows_list:\n", " cursor.query(f\"\"\"INSERT INTO InputUnstructured (input_rows) VALUES (\"{input_row}\")\"\"\").execute()\n" @@ -123,20 +83,9 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " _row_id input_rows\n", - "0 1 The touch screen on my tablet stopped working ...\n", - "1 2 My headphones won't connect to my phone anymor...\n", - "2 3 The software update completely messed up my co...\n" - ] - } - ], + "outputs": [], "source": [ "table = cursor.query(\"SELECT * FROM InputUnstructured;\").df()\n", "print(table)" @@ -144,35 +93,9 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " You are given a user query. Your task is to extract the following fields from the query and return the result in string format.\n", - " IMPORTANT: RETURN ONLY THE EXTRACTED VALUE (one word or phrase). DO NOT RETURN THE FIELD NAME OR ANY OTHER INFORMATION.\n", - " \n", - "\n", - " Extract the following fields from the unstructured text below:\n", - " \n", - " Format of the field is given in the format\n", - " Field Name: Field Description: Field Type\n", - " Issue Component: The component that is causing the issue: string less than 2 words\n", - "\n", - " The unstructured text is as follows:\n", - " \n", - "The touch screen on my tablet stopped working for no reason.\n", - "My headphones won't connect to my phone anymore, even though they used to work just fine.\n", - "The software update completely messed up my computer. Now nothing works properly.\n", - "touch screen\n", - "headphones, connect, phone, work, fine\n", - "software update\n" - ] - } - ], + "outputs": [], "source": [ "table = cursor.query(\n", " \"\"\"SELECT ExtractColumn(\"Issue Component\",\"The component that is causing the issue\", \"string less than 2 words\", input_rows) FROM InputUnstructured;\"\"\"\n", @@ -182,6 +105,13 @@ " print(row['response'])\n", "#print(table.iloc[1]['response'])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 6f9a1c22942387c05268f4dd1c789d5988108733 Mon Sep 17 00:00:00 2001 From: Hersh Dhillon Date: Thu, 30 Nov 2023 02:25:13 -0500 Subject: [PATCH 6/8] Resolved review comments --- evadb/functions/extract_column.py | 10 ++++++---- tutorials/20-structured-data.ipynb | 11 +++-------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/evadb/functions/extract_column.py b/evadb/functions/extract_column.py index 81bef1246f..db1f5d7817 100644 --- a/evadb/functions/extract_column.py +++ b/evadb/functions/extract_column.py @@ -67,10 +67,12 @@ def forward(self, unstructured_df): Provide only the file name from which the input will be read Output in JSON which can be serialized and stored in the results column of the DF """ - field_name = unstructured_df.iloc[0, 0] - description = unstructured_df.iloc[0, 1] - data_type = unstructured_df.iloc[0, 2] - input_rows = unstructured_df[unstructured_df.columns[3]] + for row in unstructured_df.itertuples(): + field_name = row[0] + description = row[1] + data_type = row[2] + input_rows = row[3] + prompt = """ You are given a user query. Your task is to extract the following fields from the query and return the result in string format. IMPORTANT: RETURN ONLY THE EXTRACTED VALUE (one word or phrase). DO NOT RETURN THE FIELD NAME OR ANY OTHER INFORMATION. diff --git a/tutorials/20-structured-data.ipynb b/tutorials/20-structured-data.ipynb index 5c8accee08..c33747d518 100644 --- a/tutorials/20-structured-data.ipynb +++ b/tutorials/20-structured-data.ipynb @@ -59,10 +59,7 @@ "# create the table specifying the type of the prompt column\n", "cursor.query(\"\"\"CREATE TABLE IF NOT EXISTS InputUnstructured (\n", " input_rows TEXT)\n", - " \"\"\").execute()\n", - "\n", - "table = cursor.query(\"SELECT * FROM InputUnstructured;\").df()\n", - "print(table)" + " \"\"\").execute()" ] }, { @@ -87,8 +84,7 @@ "metadata": {}, "outputs": [], "source": [ - "table = cursor.query(\"SELECT * FROM InputUnstructured;\").df()\n", - "print(table)" + "table = cursor.query(\"SELECT * FROM InputUnstructured;\").df()" ] }, { @@ -102,8 +98,7 @@ " ).df()\n", "\n", "for _, row in table.iterrows():\n", - " print(row['response'])\n", - "#print(table.iloc[1]['response'])" + " print(row['response'])" ] }, { From 0fc67717c8d652fbe26880f1ec195f4c574a2bab Mon Sep 17 00:00:00 2001 From: Hersh Dhillon Date: Fri, 1 Dec 2023 17:08:02 -0500 Subject: [PATCH 7/8] Solving for linter changes due to the notebook test --- script/test/test.sh | 2 +- tutorials/20-structured-data.ipynb | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/script/test/test.sh b/script/test/test.sh index ae67efb62c..47f54bbeaf 100644 --- a/script/test/test.sh +++ b/script/test/test.sh @@ -94,7 +94,7 @@ long_integration_test() { } notebook_test() { - PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" --ignore="tutorials/14-food-review-tone-analysis-and-response.ipynb" --ignore="tutorials/15-AI-powered-join.ipynb" --ignore="tutorials/16-homesale-forecasting.ipynb" --ignore="tutorials/17-home-rental-prediction.ipynb" --ignore="tutorials/18-stable-diffusion.ipynb" --ignore="tutorials/19-employee-classification-prediction.ipynb" + PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" --ignore="tutorials/14-food-review-tone-analysis-and-response.ipynb" --ignore="tutorials/15-AI-powered-join.ipynb" --ignore="tutorials/16-homesale-forecasting.ipynb" --ignore="tutorials/17-home-rental-prediction.ipynb" --ignore="tutorials/18-stable-diffusion.ipynb" --ignore="tutorials/19-employee-classification-prediction.ipynb" --ignore="tutorials/20-structured-data.ipynb" code=$? print_error_code $code "NOTEBOOK TEST" } diff --git a/tutorials/20-structured-data.ipynb b/tutorials/20-structured-data.ipynb index c33747d518..ad0668b90e 100644 --- a/tutorials/20-structured-data.ipynb +++ b/tutorials/20-structured-data.ipynb @@ -100,13 +100,6 @@ "for _, row in table.iterrows():\n", " print(row['response'])" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 7b1917443611fea085116fc0039b34c150eaea28 Mon Sep 17 00:00:00 2001 From: Hersh Dhillon Date: Fri, 1 Dec 2023 17:17:23 -0500 Subject: [PATCH 8/8] Solving for linter changes --- evadb/functions/extract_column.py | 2 +- .../integration_tests/long/functions/test_extract_column.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/evadb/functions/extract_column.py b/evadb/functions/extract_column.py index db1f5d7817..6897944b3e 100644 --- a/evadb/functions/extract_column.py +++ b/evadb/functions/extract_column.py @@ -72,7 +72,7 @@ def forward(self, unstructured_df): description = row[1] data_type = row[2] input_rows = row[3] - + prompt = """ You are given a user query. Your task is to extract the following fields from the query and return the result in string format. IMPORTANT: RETURN ONLY THE EXTRACTED VALUE (one word or phrase). DO NOT RETURN THE FIELD NAME OR ANY OTHER INFORMATION. diff --git a/test/integration_tests/long/functions/test_extract_column.py b/test/integration_tests/long/functions/test_extract_column.py index 74a01198aa..d1cd0407e2 100644 --- a/test/integration_tests/long/functions/test_extract_column.py +++ b/test/integration_tests/long/functions/test_extract_column.py @@ -18,8 +18,6 @@ from test.markers import chatgpt_skip_marker from test.util import get_evadb_for_testing -import pandas as pd - from evadb.server.command_handler import execute_query_fetch_all @@ -35,7 +33,9 @@ def setUp(self) -> None: input_row = "My keyboard has stopped working" - insert_query = f"""INSERT INTO InputUnstructured (input_rows) VALUES ("{input_row}")""" + insert_query = ( + f"""INSERT INTO InputUnstructured (input_rows) VALUES ("{input_row}")""" + ) execute_query_fetch_all(self.evadb, insert_query) # Add actual API key here os.environ["OPENAI_API_KEY"] = "sk-..."