georgia-tech-db · hershd23 · Nov 3, 2023 · Nov 3, 2023 · Nov 28, 2023 · Nov 28, 2023
diff --git a/.gitignore b/.gitignore
@@ -101,6 +101,7 @@ env.bak/
 venv.bak/
 env38/
 env_eva/
+evadb-venv/
 test_eva_db/
 
 # Spyder project settings

diff --git a/evadb/functions/extract_column.py b/evadb/functions/extract_column.py
@@ -0,0 +1,99 @@
+# coding=utf-8
+# Copyright 2018-2023 EvaDB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+
+from evadb.catalog.catalog_type import NdArrayType
+from evadb.functions.chatgpt import ChatGPT
+from evadb.functions.decorators.decorators import forward
+from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe
+
+
+class ExtractColumnFunction(ChatGPT):
+    @property
+    def name(self) -> str:
+        return "EXTRACT_COLUMN"
+
+    def setup(
+        self, model="gpt-3.5-turbo", temperature: float = 0, openai_api_key=""
+    ) -> None:
+        super(ExtractColumnFunction, self).setup(model, temperature, openai_api_key)
+
+    @forward(
+        input_signatures=[
+            PandasDataframe(
+                columns=["field_name", "description", "data_type", "input_rows"],
+                column_types=[
+                    NdArrayType.STR,
+                    NdArrayType.STR,
+                    NdArrayType.STR,
+                    NdArrayType.STR,
+                ],
+                column_shapes=[
+                    (1,),
+                    (1,),
+                    (1,),
+                    (1,),
+                ],
+            )
+        ],
+        output_signatures=[
+            PandasDataframe(
+                columns=["response"],
+                column_types=[
+                    NdArrayType.STR,
+                ],
+                column_shapes=[(1,)],
+            )
+        ],
+    )
+    def forward(self, unstructured_df):
+        """
+        NOTE (QUESTION) : Can we structure the inputs and outputs better
+        The circumvent issues surrounding the input being only one pandas dataframe and output columns being predefined
+        Will add all column types as a JSON and parse in the forward function
+        Provide only the file name from which the input will be read
+        Output in JSON which can be serialized and stored in the results column of the DF
+        """
+        for row in unstructured_df.itertuples():
+            field_name = row[0]
+            description = row[1]
+            data_type = row[2]
+            input_rows = row[3]
+
+        prompt = """
+            You are given a user query. Your task is to extract the following fields from the query and return the result in string format.
+            IMPORTANT: RETURN ONLY THE EXTRACTED VALUE (one word or phrase). DO NOT RETURN THE FIELD NAME OR ANY OTHER INFORMATION.
+        """
+        content = """
+            Extract the following fields from the unstructured text below:
+            Format of the field is given in the format
+            Field Name: Field Description: Field Type
+            {}: {}: {}
+            The unstructured text is as follows:
+        """.format(
+            field_name, description, data_type
+        )
+
+        output_df = pd.DataFrame({"response": []})
+
+        for row in input_rows:
+            query = row
+            input_df = pd.DataFrame(
+                {"query": [query], "content": content, "prompt": prompt}
+            )
+            df = super(ExtractColumnFunction, self).forward(input_df)
+            output_df = pd.concat([output_df, df], ignore_index=True)
+        return output_df
diff --git a/script/test/test.sh b/script/test/test.sh
@@ -94,7 +94,7 @@ long_integration_test() {
 }
 
 notebook_test() {
-  PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" --ignore="tutorials/14-food-review-tone-analysis-and-response.ipynb" --ignore="tutorials/15-AI-powered-join.ipynb" --ignore="tutorials/16-homesale-forecasting.ipynb" --ignore="tutorials/17-home-rental-prediction.ipynb" --ignore="tutorials/18-stable-diffusion.ipynb" --ignore="tutorials/19-employee-classification-prediction.ipynb"
+  PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" --ignore="tutorials/14-food-review-tone-analysis-and-response.ipynb" --ignore="tutorials/15-AI-powered-join.ipynb" --ignore="tutorials/16-homesale-forecasting.ipynb" --ignore="tutorials/17-home-rental-prediction.ipynb" --ignore="tutorials/18-stable-diffusion.ipynb" --ignore="tutorials/19-employee-classification-prediction.ipynb" --ignore="tutorials/20-structured-data.ipynb"
   code=$?
   print_error_code $code "NOTEBOOK TEST"
 }

diff --git a/test/integration_tests/long/functions/test_extract_column.py b/test/integration_tests/long/functions/test_extract_column.py
@@ -0,0 +1,57 @@
+# coding=utf-8
+# Copyright 2018-2023 EvaDB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+from test.markers import chatgpt_skip_marker
+from test.util import get_evadb_for_testing
+
+from evadb.server.command_handler import execute_query_fetch_all
+
+
+class ExtractColumnTest(unittest.TestCase):
+    def setUp(self) -> None:
+        self.evadb = get_evadb_for_testing()
+        self.evadb.catalog().reset()
+        create_table_query = """CREATE TABLE IF NOT EXISTS InputUnstructured (
+                input_rows TEXT)
+            """
+
+        execute_query_fetch_all(self.evadb, create_table_query)
+
+        input_row = "My keyboard has stopped working"
+
+        insert_query = (
+            f"""INSERT INTO InputUnstructured (input_rows) VALUES ("{input_row}")"""
+        )
+        execute_query_fetch_all(self.evadb, insert_query)
+        # Add actual API key here
+        os.environ["OPENAI_API_KEY"] = "sk-..."
+
+    def tearDown(self) -> None:
+        execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS InputUnstructured;")
+
+    @chatgpt_skip_marker
+    def test_extract_column_function(self):
+        function_name = "ExtractColumn"
+        execute_query_fetch_all(self.evadb, f"DROP FUNCTION IF EXISTS {function_name};")
+
+        create_function_query = f"""CREATE FUNCTION IF NOT EXISTS {function_name} IMPL 'evadb/functions/extract_column.py';"""
+
+        execute_query_fetch_all(self.evadb, create_function_query)
+
+        extract_columns_query = f"SELECT {function_name}('Issue Component','The component that is causing the issue', 'string less than 2 words', input_rows) FROM InputUnstructured;"
+        output_batch = execute_query_fetch_all(self.evadb, extract_columns_query)
+        self.assertEqual(output_batch.columns, ["chatgpt.response"])
diff --git a/tutorials/20-structured-data.ipynb b/tutorials/20-structured-data.ipynb
 PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" --ignore="tutorials/14-food-review-tone-analysis-and-response.ipynb" --ignore="tutorials/15-AI-powered-join.ipynb" --ignore="tutorials/16-homesale-forecasting.ipynb" --ignore="tutorials/17-home-rental-prediction.ipynb" --ignore="tutorials/18-stable-diffusion.ipynb" --ignore="tutorials/19-employee-classification-prediction.ipynb" 
 PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" --ignore="tutorials/14-food-review-tone-analysis-and-response.ipynb" --ignore="tutorials/15-AI-powered-join.ipynb" --ignore="tutorials/16-homesale-forecasting.ipynb" --ignore="tutorials/17-home-rental-prediction.ipynb" --ignore="tutorials/18-stable-diffusion.ipynb" --ignore="tutorials/19-employee-classification-prediction.ipynb" 
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import dependencies\n",
+    "import os\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install --quiet \"evadb[document,notebook]\"\n",
+    "import evadb\n",
+    "cursor = evadb.connect().cursor()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set your OpenAI key as an environment variable\n",
+    "import os\n",
+    "#os.environ['OPENAI_API_KEY'] = ''\n",
+    "open_ai_key = os.environ.get(\"OPENAI_API_KEY\", \"\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set up the ExtractColumn function implemented in evadb/functions/extract_column.py\n",
+    "cursor.query(\"\"\"CREATE FUNCTION IF NOT EXISTS ExtractColumn\n",
+    "            IMPL  '../evadb/functions/extract_column.py';\n",
+    "                \"\"\").execute()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Delete the table if it already exists\n",
+    "cursor.query(\"\"\"DROP TABLE IF EXISTS InputUnstructured\n",
+    "                \"\"\").execute()\n",
+    "\n",
+    "# Create the table with a single TEXT column that holds the unstructured input\n",
+    "cursor.query(\"\"\"CREATE TABLE IF NOT EXISTS InputUnstructured (\n",
+    "             input_rows TEXT)\n",
+    "                \"\"\").execute()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_rows_list = [\"The touch screen on my tablet stopped working for no reason.\",\n",
+    "                    \"Why does my computer take so long to start up? It's been like this for weeks.\",\n",
+    "                    \"My phone battery dies too quickly. I just bought it!\",\n",
+    "                    \"My headphones won't connect to my phone anymore, even though they used to work just fine.\",\n",
+    "                    \"The software update completely messed up my computer. Now nothing works properly.\"]\n",
+    "\n",
+    "for input_row in input_rows_list:\n",
+    "    cursor.query(f\"\"\"INSERT INTO InputUnstructured (input_rows) VALUES (\"{input_row}\")\"\"\").execute()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "table = cursor.query(\"SELECT * FROM InputUnstructured;\").df()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "table = cursor.query(\n",
+    "        \"\"\"SELECT ExtractColumn(\"Issue Component\",\"The component that is causing the issue\", \"string less than 2 words\", input_rows) FROM InputUnstructured;\"\"\"\n",
+    "    ).df()\n",
+    "\n",
+    "for _, row in table.iterrows():\n",
+    "    print(row['response'])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}