georgia-tech-db · hershd23 · Nov 3, 2023 · Nov 3, 2023 · Nov 28, 2023 · Nov 28, 2023
diff --git a/.gitignore b/.gitignore
@@ -101,6 +101,7 @@ env.bak/
 venv.bak/
 env38/
 env_eva/
+evadb-venv/
 test_eva_db/
 
 # Spyder project settings

diff --git a/evadb/functions/extract_column.py b/evadb/functions/extract_column.py
@@ -0,0 +1,101 @@
+# coding=utf-8
+# Copyright 2018-2023 EvaDB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+
+from evadb.catalog.catalog_type import NdArrayType
+from evadb.functions.chatgpt import ChatGPT
+from evadb.functions.decorators.decorators import forward
+from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe
+
+
+class ExtractColumnFunction(ChatGPT):
+    @property
+    def name(self) -> str:
+        return "EXTRACT_COLUMN"
+
+    def setup(
+        self, model="gpt-3.5-turbo", temperature: float = 0, openai_api_key=""
+    ) -> None:
+        super(ExtractColumnFunction, self).setup(model, temperature, openai_api_key)
+
+    @forward(
+        input_signatures=[
+            PandasDataframe(
+                columns=["field_name" "description", "data_type", "input_rows"],
+                column_types=[
+                    NdArrayType.STR,
+                    NdArrayType.STR,
+                    NdArrayType.STR,
+                    NdArrayType.STR,
+                ],
+                column_shapes=[
+                    (1,),
+                    (1,),
+                    (1,),
+                    (1,),
+                ],
+            )
+        ],
+        output_signatures=[
+            PandasDataframe(
+                columns=["response"],
+                column_types=[
+                    NdArrayType.STR,
+                ],
+                column_shapes=[(1,)],
+            )
+        ],
+    )
+    def forward(self, unstructured_df):
+        """
+        NOTE (QUESTION) : Can we structure the inputs and outputs better
+        The circumvent issues surrounding the input being only one pandas dataframe and output columns being predefined
+        Will add all column types as a JSON and parse in the forward function
+        Provide only the file name from which the input will be read
+        Output in JSON which can be serialized and stored in the results column of the DF
+        """
+        field_name = unstructured_df.iloc[0, 0]
+        description = unstructured_df.iloc[0, 1]
+        data_type = unstructured_df.iloc[0, 2]
+        input_rows = unstructured_df[unstructured_df.columns[3]]
+        prompt = """
+            You are given a user query. Your task is to extract the following fields from the query and return the result in string format.
+            IMPORTANT: RETURN ONLY THE EXTRACTED VALUE (one word or phrase). DO NOT RETURN THE FIELD NAME OR ANY OTHER INFORMATION.
+        """
+        content = """
+            Extract the following fields from the unstructured text below:
+            Format of the field is given in the format
+            Field Name: Field Description: Field Type
+            {}: {}: {}
+            The unstructured text is as follows:
+        """.format(
+            field_name, description, data_type
+        )
+
+        print(prompt)
+        print(content)
+
+        output_df = pd.DataFrame({"response": []})
+
+        for row in input_rows:
+            query = row
+            input_df = pd.DataFrame(
+                {"query": [query], "content": content, "prompt": prompt}
+            )
+            print(query)
+            df = super(ExtractColumnFunction, self).forward(input_df)
+            output_df = pd.concat([output_df, df], ignore_index=True)
+        return output_df
diff --git a/tutorials/20-structured-data.ipynb b/tutorials/20-structured-data.ipynb
 PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" --ignore="tutorials/14-food-review-tone-analysis-and-response.ipynb" --ignore="tutorials/15-AI-powered-join.ipynb" --ignore="tutorials/16-homesale-forecasting.ipynb" --ignore="tutorials/17-home-rental-prediction.ipynb" --ignore="tutorials/18-stable-diffusion.ipynb" --ignore="tutorials/19-employee-classification-prediction.ipynb" 
 PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" --ignore="tutorials/14-food-review-tone-analysis-and-response.ipynb" --ignore="tutorials/15-AI-powered-join.ipynb" --ignore="tutorials/16-homesale-forecasting.ipynb" --ignore="tutorials/17-home-rental-prediction.ipynb" --ignore="tutorials/18-stable-diffusion.ipynb" --ignore="tutorials/19-employee-classification-prediction.ipynb" 
@@ -0,0 +1,208 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import dependencies\n",
+    "import os\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/hershdhillon23/projects/evadb/evadb-venv/lib/python3.9/site-packages/urllib3/__init__.py:34: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n",
+      "  warnings.warn(\n",
+      "Downloading: \"http://ml.cs.tsinghua.edu.cn/~chenxi/pytorch-models/mnist-b07bb66b.pth\" to /Users/hershdhillon23/.cache/torch/hub/checkpoints/mnist-b07bb66b.pth\n",
+      "100%|██████████| 1.03M/1.03M [00:01<00:00, 697kB/s] \n",
+      "Downloading: \"https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth\" to /Users/hershdhillon23/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install --quiet \"evadb[document,notebook]\"\n",
+    "import evadb\n",
+    "cursor = evadb.connect().cursor()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set your OpenAI key as an environment variable\n",
+    "import os\n",
+    "#os.environ['OPENAI_API_KEY'] = ''\n",
+    "open_ai_key = os.environ.get(\"OPENAI_API_KEY\", \"\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<evadb.models.storage.batch.Batch at 0x176ef33d0>"
+      ]
+     },
+     "execution_count": 77,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# set up the ExtractColumn function defined in evadb/functions/extract_column.py\n",
+    "cursor.query(\"\"\"CREATE FUNCTION IF NOT EXISTS ExtractColumn\n",
+    "            IMPL  '../evadb/functions/extract_column.py';\n",
+    "                \"\"\").execute()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Empty DataFrame\n",
+      "Columns: []\n",
+      "Index: []\n"
+     ]
+    }
+   ],
+   "source": [
+    "# delete the table if it already exists\n",
+    "cursor.query(\"\"\"DROP TABLE IF EXISTS InputUnstructured\n",
+    "                \"\"\").execute()\n",
+    "\n",
+    "# create the table with a single TEXT column holding the unstructured input rows\n",
+    "cursor.query(\"\"\"CREATE TABLE IF NOT EXISTS InputUnstructured (\n",
+    "             input_rows TEXT)\n",
+    "                \"\"\").execute()\n",
+    "\n",
+    "table = cursor.query(\"SELECT * FROM InputUnstructured;\").df()\n",
+    "print(table)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_rows_list = [\"The touch screen on my tablet stopped working for no reason.\",\n",
+    "#                   \"Why does my computer take so long to start up? It's been like this for weeks.\",\n",
+    "#                   \"My phone battery dies too quickly. I just bought it!\",\n",
+    "                    \"My headphones won't connect to my phone anymore, even though they used to work just fine.\",\n",
+    "                   \"The software update completely messed up my computer. Now nothing works properly.\"]\n",
+    "\n",
+    "for input_row in input_rows_list:\n",
+    "    cursor.query(f\"\"\"INSERT INTO InputUnstructured (input_rows) VALUES (\"{input_row}\")\"\"\").execute()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   _row_id                                         input_rows\n",
+      "0        1  The touch screen on my tablet stopped working ...\n",
+      "1        2  My headphones won't connect to my phone anymor...\n",
+      "2        3  The software update completely messed up my co...\n"
+     ]
+    }
+   ],
+   "source": [
+    "table = cursor.query(\"SELECT * FROM InputUnstructured;\").df()\n",
+    "print(table)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "            You are given a user query. Your task is to extract the following fields from the query and return the result in string format.\n",
+      "            IMPORTANT: RETURN ONLY THE EXTRACTED VALUE (one word or phrase). DO NOT RETURN THE FIELD NAME OR ANY OTHER INFORMATION.\n",
+      "        \n",
+      "\n",
+      "            Extract the following fields from the unstructured text below:\n",
+      "            \n",
+      "            Format of the field is given in the format\n",
+      "            Field Name: Field Description: Field Type\n",
+      "            Issue Component: The component that is causing the issue: string less than 2 words\n",
+      "\n",
+      "            The unstructured text is as follows:\n",
+      "        \n",
+      "The touch screen on my tablet stopped working for no reason.\n",
+      "My headphones won't connect to my phone anymore, even though they used to work just fine.\n",
+      "The software update completely messed up my computer. Now nothing works properly.\n",
+      "touch screen\n",
+      "headphones, connect, phone, work, fine\n",
+      "software update\n"
+     ]
+    }
+   ],
+   "source": [
+    "table = cursor.query(\n",
+    "        \"\"\"SELECT ExtractColumn(\"Issue Component\",\"The component that is causing the issue\", \"string less than 2 words\", input_rows) FROM InputUnstructured;\"\"\"\n",
+    "    ).df()\n",
+    "\n",
+    "for _, row in table.iterrows():\n",
+    "    print(row['response'])\n",
+    "#print(table.iloc[1]['response'])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}