Merge branch 'staging' into extract_columns

georgia-tech-db · Nov 28, 2023 · cadedd1 · cadedd1
2 parents 8cab604 + 334c8b1
commit cadedd1
Show file tree

Hide file tree

Showing 85 changed files with 2,993 additions and 202 deletions.
diff --git a/.gitignore b/.gitignore
@@ -102,6 +102,7 @@ venv.bak/
 env38/
 env_eva/
 evadb-venv/
+test_eva_db/
 
 # Spyder project settings
 .spyderproject
@@ -226,3 +227,5 @@ eva_db/*
 eva/*
 
 blog.md
+
+tests/integration_tests/short/*.db
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### [Deprecated]
 ### [Removed]
 
+##  [0.3.9] - 2023-11-19
+
+* PR #1367: v0.3.9 - new release 
+* PR #1368: Migrate ChatGPT function to openai v1.0 
+* PR #1361: Adding changes for Flaml Sklearn integration 
+* PR #1258: Add feedback for forecasting 
+* PR #1364: Fix python3.8 failing testcases due to type hint 
+* PR #1308: Job scheduler implementation 
+* PR #1352: Make Logic Operators Case Insensitive 
+* PR #1342: Add train scores for ludwig in the create function handler. 
+* PR #1067: Verified that issue #1067 is resolved and added documentation for load pdf functionality. 
+* PR #1334: logging an error message for invalid files while loading 
+* PR #1330: [WIP] Improving error handling messages for Custom Functions 
+* PR #1033: feat: third party app support in EVADB 
+* PR #1339: Support semicolon and escaped strings in lark 
+* PR #1337: CREATE INDEX IF NOT EXISTS is broken. 
+* PR #1333: Fix #1333 dependency and CMD in DockerFile 
+* PR #1336: Add test_eva_db to gitignore 
+* PR #1327: Add the validation score and training time for create_function in XGBoost 
+* PR #1322: String Helper functions in EvaDB 
+* PR #1321: REST API Documentation 
+* PR #1323: Added docs for SET and SHOW CONFIG query 
+* PR #1316: Adding colab notebook for XGBoost Regression and Classification 
+* PR #1269: Add Milvus integration for vector create and search 
+* PR #1319: refactor: code refactor 
+* PR #1298: Convert nested join in Vector Queries to Pandas Merge.  
+* PR #1305: Starting the changes for XGBoost classification integration. 
+* PR #1289: SnowFlake Integration for EvaDB 
+* PR #1307: Fix: minor typo 
+* PR #1272: Configuration Manager Redesign 
+* PR #1283: Fix current issues with forecasting 
+* PR #1304: Bump Version to v0.3.9+dev 
+* PR #1303: v0.3.8 - new release 
+
 ##  [0.3.8] - 2023-10-18
 
 * PR #1303: v0.3.8 - new release 

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -15,6 +15,11 @@ RUN apt-get update && \
     apt-get autoclean -y && \
     rm -rf /var/cache/apt/* /var/lib/apt/lists/*
 
+# Install gcc and python3.9-dev
+RUN apt-get update && apt-get install -y \
+    gcc \
+    python3.9-dev
+
 # Install pip
 RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
     python3.9 get-pip.py && \
@@ -32,5 +37,5 @@ RUN python3.9 -m pip install evadb
 # Expose the default port EvaDB runs on 
 EXPOSE 8803
 
-# Start EvaDB
-CMD ["eva_server"]
+# Start EvaDB server
+CMD ["evadb_server"]
diff --git a/docs/_toc.yml b/docs/_toc.yml
@@ -45,6 +45,7 @@ parts:
           - file: source/reference/evaql/load_csv
           - file: source/reference/evaql/load_image
           - file: source/reference/evaql/load_video
+          - file: source/reference/evaql/load_pdf
           - file: source/reference/evaql/select
           - file: source/reference/evaql/explain
           - file: source/reference/evaql/show_functions
@@ -78,6 +79,7 @@ parts:
           - file: source/reference/databases/clickhouse
           - file: source/reference/databases/github
           - file: source/reference/databases/snowflake
+          - file: source/reference/databases/hackernews
 
       - file: source/reference/vector_databases/index
         title: Vector Databases
@@ -88,6 +90,7 @@ parts:
           - file: source/reference/vector_databases/pgvector
           - file: source/reference/vector_databases/pinecone
           - file: source/reference/vector_databases/milvus
+          - file: source/reference/vector_databases/weaviate
 
       - file: source/reference/ai/index
         title: AI Engines

diff --git a/docs/source/overview/concepts.rst b/docs/source/overview/concepts.rst
@@ -92,6 +92,6 @@ After registering ``MnistImageClassifier`` function, you can call the function i
 AI-Centric Query Optimization
 -----------------------------
 
-EvaDB optimizes the AI queries to save money spent on running models and reduce query execution time. It contains a novel `Cascades-style query optimizer <https://www.cse.iitb.ac.in/infolab/Data/Courses/CS632/Papers/Cascades-graefe.pdf>`__  tailored for AI queries.
+EvaDB optimizes the AI queries to save money spent on running models and reduce query execution time. It contains a novel `Cascades-style query optimizer <https://faculty.cc.gatech.edu/~jarulraj/courses/8803-s21/slides/22-cascades.pdf>`__  tailored for AI queries.
 
-Query optimization has powered SQL database systems for several decades. It is the bridge that connects the declarative query language to efficient query execution on hardware. EvaDB accelerates AI queries using a collection of optimizations detailed in the :ref:`optimizations<optimizations>` page.
+Query optimization has powered SQL database systems for several decades. It is the bridge that connects the declarative query language to efficient query execution on hardware. EvaDB accelerates AI queries using a collection of optimizations detailed in the :ref:`optimizations<optimizations>` page.
diff --git a/docs/source/reference/ai/model-forecasting.rst b/docs/source/reference/ai/model-forecasting.rst
@@ -3,6 +3,7 @@
 Time Series Forecasting
 ========================
 
+A Time Series is a series of data points recorded at different time intervals. Time series forecasting involves estimating future values of a time series by analyzing historical data.
 You can train a forecasting model easily in EvaDB.
 
 .. note::
@@ -28,15 +29,14 @@ Next, we create a function of `TYPE Forecasting`. We must enter the column name
    CREATE FUNCTION IF NOT EXISTS Forecast FROM
    (SELECT y FROM AirData)
    TYPE Forecasting
+   HORIZON 12
    PREDICT 'y';
 
 This trains a forecasting model whose forecast horizon is fixed by the `HORIZON` clause (here, `12` steps into the future). The trained model can then be called without passing the horizon.
 
 .. code-block:: sql
 
-   SELECT Forecast(12);
-
-Here, the horizon is `12`, which represents the forecast 12 steps into the future.
+   SELECT Forecast();
 
 
 Forecast Parameters
@@ -58,13 +58,25 @@ EvaDB's default forecast framework is `statsforecast <https://nixtla.github.io/s
    * - LIBRARY (str, default: 'statsforecast')
      - We can select one of `statsforecast` (default) or `neuralforecast`. `statsforecast` provides access to statistical forecasting methods, while `neuralforecast` gives access to deep-learning based forecasting methods.
    * - MODEL (str, default: 'ARIMA')
-     - If LIBRARY is `statsforecast`, we can select one of ARIMA, ting, ETS, Theta. The default is ARIMA. Check `Automatic Forecasting <https://nixtla.github.io/statsforecast/src/core/models_intro.html#automatic-forecasting>`_ to learn details about these models. If LIBRARY is `neuralforecast`, we can select one of NHITS or NBEATS. The default is NBEATS. Check `NBEATS docs <https://nixtla.github.io/neuralforecast/models.nbeats.html>`_ for details.
+     - If LIBRARY is `statsforecast`, we can select one of ARIMA, CES, ETS, Theta. The default is ARIMA. Check `Automatic Forecasting <https://nixtla.mintlify.app/statsforecast/index.html#automatic-forecasting>`_ to learn details about these models. If LIBRARY is `neuralforecast`, we can select one of NHITS or NBEATS. The default is NBEATS. Check `NBEATS docs <https://nixtla.github.io/neuralforecast/models.nbeats.html>`_ for details.
    * - AUTO (str, default: 'T')
      - If set to 'T', it enables automatic hyperparameter optimization. Must be set to 'T' for `statsforecast` library. One may set this parameter to `false` if LIBRARY is `neuralforecast` for faster (but less reliable) results.
-   * - Frequency (str, default: 'auto')
+   * - CONF (int, default: 90)
+     - Sets the confidence interval in percentage for the forecast. Must be a number between 0 and 100. The lower and upper bounds of the confidence interval are returned in two separate columns, named as the PREDICT column with `-lo` and `-hi` suffixes.
+   * - FREQUENCY (str, default: 'auto')
      - A string indicating the frequency of the data. The commonly used ones are D, W, M, Y, which respectively represent day-, week-, month- and year-end frequency. The default value is M. Check `pandas available frequencies <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_ for all available frequencies. If it is not provided, the frequency is attempted to be determined automatically.
+   * - METRICS (str, default: 'True')
+     - Compute NRMSE by performing cross-validation. It is `False` by default if `LIBRARY` is `neuralforecast` as it can take an extremely long time. The metrics are logged locally.
+
+.. note::
+
+    If columns other than the ones required as mentioned above are passed while creating the function, they will be treated as exogenous variables if LIBRARY is `neuralforecast`. Otherwise, they would be ignored.
+
+
+.. note::
+
+   The `Forecasting` function also logs suggestions. Logged information, such as metrics and suggestions, is sent to STDOUT by default. If you do not wish to print it, pass `FALSE` as an optional argument when calling the function, e.g., `SELECT Forecast(FALSE);`
 
-Note: If columns other than the ones required as mentioned above are passed while creating the function, they will be treated as exogenous variables if LIBRARY is `neuralforecast`. Otherwise, they would be ignored.
 
 Below is an example query specifying the above parameters:
 
@@ -90,4 +102,4 @@ Below is an example query with `neuralforecast` with `trend` column as exogenous
     PREDICT 'y'
     LIBRARY 'neuralforecast'
     AUTO 'f'
-    FREQUENCY 'M';
+    FREQUENCY 'M';
diff --git a/docs/source/reference/databases/github.rst b/docs/source/reference/databases/github.rst
@@ -19,7 +19,7 @@ Required:
 
 Optional:
 
-* ``github_token`` is not required for public repositories. However, the rate limit is lower without a valid github_token. Check `Rate limits page <https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#rate-limits>`_ to learn more about how to check your rate limit status. Check `Managing your personal access tokens page <https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens>`_ to learn how to create personal access tokens.
+* ``github_token`` is not required for public repositories. However, the rate limit is lower without a valid github_token. Check the `Rate limits page <https://docs.github.com/en/rest/overview/rate-limits-for-the-rest-api?apiVersion=2022-11-28>`_ to learn more about how to check your rate limit status. Check `Managing your personal access tokens page <https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens>`_ to learn how to create personal access tokens.
 
 Create Connection
 -----------------

diff --git a/docs/source/reference/databases/hackernews.rst b/docs/source/reference/databases/hackernews.rst
@@ -0,0 +1,44 @@
+Hackernews
+==========
+
+The connection to Hackernews is based on the `Algolia Hackernews <https://hn.algolia.com/api>`_ API.
+
+Dependency
+----------
+
+* requests
+
+
+Parameters
+----------
+
+Required:
+
+* ``query`` is the search query for getting the results.
+
+Optional:
+
+* ``tags`` is the tag used for filtering the query results. Check `available tags <https://hn.algolia.com/api>`_ to see a list of available filter tags.
+
+Create Connection
+-----------------
+
+.. code-block:: text
+
+   CREATE DATABASE hackernews_data WITH ENGINE = 'hackernews', PARAMETERS = {
+        "query": "EVADB",
+        "tags": "story"
+   };
+
+Supported Tables
+----------------
+
+* ``search_results``: Lists the search query results. Check `table_column_info.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/hackernews/table_column_info.py>`_ for all the available columns in the table.
+
+.. code-block:: sql
+
+   SELECT * FROM hackernews_data.search_results LIMIT 3;
+
+.. note::
+
+   Looking for another table from Hackernews? Please raise a `Feature Request <https://github.com/georgia-tech-db/evadb/issues/new/choose>`_.
diff --git a/docs/source/reference/evaql/load_pdf.rst b/docs/source/reference/evaql/load_pdf.rst
@@ -0,0 +1,16 @@
+LOAD PDF
+==========
+
+.. _load-pdf:
+
+.. code:: mysql
+
+   LOAD PDF 'test_pdf.pdf' INTO MyPDFs;
+
+PDFs can be directly imported into a table, where the PDF document is segmented into pages and paragraphs.
+Each row in the table corresponds to a paragraph extracted from the PDF, and the resulting table includes columns for ``name``, ``page``, ``paragraph``, and ``data``.
+
+| ``name`` signifies the title of the uploaded PDF.
+| ``page`` signifies the specific page number from which the data is retrieved.
+| ``paragraph`` signifies the individual paragraph within a page from which the data is extracted.
+| ``data``  refers to the text extracted from the paragraph on the given page.
diff --git a/docs/source/reference/evaql/set_config.rst b/docs/source/reference/evaql/set_config.rst
@@ -1,4 +1,4 @@
-SET CONFIG
+SET CONFIGS
 ==============
 
 .. _set_config:
@@ -8,4 +8,8 @@ Sets the value of a configuration parameter to the passed value. Both `TO` and `
 .. code:: sql
 
     SET OPENAI_KEY TO "abc";
-    SET OPENAI_KEY = "abc";
+    SET OPENAI_KEY = "abc";
+
+.. note::
+
+    The `SET` command does not support `CONFIG` or `CONFIGS` as key names. This is because `CONFIG` and `CONFIGS` are reserved keywords.
diff --git a/docs/source/reference/evaql/show_config.rst b/docs/source/reference/evaql/show_config.rst
@@ -1,4 +1,4 @@
-SHOW CONFIG
+SHOW CONFIGS
 ==============
 
 .. _show_config:
@@ -9,3 +9,11 @@ Returns the value of a specified configuration parameter.
 
     SHOW <parameter_name>;
     SHOW OPENAI_KEY;
+
+.. _show_configs:
+
+In order to see all the configuration parameters, use the following command:
+
+.. code:: sql
+
+    SHOW CONFIGS;
diff --git a/docs/source/reference/vector_databases/weaviate.rst b/docs/source/reference/vector_databases/weaviate.rst
@@ -0,0 +1,31 @@
+Weaviate
+==========
+
+Weaviate is an open-source vector database designed for scalability and rich querying capabilities. It allows for semantic search, automated vectorization, and supports large language model (LLM) integration.
+The connection to Weaviate is based on the `weaviate-client <https://weaviate.io/developers/weaviate/client-libraries/python>`_ library.
+
+Dependency
+----------
+
+* weaviate-client
+
+Parameters
+----------
+
+To use Weaviate, you need an API key and a URL of your Weaviate instance. Here are the `instructions for setting up a Weaviate instance <https://weaviate.io/developers/weaviate/quickstart>`_. After setting up your instance, you will find the API key and URL on the Details tab in Weaviate Cloud Services (WCS) dashboard. These details are essential for establishing a connection to the Weaviate server.
+
+* `WEAVIATE_API_KEY` is the API key for your Weaviate instance.
+* `WEAVIATE_API_URL` is the URL of your Weaviate instance.
+
+The above values can be set either via the ``SET`` statement or through the OS environment variables ``WEAVIATE_API_KEY`` and ``WEAVIATE_API_URL``.
+
+Create Collection
+-----------------
+
+Weaviate uses collections (similar to 'classes') to store data. To create a collection in Weaviate, use the following SQL command in EvaDB:
+
+.. code-block:: sql
+
+   CREATE INDEX collection_name ON table_name (data) USING WEAVIATE;
+
+This command creates a collection in Weaviate with the specified name, linked to the table in EvaDB. You can also specify vectorizer settings and other configurations for the collection as needed.
diff --git a/evadb/binder/function_expression_binder.py b/evadb/binder/function_expression_binder.py
@@ -112,7 +112,7 @@ def bind_func_expr(binder: StatementBinder, node: FunctionExpression):
             if string_comparison_case_insensitive(node.name, "CHATGPT"):
                 # if the user didn't provide any API_KEY, check if we have one in the catalog
                 if "OPENAI_API_KEY" not in properties.keys():
-                    OpenAI_key = binder._catalog().get_configuration_catalog_value(
+                    openai_key = binder._catalog().get_configuration_catalog_value(
                         "OPENAI_API_KEY"
                     )
                     properties["openai_api_key"] = openai_key

diff --git a/evadb/binder/statement_binder.py b/evadb/binder/statement_binder.py
@@ -89,6 +89,7 @@ def _bind_create_function_statement(self, node: CreateFunctionStatement):
                 for column in all_column_list:
                     if column.name in predict_columns:
                         column.name = column.name + "_predictions"
+
                         outputs.append(column)
                     else:
                         inputs.append(column)
@@ -122,6 +123,22 @@ def _bind_create_function_statement(self, node: CreateFunctionStatement):
                 assert (
                     len(required_columns) == 0
                 ), f"Missing required {required_columns} columns for forecasting function."
+                outputs.extend(
+                    [
+                        ColumnDefinition(
+                            arg_map.get("predict", "y") + "-lo",
+                            ColumnType.FLOAT,
+                            None,
+                            None,
+                        ),
+                        ColumnDefinition(
+                            arg_map.get("predict", "y") + "-hi",
+                            ColumnType.FLOAT,
+                            None,
+                            None,
+                        ),
+                    ]
+                )
             else:
                 raise BinderError(
                     f"Unsupported type of function: {node.function_type}."