From d4540dc048004585dd386d8456fb4af7bb49c708 Mon Sep 17 00:00:00 2001 From: Kaushik Ravichandran Date: Thu, 16 Nov 2023 02:33:22 -0500 Subject: [PATCH 1/9] hackernews support --- .../databases/hackernews/__init__.py | 15 ++ .../hackernews/hackernews_handler.py | 148 ++++++++++++++++++ .../databases/hackernews/table_column_info.py | 24 +++ evadb/third_party/databases/interface.py | 2 + .../long/test_hackernews_datasource .py | 55 +++++++ 5 files changed, 244 insertions(+) create mode 100644 evadb/third_party/databases/hackernews/__init__.py create mode 100644 evadb/third_party/databases/hackernews/hackernews_handler.py create mode 100644 evadb/third_party/databases/hackernews/table_column_info.py create mode 100644 test/integration_tests/long/test_hackernews_datasource .py diff --git a/evadb/third_party/databases/hackernews/__init__.py b/evadb/third_party/databases/hackernews/__init__.py new file mode 100644 index 000000000..705157094 --- /dev/null +++ b/evadb/third_party/databases/hackernews/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""hackernews search integration""" diff --git a/evadb/third_party/databases/hackernews/hackernews_handler.py b/evadb/third_party/databases/hackernews/hackernews_handler.py new file mode 100644 index 000000000..2b5e5a8c6 --- /dev/null +++ b/evadb/third_party/databases/hackernews/hackernews_handler.py @@ -0,0 +1,148 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import github +import pandas as pd +import requests +import json + +from evadb.third_party.databases.hackernews.table_column_info import HACKERNEWS_COLUMNS +from evadb.third_party.databases.types import ( + DBHandler, + DBHandlerResponse, + DBHandlerStatus, +) + + +class HackernewsSearchHandler(DBHandler): + connection = lambda x: requests.get("https://www.google.com/").status_code == 200 + def __init__(self, name: str, **kwargs): + """ + Initialize the handler. + Args: + name (str): name of the DB handler instance + **kwargs: arbitrary keyword arguments for establishing the connection. + """ + super().__init__(name) + self.query = kwargs.get("query", "") + self.tags = kwargs.get("tags", "") + + @property + def supported_table(self): + def _hackernews_topics_generator(): + url = "http://hn.algolia.com/api/v1/search?" 
+ url += ("query=" + self.query) + url += ("" if self.tags == "" else "&tags=" + self.tags) + response = requests.get(url) + if (response.status_code != 200): + raise Exception("Could not reach website.") + json_result = response.content + dict_result = json.loads(json_result) + for row in dict_result: + yield { + property_name: row[property_name] + for property_name, _ in HACKERNEWS_COLUMNS + } + + mapping = { + "search_results": { + "columns": HACKERNEWS_COLUMNS, + "generator": _hackernews_topics_generator(), + }, + } + return mapping + + def connect(self): + """ + Set up the connection required by the handler. + Returns: + DBHandlerStatus + """ + return DBHandlerStatus(status=True) + + def disconnect(self): + """ + Close any existing connections. + """ + pass + + def check_connection(self) -> DBHandlerStatus: + """ + Check connection to the handler. + Returns: + DBHandlerStatus + """ + if self.connection(): + return DBHandlerStatus(status=True) + else: + return DBHandlerStatus(status=False, error="Not connected to the internet.") + + def get_tables(self) -> DBHandlerResponse: + """ + Return the list of tables in the database. + Returns: + DBHandlerResponse + """ + if not self.connection(): + return DBHandlerResponse(data=None, error="Not connected to the internet.") + + try: + tables_df = pd.DataFrame( + list(self.supported_table.keys()), columns=["table_name"] + ) + return DBHandlerResponse(data=tables_df) + except Exception as e: + return DBHandlerResponse(data=None, error=str(e)) + + def get_columns(self, table_name: str) -> DBHandlerResponse: + """ + Returns the list of columns for the given table. + Args: + table_name (str): name of the table whose columns are to be retrieved. 
+ Returns: + DBHandlerResponse + """ + if not self.connection(): + return DBHandlerResponse(data=None, error="Not connected to the internet.") + try: + columns_df = pd.DataFrame( + self.supported_table[table_name]["columns"], columns=["name", "dtype"] + ) + return DBHandlerResponse(data=columns_df) + except Exception as e: + return DBHandlerResponse(data=None, error=str(e)) + + def select(self, table_name: str) -> DBHandlerResponse: + """ + Returns a generator that yields the data from the given table. + Args: + table_name (str): name of the table whose data is to be retrieved. + Returns: + DBHandlerResponse + """ + if not self.connection: + return DBHandlerResponse(data=None, error="Not connected to the database.") + try: + if table_name not in self.supported_table: + return DBHandlerResponse( + data=None, + error="{} is not supported or does not exist.".format(table_name), + ) + + return DBHandlerResponse( + data=None, + data_generator=self.supported_table[table_name]["generator"], + ) + except Exception as e: + return DBHandlerResponse(data=None, error=str(e)) diff --git a/evadb/third_party/databases/hackernews/table_column_info.py b/evadb/third_party/databases/hackernews/table_column_info.py new file mode 100644 index 000000000..ed2eb1a87 --- /dev/null +++ b/evadb/third_party/databases/hackernews/table_column_info.py @@ -0,0 +1,24 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Autogenerated by ChatGPT from https://github.com/PyGithub/PyGithub/blob/main/github/NamedUser.py +HACKERNEWS_COLUMNS = [ + ["title", str], + ["url", str], + ["author", str], + ["points", int], + ["story_text", str], + ["num_comments", int] +] diff --git a/evadb/third_party/databases/interface.py b/evadb/third_party/databases/interface.py index 5f8c4c2ac..cacb4110f 100644 --- a/evadb/third_party/databases/interface.py +++ b/evadb/third_party/databases/interface.py @@ -48,6 +48,8 @@ def _get_database_handler(engine: str, **kwargs): return mod.SnowFlakeDbHandler(engine, **kwargs) elif engine == "github": return mod.GithubHandler(engine, **kwargs) + elif engine == "hackernews": + return mod.HackernewsSearchHandler(engine, **kwargs) elif engine == "slack": return mod.SlackHandler(engine, **kwargs) else: diff --git a/test/integration_tests/long/test_hackernews_datasource .py b/test/integration_tests/long/test_hackernews_datasource .py new file mode 100644 index 000000000..8219b04a0 --- /dev/null +++ b/test/integration_tests/long/test_hackernews_datasource .py @@ -0,0 +1,55 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest +from test.util import get_evadb_for_testing + +import pytest + +from evadb.server.command_handler import execute_query_fetch_all +from evadb.third_party.databases.github.table_column_info import STARGAZERS_COLUMNS + + +@pytest.mark.notparallel +class HackernewsDataSourceTest(unittest.TestCase): + def setUp(self): + self.evadb = get_evadb_for_testing() + # reset the catalog manager before running each test + self.evadb.catalog().reset() + + def tearDown(self): + execute_query_fetch_all(self.evadb, "DROP DATABASE IF EXISTS hackernews_data;") + + @pytest.mark.xfail(reason="Flaky testcase due to `bad request` error message") + def test_should_run_select_query_in_github(self): + # Create database. + params = { + "query": "EVADB", + "tags": "story", + } + query = f"""CREATE DATABASE hackernews_data + WITH ENGINE = "hackernews", + PARAMETERS = {params};""" + execute_query_fetch_all(self.evadb, query) + + query = "SELECT * FROM hackernews_data.search_results LIMIT 5;" + batch = execute_query_fetch_all(self.evadb, query) + self.assertEqual(len(batch), 10) + expected_column = list( + ["search_results.{}".format(col) for col, _ in STARGAZERS_COLUMNS] + ) + self.assertEqual(batch.columns, expected_column) + +if __name__ == "__main__": + unittest.main() From a62f86eacf607ac8bd6871a23a5e032a8910b877 Mon Sep 17 00:00:00 2001 From: Kaushik Ravichandran Date: Thu, 16 Nov 2023 02:37:40 -0500 Subject: [PATCH 2/9] change default --- evadb/third_party/databases/hackernews/hackernews_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evadb/third_party/databases/hackernews/hackernews_handler.py b/evadb/third_party/databases/hackernews/hackernews_handler.py index 2b5e5a8c6..dbf082d82 100644 --- a/evadb/third_party/databases/hackernews/hackernews_handler.py +++ b/evadb/third_party/databases/hackernews/hackernews_handler.py @@ -43,7 +43,7 @@ def supported_table(self): def _hackernews_topics_generator(): url = "http://hn.algolia.com/api/v1/search?" 
url += ("query=" + self.query) - url += ("" if self.tags == "" else "&tags=" + self.tags) + url += ("&tags=" + ("story" if self.tags == "" else + self.tags)) # search stories by default response = requests.get(url) if (response.status_code != 200): raise Exception("Could not reach website.") From 024eca48e1536e43bdcd95657c5771d4888852cd Mon Sep 17 00:00:00 2001 From: Kaushik Ravichandran Date: Thu, 16 Nov 2023 02:51:53 -0500 Subject: [PATCH 3/9] linter fix --- .../databases/hackernews/hackernews_handler.py | 12 ++++++++---- .../databases/hackernews/table_column_info.py | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/evadb/third_party/databases/hackernews/hackernews_handler.py b/evadb/third_party/databases/hackernews/hackernews_handler.py index dbf082d82..da056d20f 100644 --- a/evadb/third_party/databases/hackernews/hackernews_handler.py +++ b/evadb/third_party/databases/hackernews/hackernews_handler.py @@ -12,10 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import json + import github import pandas as pd import requests -import json from evadb.third_party.databases.hackernews.table_column_info import HACKERNEWS_COLUMNS from evadb.third_party.databases.types import ( @@ -27,6 +28,7 @@ class HackernewsSearchHandler(DBHandler): connection = lambda x: requests.get("https://www.google.com/").status_code == 200 + def __init__(self, name: str, **kwargs): """ Initialize the handler. @@ -42,10 +44,12 @@ def __init__(self, name: str, **kwargs): def supported_table(self): def _hackernews_topics_generator(): url = "http://hn.algolia.com/api/v1/search?" 
- url += ("query=" + self.query) - url += ("&tags=" + ("story" if self.tags == "" else + self.tags)) # search stories by default + url += "query=" + self.query + url += "&tags=" + ( + "story" if self.tags == "" else +self.tags + ) # search stories by default response = requests.get(url) - if (response.status_code != 200): + if response.status_code != 200: raise Exception("Could not reach website.") json_result = response.content dict_result = json.loads(json_result) diff --git a/evadb/third_party/databases/hackernews/table_column_info.py b/evadb/third_party/databases/hackernews/table_column_info.py index ed2eb1a87..d9e2bfcc2 100644 --- a/evadb/third_party/databases/hackernews/table_column_info.py +++ b/evadb/third_party/databases/hackernews/table_column_info.py @@ -20,5 +20,5 @@ ["author", str], ["points", int], ["story_text", str], - ["num_comments", int] + ["num_comments", int], ] From 7554ef97d04345d10bd14bd425685e855c593e04 Mon Sep 17 00:00:00 2001 From: Kaushik Ravichandran Date: Thu, 16 Nov 2023 02:58:46 -0500 Subject: [PATCH 4/9] fix typo --- ...st_hackernews_datasource .py => test_hackernews_datasource.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/integration_tests/long/{test_hackernews_datasource .py => test_hackernews_datasource.py} (100%) diff --git a/test/integration_tests/long/test_hackernews_datasource .py b/test/integration_tests/long/test_hackernews_datasource.py similarity index 100% rename from test/integration_tests/long/test_hackernews_datasource .py rename to test/integration_tests/long/test_hackernews_datasource.py From add43c14a26377b614b788f977715804f94d5752 Mon Sep 17 00:00:00 2001 From: Kaushik Ravichandran Date: Thu, 16 Nov 2023 03:02:49 -0500 Subject: [PATCH 5/9] linter fix --- evadb/third_party/databases/hackernews/hackernews_handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evadb/third_party/databases/hackernews/hackernews_handler.py 
b/evadb/third_party/databases/hackernews/hackernews_handler.py index da056d20f..4f999a2e2 100644 --- a/evadb/third_party/databases/hackernews/hackernews_handler.py +++ b/evadb/third_party/databases/hackernews/hackernews_handler.py @@ -14,7 +14,6 @@ # limitations under the License. import json -import github import pandas as pd import requests @@ -27,7 +26,8 @@ class HackernewsSearchHandler(DBHandler): - connection = lambda x: requests.get("https://www.google.com/").status_code == 200 + def connection (): + return requests.get("https://www.google.com/").status_code == 200 def __init__(self, name: str, **kwargs): """ From fbacdacbcf273d781084399aa4951248a2f02b7e Mon Sep 17 00:00:00 2001 From: Kaushik Ravichandran Date: Thu, 16 Nov 2023 03:09:57 -0500 Subject: [PATCH 6/9] linter fix --- evadb/third_party/databases/hackernews/hackernews_handler.py | 2 +- test/integration_tests/long/test_hackernews_datasource.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/evadb/third_party/databases/hackernews/hackernews_handler.py b/evadb/third_party/databases/hackernews/hackernews_handler.py index 4f999a2e2..11025b27e 100644 --- a/evadb/third_party/databases/hackernews/hackernews_handler.py +++ b/evadb/third_party/databases/hackernews/hackernews_handler.py @@ -26,7 +26,7 @@ class HackernewsSearchHandler(DBHandler): - def connection (): + def connection(): return requests.get("https://www.google.com/").status_code == 200 def __init__(self, name: str, **kwargs): diff --git a/test/integration_tests/long/test_hackernews_datasource.py b/test/integration_tests/long/test_hackernews_datasource.py index 8219b04a0..19bfe1058 100644 --- a/test/integration_tests/long/test_hackernews_datasource.py +++ b/test/integration_tests/long/test_hackernews_datasource.py @@ -51,5 +51,6 @@ def test_should_run_select_query_in_github(self): ) self.assertEqual(batch.columns, expected_column) + if __name__ == "__main__": unittest.main() From 9bbc473d395b301e4638c3c015173f9c75179630 Mon Sep 
17 00:00:00 2001 From: Kaushik Ravichandran Date: Thu, 16 Nov 2023 10:14:24 -0500 Subject: [PATCH 7/9] fixes --- evadb/third_party/databases/hackernews/table_column_info.py | 1 - setup.py | 3 +++ test/integration_tests/long/test_hackernews_datasource.py | 6 +++--- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/evadb/third_party/databases/hackernews/table_column_info.py b/evadb/third_party/databases/hackernews/table_column_info.py index d9e2bfcc2..aae50e18c 100644 --- a/evadb/third_party/databases/hackernews/table_column_info.py +++ b/evadb/third_party/databases/hackernews/table_column_info.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Autogenerated by ChatGPT from https://github.com/PyGithub/PyGithub/blob/main/github/NamedUser.py HACKERNEWS_COLUMNS = [ ["title", str], ["url", str], diff --git a/setup.py b/setup.py index 3334fa836..ab5571211 100644 --- a/setup.py +++ b/setup.py @@ -124,6 +124,8 @@ def read(path, encoding="utf-8"): xgboost_libs = ["flaml[automl]"] +hackernews_libs = ["requests"] + forecasting_libs = [ "statsforecast", # MODEL TRAIN AND FINE TUNING "neuralforecast", # MODEL TRAIN AND FINE TUNING @@ -176,6 +178,7 @@ def read(path, encoding="utf-8"): "sklearn": sklearn_libs, "xgboost": xgboost_libs, "forecasting": forecasting_libs, + "hackernews": hackernews_libs, # everything except ray, qdrant, ludwig and postgres. The first three fail on pyhton 3.11. 
"dev": dev_libs + vision_libs + document_libs + function_libs + notebook_libs + forecasting_libs + sklearn_libs + imagegen_libs + xgboost_libs } diff --git a/test/integration_tests/long/test_hackernews_datasource.py b/test/integration_tests/long/test_hackernews_datasource.py index 19bfe1058..0cc3293d4 100644 --- a/test/integration_tests/long/test_hackernews_datasource.py +++ b/test/integration_tests/long/test_hackernews_datasource.py @@ -18,7 +18,7 @@ import pytest from evadb.server.command_handler import execute_query_fetch_all -from evadb.third_party.databases.github.table_column_info import STARGAZERS_COLUMNS +from evadb.third_party.databases.hackernews.table_column_info import HACKERNEWS_COLUMNS @pytest.mark.notparallel @@ -32,7 +32,7 @@ def tearDown(self): execute_query_fetch_all(self.evadb, "DROP DATABASE IF EXISTS hackernews_data;") @pytest.mark.xfail(reason="Flaky testcase due to `bad request` error message") - def test_should_run_select_query_in_github(self): + def test_should_run_select_query_in_hackernews(self): # Create database. 
params = { "query": "EVADB", @@ -47,7 +47,7 @@ def test_should_run_select_query_in_github(self): batch = execute_query_fetch_all(self.evadb, query) self.assertEqual(len(batch), 10) expected_column = list( - ["search_results.{}".format(col) for col, _ in STARGAZERS_COLUMNS] + ["search_results.{}".format(col) for col, _ in HACKERNEWS_COLUMNS] ) self.assertEqual(batch.columns, expected_column) From 762f1af7d199f28fe0f35e33f7b13040db259772 Mon Sep 17 00:00:00 2001 From: Kaushik Ravichandran Date: Sat, 18 Nov 2023 14:33:53 -0500 Subject: [PATCH 8/9] add docs --- docs/_toc.yml | 1 + .../source/reference/databases/hackernews.rst | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 docs/source/reference/databases/hackernews.rst diff --git a/docs/_toc.yml b/docs/_toc.yml index ca191ce42..eb57363f4 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -79,6 +79,7 @@ parts: - file: source/reference/databases/clickhouse - file: source/reference/databases/github - file: source/reference/databases/snowflake + - file: source/reference/databases/hackernews - file: source/reference/vector_databases/index title: Vector Databases diff --git a/docs/source/reference/databases/hackernews.rst b/docs/source/reference/databases/hackernews.rst new file mode 100644 index 000000000..a4542a93c --- /dev/null +++ b/docs/source/reference/databases/hackernews.rst @@ -0,0 +1,44 @@ +Hackernews +========== + +The connection to Hackernews is based on the `Algolia Hackernews <https://hn.algolia.com/api>`_ API. + +Dependency +---------- + +* requests + + +Parameters +---------- + +Required: + +* ``query`` is the search query for getting the results. + +Optional: + +* ``tags`` is the tag used for filtering the query results. Check `available tags <https://hn.algolia.com/api>`_ to see a list of available filter tags. + +Create Connection +----------------- + +..
code-block:: text + + CREATE DATABASE hackernews_data WITH ENGINE = 'hackernews', PARAMETERS = { + "owner": "EVADB", + "repo": "story" + }; + +Supported Tables +---------------- + +* ``search_results``: Lists the search query results. Check `table_column_info.py `_ for all the available columns in the table. + +.. code-block:: sql + + SELECT * FROM hackernews_data.search_results LIMIT 3; + +.. note:: + + Looking for another table from Hackernews? Please raise a `Feature Request `_. From 10096b1d06adb4cdd0c786cdd15b20bf74c24cf7 Mon Sep 17 00:00:00 2001 From: Kaushik Ravichandran Date: Sun, 19 Nov 2023 12:17:41 -0500 Subject: [PATCH 9/9] change doc --- docs/source/reference/databases/hackernews.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/reference/databases/hackernews.rst b/docs/source/reference/databases/hackernews.rst index a4542a93c..d96112e81 100644 --- a/docs/source/reference/databases/hackernews.rst +++ b/docs/source/reference/databases/hackernews.rst @@ -26,8 +26,8 @@ Create Connection .. code-block:: text CREATE DATABASE hackernews_data WITH ENGINE = 'hackernews', PARAMETERS = { - "owner": "EVADB", - "repo": "story" + "query": "EVADB", + "tags": "story" }; Supported Tables