Skip to content

Commit

Permalink
feat: arXiv datasource addition, closes georgia-tech-db#1161
Browse files Browse the repository at this point in the history
  • Loading branch information
aayushacharya committed Dec 29, 2023
1 parent e5a9190 commit 614d676
Show file tree
Hide file tree
Showing 5 changed files with 246 additions and 0 deletions.
53 changes: 53 additions & 0 deletions docs/source/reference/databases/arxiv.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
Arxiv
==========

The connection to Arxiv is based on the `Arxiv <https://github.com/lukasschwab/arxiv.py>`_ library.

Dependency
----------

* arxiv


Parameters
----------

Required:

* ``query`` is the search query in the Arxiv repository. For example, Nuclear Physics.
* ``max_results`` is the max number of results to display. For example, 10.

Create Connection
-----------------

.. code-block:: text

   CREATE DATABASE arxiv_data WITH ENGINE = 'arxiv', PARAMETERS = {
        "query": "Nuclear Physics",
        "max_results": "10"
   };

Supported Tables
----------------

* ``search_results``: Lists the relevant articles in the arxiv repository. Check `table_column_info.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/arxiv/table_column_info.py>`_ for all the available columns in the table.

.. code-block:: sql

   SELECT * FROM arxiv_data.search_results;

Here is the query output:

.. code-block::

   +---------------------------------------------------+-----+---------------------------------------------+
   | search_results.title                              | ... | search_results.doi                          |
   |---------------------------------------------------|-----|---------------------------------------------|
   | Nuclear Symmetry Energy Extracted from Laborat... | ... | 10.1080/10619127.2017.1388681               |
   | Neutrino astrophysics and its connections to n... | ... | 10.1088/1742-6596/1056/1/012060             |
   | ...                                               | ... | ...                                         |
   +---------------------------------------------------+-----+---------------------------------------------+

.. note::

   Looking for another table from Arxiv? You can add a table mapping in `arxiv_handler.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/arxiv/arxiv_handler.py>`_, or simply raise a `Feature Request <https://github.com/georgia-tech-db/evadb/issues/new/choose>`_.
15 changes: 15 additions & 0 deletions evadb/third_party/databases/arxiv/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""arxiv integration"""
149 changes: 149 additions & 0 deletions evadb/third_party/databases/arxiv/arxiv_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import arxiv
import pandas as pd

from evadb.third_party.databases.arxiv.table_column_info import ARXIV_COLUMNS
from evadb.third_party.databases.types import (
DBHandler,
DBHandlerResponse,
DBHandlerStatus,
)


class ArxivHandler(DBHandler):
    """Database handler that exposes an arXiv search as a queryable table.

    The handler is configured with a ``query`` and ``max_results`` and serves
    the matching articles through a single table, ``search_results``, whose
    columns are listed in ``ARXIV_COLUMNS``.
    """

    def __init__(self, name: str, **kwargs):
        """
        Initialize the handler.
        Args:
            name (str): name of the DB handler instance
            **kwargs: arbitrary keyword arguments for establishing the connection.
                ``query`` is the arXiv search query (defaults to ``""``) and
                ``max_results`` is the maximum number of results to fetch; it
                is converted with ``int`` and defaults to ``0``.
        """
        super().__init__(name, **kwargs)
        self.query = kwargs.get("query", "")
        self.max_results = int(kwargs.get("max_results", 0))
        # Start out disconnected. Every method below tests ``self.connection``,
        # so the attribute must exist even when connect() has not been called.
        self.connection = None

    @property
    def supported_table(self):
        """
        Mapping of table name to its column list and row generator.
        A new mapping, holding a new not-yet-started generator, is built on
        every access; the search itself only runs once the generator is
        iterated.
        Returns:
            dict
        """

        def _arxiv_generator():
            search = arxiv.Search(
                query=self.query,
                max_results=self.max_results,
            )
            for result in self.connection.results(search):
                # One dict per article, keyed by the column names.
                yield {
                    property_name: getattr(result, property_name)
                    for property_name, _ in ARXIV_COLUMNS
                }

        mapping = {
            "search_results": {
                "columns": ARXIV_COLUMNS,
                "generator": _arxiv_generator(),
            },
        }
        return mapping

    def connect(self):
        """
        Set up the connection required by the handler.
        Returns:
            DBHandlerStatus
        """
        try:
            self.connection = arxiv.Client()
            return DBHandlerStatus(status=True)
        except Exception as e:
            return DBHandlerStatus(status=False, error=str(e))

    def disconnect(self):
        """
        Close any existing connections.
        This is a no-op: nothing is released here, and ``self.connection`` is
        left in place because a generator handed out by ``select`` reads it
        when it is iterated.
        """
        pass

    def check_connection(self) -> DBHandlerStatus:
        """
        Check connection to the handler.
        Returns:
            DBHandlerStatus
        """
        if self.connection:
            return DBHandlerStatus(status=True)
        else:
            return DBHandlerStatus(status=False, error="Not connected to the database.")

    def get_tables(self) -> DBHandlerResponse:
        """
        Return the list of tables in the database.
        Returns:
            DBHandlerResponse
        """
        if not self.connection:
            return DBHandlerResponse(data=None, error="Not connected to the internet.")

        try:
            tables_df = pd.DataFrame(
                list(self.supported_table.keys()), columns=["table_name"]
            )
            return DBHandlerResponse(data=tables_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def get_columns(self, table_name: str) -> DBHandlerResponse:
        """
        Returns the list of columns for the given table.
        Args:
            table_name (str): name of the table whose columns are to be retrieved.
        Returns:
            DBHandlerResponse
        """
        if not self.connection:
            return DBHandlerResponse(data=None, error="Not connected to the database.")
        try:
            # Read the property once: each access rebuilds the mapping.
            tables = self.supported_table
            if table_name not in tables:
                # Report unknown tables the same way select() does instead of
                # surfacing the bare KeyError text.
                return DBHandlerResponse(
                    data=None,
                    error="{} is not supported or does not exist.".format(table_name),
                )
            columns_df = pd.DataFrame(
                tables[table_name]["columns"], columns=["name", "dtype"]
            )
            return DBHandlerResponse(data=columns_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def select(self, table_name: str) -> DBHandlerResponse:
        """
        Returns a generator that yields the data from the given table.
        Args:
            table_name (str): name of the table whose data is to be retrieved.
        Returns:
            DBHandlerResponse
        """
        if not self.connection:
            return DBHandlerResponse(data=None, error="Not connected to the database.")
        try:
            # Read the property once: each access rebuilds the mapping.
            tables = self.supported_table
            if table_name not in tables:
                return DBHandlerResponse(
                    data=None,
                    error="{} is not supported or does not exist.".format(table_name),
                )
            # TODO: Projection column trimming optimization opportunity
            return DBHandlerResponse(
                data=None,
                data_generator=tables[table_name]["generator"],
            )
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

27 changes: 27 additions & 0 deletions evadb/third_party/databases/arxiv/table_column_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Schema of the ``search_results`` table: one [column name, Python type] pair
# per column, in the order the columns are reported.
ARXIV_COLUMNS = [
    ["title", str],
    ["entry_id", str],
    ["published", str],
    ["updated", str],
    ["summary", str],
    ["authors", object],
    ["comment", str],
    ["primary_category", str],
    ["journal_ref", str],
    ["doi", str],
]
2 changes: 2 additions & 0 deletions evadb/third_party/databases/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ def _get_database_handler(engine: str, **kwargs):
return mod.SnowFlakeDbHandler(engine, **kwargs)
elif engine == "github":
return mod.GithubHandler(engine, **kwargs)
elif engine == "arxiv":
return mod.ArxivHandler(engine,**kwargs)
elif engine == "hackernews":
return mod.HackernewsSearchHandler(engine, **kwargs)
elif engine == "slack":
Expand Down

0 comments on commit 614d676

Please sign in to comment.