georgia-tech-db · eric-ming2 · Nov 25, 2023 · Nov 25, 2023 · Nov 25, 2023 · Nov 25, 2023
diff --git a/docs/source/reference/databases/youtube.rst b/docs/source/reference/databases/youtube.rst
@@ -0,0 +1,63 @@
+YouTube
+==========
+
+The connection to YouTube is based on the `YouTube Data API <https://developers.google.com/youtube/v3>`_.
+
+Dependency
+----------
+
+* pytube
+
+
+Parameters
+----------
+
+Required:
+
+* ``youtube_token`` is your API key. Instructions for obtaining this key can be found `here <https://developers.google.com/youtube/v3/getting-started>`_.
+
+Optional:
+
+* ``youtube_urls`` is a comma-separated list of YouTube URLs. The pytube package is used to parse different YouTube URL formats.
+
+* ``search_query`` is the query you are searching for. Your request can use the Boolean NOT (-) and OR (|) operators to exclude or find videos associated with several search terms. Note that the pipe character must be URL-escaped with %7C.
+* ``max_results`` is the maximum number of items that should be returned in the result set. Acceptable values are 0 to 50, inclusive. The default value is 5.
+
+Create Connection
+-----------------
+
+.. code-block:: text
+
+   CREATE DATABASE youtube_data WITH ENGINE = 'youtube', PARAMETERS = {
+        "youtube_token": <INSERT API KEY>,
+        "youtube_urls": "https://www.youtube.com/watch?v=7__r4FVj-EI, https://youtu.be/BYVZh5kqaFg"
+   };
+
+or
+
+.. code-block:: text
+
+   CREATE DATABASE youtube_data WITH ENGINE = 'youtube', PARAMETERS = {
+        "youtube_token": <INSERT API KEY>,
+        "search_query": "evadb",
+        "max_results": 3
+   };
+
+Supported Tables
+----------------
+
+* ``snippet``: Contains information such as the time the YouTube video was published, the video title, the description, the channel name and id, thumbnails, tags, and categories.
+
+.. code-block:: sql
+
+   SELECT * FROM youtube_data.snippet
+
+* ``statistics``: Contains the view count, like count, favorite count, and comment count.
+
+.. code-block:: sql
+
+   SELECT * FROM youtube_data.statistics
+
+.. note::
+
+   Looking for another table from YouTube? Please raise a `Feature Request <https://github.com/georgia-tech-db/evadb/issues/new/choose>`_.
diff --git a/evadb/third_party/databases/interface.py b/evadb/third_party/databases/interface.py
@@ -52,6 +52,8 @@ def _get_database_handler(engine: str, **kwargs):
         return mod.HackernewsSearchHandler(engine, **kwargs)
     elif engine == "slack":
         return mod.SlackHandler(engine, **kwargs)
+    elif engine == "youtube":
+        return mod.YoutubeHandler(engine, **kwargs)
     else:
         raise NotImplementedError(f"Engine {engine} is not supported")
 

diff --git a/evadb/third_party/databases/youtube/__init__.py b/evadb/third_party/databases/youtube/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2018-2023 EvaDB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""third party/applications/youtube"""
diff --git a/evadb/third_party/databases/youtube/youtube_handler.py b/evadb/third_party/databases/youtube/youtube_handler.py
@@ -0,0 +1,228 @@
+# coding=utf-8
+# Copyright 2018-2023 EvaDB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+import requests
+from pytube import extract
+
+from evadb.third_party.types import DBHandler, DBHandlerResponse, DBHandlerStatus
+
+
+class YoutubeHandler(DBHandler):
+    SNIPPET_COLUMNS = [
+        "publishedAt",
+        "channelId",
+        "title",
+        "description",
+        "thumbnails",
+        "channelTitle",
+        "tags",
+        "categoryId",
+    ]
+    STATISTICS_COLUMNS = [
+        "viewCount",
+        "likeCount",
+        "favoriteCount",
+        "commentCount",
+    ]
+
+    def __init__(self, name: str, **kwargs):
+        """
+        Initialize the handler.
+        Args:
+            name (str): name of the DB handler instance
+            **kwargs: arbitrary keyword arguments for establishing the connection.
+        """
+        super().__init__(name)
+        urls = kwargs.get("youtube_urls")
+        if urls is not None:
+            self.url_list = str(urls).strip().split(",")
+        else:
+            self.url_list = []
+        query = kwargs.get("search_query")
+        if query is not None:
+            self.query = str(query)
+        else:
+            self.query = None
+        maxResults = kwargs.get("max_results")
+        if maxResults is not None:
+            self.maxResults = int(maxResults)
+        else:
+            self.maxResults = None
+        self.api_key = str(kwargs.get("youtube_token"))
+
+    def connect(self):
+        """
+        Set up the connection required by the handler.
+        Returns:
+            DBHandlerStatus
+        """
+        try:
+            response = self._api_call(
+                "https://www.youtube.com/watch?v=BYVZh5kqaFg", "snippet"
+            )
+            if response.status_code == 200:
+                return DBHandlerStatus(status=True)
+            else:
+                return DBHandlerStatus(status=False, error=response.json())
+        except Exception as e:
+            return DBHandlerStatus(status=False, error=str(e))
+
+    def disconnect(self):
+        """
+        Close any existing connections.
+        """
+        pass
+
+    def check_connection(self) -> DBHandlerStatus:
+        """
+        Check connection to the handler.
+        Returns:
+            DBHandlerStatus
+        """
+        try:
+            response = self._api_call(
+                "https://www.youtube.com/watch?v=BYVZh5kqaFg", "snippet"
+            )
+            if response.status_code == 200:
+                return DBHandlerStatus(status=True)
+            else:
+                return DBHandlerStatus(status=False, error=response.json())
+        except Exception as e:
+            return DBHandlerStatus(status=False, error=str(e))
+
+    def get_tables(self) -> DBHandlerResponse:
+        """
+        Return the list of tables in the database.
+        Returns:
+            DBHandlerResponse
+        """
+        tables_df = pd.DataFrame(["snippet", "statistics"], columns=["table_name"])
+        return DBHandlerResponse(data=tables_df)
+
+    def get_columns(self, table_name: str) -> DBHandlerResponse:
+        """
+        Returns the list of columns for the given table.
+        Args:
+            table_name (str): name of the table whose columns are to be retrieved.
+        Returns:
+            DBHandlerResponse
+        """
+        if table_name == "snippet":
+            columns_df = pd.DataFrame(self.SNIPPET_COLUMNS, columns=["column_name"])
+            return DBHandlerResponse(data=columns_df)
+        elif table_name == "statistics":
+            columns_df = pd.DataFrame(self.STATISTICS_COLUMNS, columns=["column_name"])
+            return DBHandlerResponse(data=columns_df)
+        else:
+            return DBHandlerResponse(status=False, error="Invalid table name.")
+
+    def _api_call(self, url: str, table_name: str) -> requests.models.Response:
+        """
+        Calls the YouTube Data API with a specific video ID.
+        Args:
+            url (str): YouTube url of video we are retrieving information of.
+            table_name (str): name of table to determine which information to retrieve.
+        Returns:
+            requests.models.Response
+        """
+
+        video_id = extract.video_id(url)
+        url = f"https://youtube.googleapis.com/youtube/v3/videos?part={table_name}&id={video_id}&key={self.api_key}"
+        headers = {"Accept": "application/json"}
+        response = requests.request("GET", url, headers=headers, data={})
+
+        return response
+
+    def _search_api_call(self) -> requests.models.Response:
+        """
+        Calls the YouTube Data API with a search query.
+        Returns:
+            requests.models.Response
+        """
+
+        maxResults = "" if self.maxResults is None else f"&maxResults={self.maxResults}"
+        url = f"https://youtube.googleapis.com/youtube/v3/search?part=snippet{maxResults}&q={self.query}&type=video&key={self.api_key}"
+        headers = {"Accept": "application/json"}
+        response = requests.request("GET", url, headers=headers, data={})
+
+        return response
+
+    def _get_snippet_info(self) -> pd.DataFrame:
+        """
+        Retrieves snippet information and converts it into a DataFrame.
+        Returns:
+            pd.DataFrame
+        """
+        df = pd.DataFrame(columns=self.SNIPPET_COLUMNS)
+        for url_index in range(len(self.url_list)):
+            url = self.url_list[url_index]
+            response = self._api_call(url, "snippet").json()
+            if response["pageInfo"]["totalResults"] == 0:
+                df.loc[url_index] = {}
+            else:
+                snippet = response["items"][0]["snippet"]
+                df.loc[url_index] = snippet
+        if self.query is not None:
+            response = self._search_api_call().json()
+            results = len(response["items"])
+            for result_index in range(results):
+                snippet = response["items"][result_index]["snippet"]
+                df.loc[len(df)] = snippet
+        return df
+
+    def _get_statistics_info(self) -> pd.DataFrame:
+        """
+        Retrieves statistics information and converts it into a DataFrame.
+        Returns:
+            pd.DataFrame
+        """
+        df = pd.DataFrame(columns=self.STATISTICS_COLUMNS)
+        for url_index in range(len(self.url_list)):
+            url = self.url_list[url_index]
+            response = self._api_call(url, "statistics").json()
+            if response["pageInfo"]["totalResults"] == 0:
+                df.loc[url_index] = {}
+            else:
+                statistics = response["items"][0]["statistics"]
+                df.loc[url_index] = statistics
+        if self.query is not None:
+            response = self._search_api_call().json()
+            results = len(response["items"])
+            for result_index in range(results):
+                videoId = response["items"][result_index]["id"]["videoId"]
+                url = f"https://www.youtube.com/watch?v={videoId}"
+                stat_response = self._api_call(url, "statistics").json()
+                statistics = stat_response["items"][0]["statistics"]
+                df.loc[len(df)] = statistics
+        return df
+
+    def select(self, table_name: str) -> DBHandlerResponse:
+        """
+        Returns a generator that yields the data from the given table.
+        Args:
+            table_name (str): name of the table whose data is to be retrieved.
+        Returns:
+            DBHandlerResponse
+        """
+
+        if table_name == "snippet":
+            snippet_df = self._get_snippet_info()
+            return DBHandlerResponse(data=snippet_df)
+        elif table_name == "statistics":
+            statistics_df = self._get_statistics_info()
+            return DBHandlerResponse(data=statistics_df)
+        else:
+            return DBHandlerResponse(data=None, error="Invalid table name.")