From 92646d64e2c76b47389c65bcee3494d53c5818da Mon Sep 17 00:00:00 2001
From: Kapil Malik <kapilmalik@adobe.com>
Date: Fri, 17 Nov 2023 20:19:49 +0530
Subject: [PATCH] YoutubeChannelTranscriptReader implementation

---
 llama_hub/library.json                        |   8 +
 .../youtube_channel_transcript/README.md      |  62 ++++++++
 .../youtube_channel_transcript/__init__.py    |   6 +
 llama_hub/youtube_channel_transcript/base.py  |  75 +++++++++
 .../requirements.txt                          |   2 +
 pyproject.toml                                |   1 +
 test_requirements.txt                         |   1 +
 .../__init__.py                               |   0
 .../test_base.py                              | 142 ++++++++++++++++++
 9 files changed, 297 insertions(+)
 create mode 100644 llama_hub/youtube_channel_transcript/README.md
 create mode 100644 llama_hub/youtube_channel_transcript/__init__.py
 create mode 100644 llama_hub/youtube_channel_transcript/base.py
 create mode 100644 llama_hub/youtube_channel_transcript/requirements.txt
 create mode 100644 tests/tests_youtube_channel_transcript/__init__.py
 create mode 100644 tests/tests_youtube_channel_transcript/test_base.py

diff --git a/llama_hub/library.json b/llama_hub/library.json
index a80108067d..bbeef2661c 100644
--- a/llama_hub/library.json
+++ b/llama_hub/library.json
@@ -303,6 +303,14 @@
       "video"
     ]
   },
+  "YoutubeChannelTranscriptReader": {
+    "id": "youtube_channel_transcript",
+    "author": "kapil-malik",
+    "keywords": [
+      "video",
+      "youtube"
+    ]
+  },
   "MakeWrapper": {
     "id": "make_com"
   },
diff --git a/llama_hub/youtube_channel_transcript/README.md b/llama_hub/youtube_channel_transcript/README.md
new file mode 100644
index 0000000000..02ea323905
--- /dev/null
+++ b/llama_hub/youtube_channel_transcript/README.md
@@ -0,0 +1,62 @@
+# Youtube Channel Transcript Loader
+
+This loader fetches the text transcript of all YouTube videos for a given YouTube channel.
+
+It is based on the [Youtube Transcript Loader](https://llamahub.ai/l/youtube_transcript)
+
+## Requirements
+
+### Google API Credentials
+To use this loader, you'll need Google API credentials.
+
+Before proceeding, you must set up a Google Cloud Platform (GCP) project, enable the YouTube Data API v3,
+and obtain API credentials.
+
+1. **Set Up a GCP Project**:
+    * Go to the [Google Cloud Console](https://console.cloud.google.com/).
+    * Create a new project or select an existing one. 
+2. **Enable the YouTube Data API v3**:
+    * In the Google Cloud Console, navigate to the [APIs & Services](https://console.cloud.google.com/apis) > [Library](https://console.cloud.google.com/apis/library) page. 
+    * Search for "YouTube Data API v3" and enable it for your project.
+3. **Create API Credentials**:
+    * Still in the [APIs & Services](https://console.cloud.google.com/apis) section, navigate to [Credentials](https://console.cloud.google.com/apis/credentials).
+    * Create API credentials by clicking on "Create Credentials" and selecting "API Key."
+
+### Python packages
+
+You will then need to install youtube_transcript_api and google-api-python-client
+
+```
+pip install youtube_transcript_api
+pip install google-api-python-client
+```
+
+
+## Usage
+
+You instantiate the loader and then pass the Google API key and YouTube channel id into `load_data`:
+
+```python
+from llama_hub.youtube_channel_transcript import YoutubeChannelTranscriptReader
+
+loader = YoutubeChannelTranscriptReader()
+documents = loader.load_data(google_api_key='YOUR_API_KEY', yt_channel_id='UCeRjipR4_SsCddq9VZ2AeKg')
+```
+
+If the channel contains videos that do not have a transcript, the loader skips them and logs a warning for each one.
+
+
+### Old Usage
+
+Use this syntax for earlier versions of llama_index, in which llama_hub loaders were loaded via a separate download process:
+
+```python
+from llama_index import download_loader
+
+YoutubeChannelTranscriptReader = download_loader("YoutubeChannelTranscriptReader")
+
+loader = YoutubeChannelTranscriptReader()
+documents = loader.load_data(google_api_key='YOUR_API_KEY', yt_channel_id='UCeRjipR4_SsCddq9VZ2AeKg')
+```
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/run-llama/llama-hub/tree/main) for examples.
diff --git a/llama_hub/youtube_channel_transcript/__init__.py b/llama_hub/youtube_channel_transcript/__init__.py
new file mode 100644
index 0000000000..3170089e62
--- /dev/null
+++ b/llama_hub/youtube_channel_transcript/__init__.py
@@ -0,0 +1,6 @@
+"""Init file."""
+from llama_hub.youtube_channel_transcript.base import (
+    YoutubeChannelTranscriptReader
+)
+
+__all__ = ["YoutubeChannelTranscriptReader"]
diff --git a/llama_hub/youtube_channel_transcript/base.py b/llama_hub/youtube_channel_transcript/base.py
new file mode 100644
index 0000000000..37b7356efa
--- /dev/null
+++ b/llama_hub/youtube_channel_transcript/base.py
@@ -0,0 +1,109 @@
+"""Simple Reader that reads transcripts of all YouTube videos for a YouTube channel."""
+import logging
+from importlib.util import find_spec
+from typing import List
+
+from llama_index import download_loader
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+logger = logging.getLogger(__name__)
+
+
+class YoutubeChannelTranscriptReader(BaseReader):
+    """YouTube channel reader. Reads transcripts for all videos from a YouTube channel.
+
+    Video links are listed with the YouTube Data API v3 client (``googleapiclient``) and
+    each video's transcript is then loaded with ``YoutubeTranscriptReader``.
+    """
+
+    def __init__(self) -> None:
+        """Check that ``googleapiclient`` is installed and create the transcript loader.
+
+        Raises:
+            ImportError: If the ``googleapiclient`` package cannot be found.
+        """
+        if find_spec("googleapiclient") is None:
+            raise ImportError(
+                "Missing package: googleapiclient.\n"
+                "Please `pip install google-api-python-client` to use this Reader"
+            )
+
+        try:
+            from llama_hub.utils import import_loader
+
+            YoutubeTranscriptReader = import_loader("YoutubeTranscriptReader")
+        except ImportError:
+            # Fall back to download_loader when llama_hub.utils cannot be imported.
+            YoutubeTranscriptReader = download_loader("YoutubeTranscriptReader")
+
+        self._youtube_transcript_loader = YoutubeTranscriptReader()
+
+    def load_data(self, google_api_key: str, yt_channel_id: str) -> List[Document]:
+        """Load data for all videos in the YouTube channel.
+
+        Args:
+            google_api_key (str): Google API key.
+            yt_channel_id (str): YouTube channel ID.
+
+        Returns:
+            List[Document]: Documents produced by the transcript loader. Videos whose
+                transcript fails to load are logged as warnings and skipped.
+
+        Raises:
+            ValueError: If no channel is returned for ``yt_channel_id``.
+        """
+        yt_links = self.get_channel_video_links(google_api_key=google_api_key, yt_channel_id=yt_channel_id)
+        logger.info("Found %s YouTube videos from the channel", len(yt_links))
+
+        documents = []
+        # loading documents for one video at a time because it might fail for videos without transcripts
+        for yt_link in yt_links:
+            try:
+                link_documents = self._youtube_transcript_loader.load_data(ytlinks=[yt_link])
+            except Exception:
+                # Deliberate best-effort: one failing video must not abort the whole channel.
+                logger.warning("Failed to load data for video: %s", yt_link, exc_info=True)
+            else:
+                documents.extend(link_documents)
+
+        logger.info("Loaded %s documents from the channel", len(documents))
+        return documents
+
+    @staticmethod
+    def get_channel_video_links(google_api_key: str, yt_channel_id: str) -> List[str]:
+        """Return the watch URLs of all videos in the channel's uploads playlist.
+
+        Args:
+            google_api_key (str): Google API key.
+            yt_channel_id (str): YouTube channel ID.
+
+        Returns:
+            List[str]: One ``https://www.youtube.com/watch?v=<id>`` link per playlist item.
+
+        Raises:
+            ValueError: If the channels response has no items for ``yt_channel_id``.
+        """
+        import googleapiclient.discovery
+
+        youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=google_api_key)
+        channels_response = youtube.channels().list(part='contentDetails', id=yt_channel_id).execute()
+        # A missing or empty "items" would otherwise surface as an opaque KeyError/IndexError.
+        channels = channels_response.get('items')
+        if not channels:
+            raise ValueError(f"No YouTube channel found for id: {yt_channel_id!r}")
+        playlist_id = channels[0]['contentDetails']['relatedPlaylists']['uploads']
+        logger.info("Found playlist id: %s", playlist_id)
+
+        video_ids = []
+        next_page_token = None
+        # Follow nextPageToken until the API stops returning one.
+        while True:
+            items_response = youtube.playlistItems().list(part='contentDetails', playlistId=playlist_id, maxResults=50,
+                                                          pageToken=next_page_token).execute()
+            video_ids.extend(item['contentDetails']['videoId'] for item in items_response['items'])
+            next_page_token = items_response.get('nextPageToken')
+            if not next_page_token:
+                break
+
+        return ['https://www.youtube.com/watch?v=' + video_id for video_id in video_ids]
diff --git a/llama_hub/youtube_channel_transcript/requirements.txt b/llama_hub/youtube_channel_transcript/requirements.txt
new file mode 100644
index 0000000000..720425a1ed
--- /dev/null
+++ b/llama_hub/youtube_channel_transcript/requirements.txt
@@ -0,0 +1,2 @@
+youtube_transcript_api~=0.5.0
+google-api-python-client>=2.108.0
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index a868dc3ffa..f40389ca3e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ atlassian-python-api = "*"
 html2text = "*"
 psutil = "*"
 retrying = "*"
+google-api-python-client = ">=2.108.0"
 
 [tool.poetry.dev-dependencies]
 pytest = "7.2.1"
diff --git a/test_requirements.txt b/test_requirements.txt
index 215dfaa2a4..7e7e3851fb 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -13,6 +13,7 @@ llama-index>=0.6.9
 atlassian-python-api
 html2text
 olefile
+google-api-python-client>=2.108.0
 
 # hotfix
 psutil
diff --git a/tests/tests_youtube_channel_transcript/__init__.py b/tests/tests_youtube_channel_transcript/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/tests_youtube_channel_transcript/test_base.py b/tests/tests_youtube_channel_transcript/test_base.py
new file mode 100644
index 0000000000..f5bc9339a2
--- /dev/null
+++ b/tests/tests_youtube_channel_transcript/test_base.py
@@ -0,0 +1,142 @@
+import unittest
+from importlib.util import find_spec
+from unittest.mock import patch, call
+
+import pytest
+from llama_index.readers.schema.base import Document
+
+from llama_hub.youtube_channel_transcript import YoutubeChannelTranscriptReader
+
+dependencies_available = ((find_spec("googleapiclient") is not None) and
+                          (find_spec("youtube_transcript_api") is not None))
+# Canned channels().list(...).execute() payload: "uploads" is the id of the channel's uploads playlist.
+CHANNELS_RESPONSE = {
+    "items": [
+        {
+            "contentDetails": {
+                "relatedPlaylists": {
+                    "uploads": "test_playlist_id"
+                }
+            }
+        }
+    ]
+}
+# Canned playlistItems().list(...).execute() payload: a single page (no nextPageToken) with two videos.
+PLAYLISTS_RESPONSE = {
+    "items": [
+        {
+            "contentDetails": {
+                "videoId": "test_video_id_1"
+            }
+        },
+        {
+            "contentDetails": {
+                "videoId": "test_video_id_2"
+            }
+        }
+    ]
+}
+
+
+def dummy_load_pages(ytlinks: "list[str]"):  # quoted: a bare list[str] fails at def time on Python 3.8
+    """Stand-in for ``YoutubeTranscriptReader.load_data``: one fake Document per link."""
+    # Text mirrors the link so assertions can tell which video each document came from.
+    fake_documents = [Document(text=f"Transcript for {ytlink}") for ytlink in ytlinks]
+    return fake_documents
+
+
+def dummy_load_pages_with_exception(ytlinks: "list[str]"):  # quoted: bare list[str] fails on Python 3.8
+    """Stand-in for ``load_data`` that succeeds only for test_video_id_2 and raises for any other link."""
+    documents = []
+    for ytlink in ytlinks:
+        if ytlink != "https://www.youtube.com/watch?v=test_video_id_2":
+            raise Exception("Failed to load transcript")
+        documents.append(Document(text=f"Transcript for {ytlink}"))
+    return documents
+
+
+class TestYoutubeChannelTranscriptReader(unittest.TestCase):
+    """Tests for YoutubeChannelTranscriptReader; load_data tests patch googleapiclient.discovery and the transcript loader."""
+    @pytest.mark.skipif(
+        not dependencies_available,
+        reason="Skipping since google-api-python-client or youtube_transcript_api is not installed",
+    )
+    def test_yt_channel_transcript_reader_init(self):
+        # construction without arguments must succeed when the dependencies are installed
+        YoutubeChannelTranscriptReader()
+
+    @pytest.mark.skipif(
+        not dependencies_available,
+        reason="Skipping since google-api-python-client or youtube_transcript_api is not installed",
+    )
+    def test_yt_channel_transcript_reader_load_data_invalid_args(self):
+        youtube_channel_transcript_reader = YoutubeChannelTranscriptReader()
+        # load_data has no defaults, so calling it bare must raise TypeError naming both parameters
+        with pytest.raises(
+                TypeError,
+                match="missing 2 required positional arguments: 'google_api_key' and 'yt_channel_id'",
+        ):
+            youtube_channel_transcript_reader.load_data()
+
+    @pytest.mark.skipif(
+        not dependencies_available,
+        reason="Skipping since google-api-python-client or youtube_transcript_api is not installed",
+    )
+    @patch("llama_hub.youtube_transcript.base.YoutubeTranscriptReader.load_data")
+    def test_yt_channel_transcript_reader_load_data(self, mock_load_data):
+        with patch("googleapiclient.discovery") as mock_discovery:
+            youtube_channel_transcript_reader = YoutubeChannelTranscriptReader()
+            # canned API responses: one channel lookup, then one page of two playlist items (no nextPageToken)
+            mock_build = mock_discovery.build.return_value
+            mock_build.channels.return_value.list.return_value.execute.return_value = CHANNELS_RESPONSE
+            mock_build.playlistItems.return_value.list.return_value.execute.return_value = PLAYLISTS_RESPONSE
+
+            mock_load_data.side_effect = dummy_load_pages
+
+            documents = youtube_channel_transcript_reader.load_data(google_api_key="test_key",
+                                                                    yt_channel_id="test_channel_id")
+            # the client is built once and each endpoint is queried exactly once
+            mock_discovery.build.assert_called_once_with('youtube', 'v3', developerKey='test_key')
+            mock_discovery.build.return_value.channels.assert_called_once()
+            mock_discovery.build.return_value.channels.return_value.list.assert_called_once_with(
+                part='contentDetails', id='test_channel_id')
+            mock_discovery.build.return_value.channels.return_value.list.return_value.execute.assert_called_once()
+            mock_discovery.build.return_value.playlistItems.assert_called_once()
+            mock_discovery.build.return_value.playlistItems.return_value.list.assert_called_once_with(
+                part='contentDetails', playlistId='test_playlist_id', maxResults=50, pageToken=None)
+            mock_discovery.build.return_value.playlistItems.return_value.list.return_value.execute.assert_called_once()
+            # one transcript-loader call per video, in playlist order
+            assert mock_load_data.call_count == 2
+            mock_load_data.assert_has_calls([
+                call(ytlinks=['https://www.youtube.com/watch?v=test_video_id_1']),
+                call(ytlinks=['https://www.youtube.com/watch?v=test_video_id_2'])])
+
+            assert len(documents) == 2
+            assert documents[0].text == "Transcript for https://www.youtube.com/watch?v=test_video_id_1"
+            assert documents[1].text == "Transcript for https://www.youtube.com/watch?v=test_video_id_2"
+
+    @pytest.mark.skipif(
+        not dependencies_available,
+        reason="Skipping since google-api-python-client or youtube_transcript_api is not installed",
+    )
+    @patch("llama_hub.youtube_transcript.base.YoutubeTranscriptReader.load_data")
+    def test_yt_channel_transcript_reader_load_data_with_exceptions(self, mock_load_data):
+        with patch("googleapiclient.discovery") as mock_discovery:
+            youtube_channel_transcript_reader = YoutubeChannelTranscriptReader()
+            # same canned API responses as above: two videos on a single page
+            mock_build = mock_discovery.build.return_value
+            mock_build.channels.return_value.list.return_value.execute.return_value = CHANNELS_RESPONSE
+            mock_build.playlistItems.return_value.list.return_value.execute.return_value = PLAYLISTS_RESPONSE
+            # the fake loader raises for test_video_id_1 and succeeds for test_video_id_2
+            mock_load_data.side_effect = dummy_load_pages_with_exception
+
+            documents = youtube_channel_transcript_reader.load_data(google_api_key="test_key",
+                                                                    yt_channel_id="test_channel_id")
+            # the failing video is still attempted; the loop carries on to the next link
+            assert mock_load_data.call_count == 2
+            mock_load_data.assert_has_calls([
+                call(ytlinks=['https://www.youtube.com/watch?v=test_video_id_1']),
+                call(ytlinks=['https://www.youtube.com/watch?v=test_video_id_2'])])
+            # only the video whose transcript loaded contributes a document
+            assert len(documents) == 1
+            assert documents[0].text == "Transcript for https://www.youtube.com/watch?v=test_video_id_2"