From 92646d64e2c76b47389c65bcee3494d53c5818da Mon Sep 17 00:00:00 2001 From: Kapil Malik Date: Fri, 17 Nov 2023 20:19:49 +0530 Subject: [PATCH] YoutubeChannelTranscriptReader implementation --- llama_hub/library.json | 8 + .../youtube_channel_transcript/README.md | 62 ++++++++ .../youtube_channel_transcript/__init__.py | 6 + llama_hub/youtube_channel_transcript/base.py | 75 +++++++++ .../requirements.txt | 2 + pyproject.toml | 1 + test_requirements.txt | 1 + .../__init__.py | 0 .../test_base.py | 142 ++++++++++++++++++ 9 files changed, 297 insertions(+) create mode 100644 llama_hub/youtube_channel_transcript/README.md create mode 100644 llama_hub/youtube_channel_transcript/__init__.py create mode 100644 llama_hub/youtube_channel_transcript/base.py create mode 100644 llama_hub/youtube_channel_transcript/requirements.txt create mode 100644 tests/tests_youtube_channel_transcript/__init__.py create mode 100644 tests/tests_youtube_channel_transcript/test_base.py diff --git a/llama_hub/library.json b/llama_hub/library.json index a80108067d..bbeef2661c 100644 --- a/llama_hub/library.json +++ b/llama_hub/library.json @@ -303,6 +303,14 @@ "video" ] }, + "YoutubeChannelTranscriptReader": { + "id": "youtube_channel_transcript", + "author": "kapil-malik", + "keywords": [ + "video", + "youtube" + ] + }, "MakeWrapper": { "id": "make_com" }, diff --git a/llama_hub/youtube_channel_transcript/README.md b/llama_hub/youtube_channel_transcript/README.md new file mode 100644 index 0000000000..02ea323905 --- /dev/null +++ b/llama_hub/youtube_channel_transcript/README.md @@ -0,0 +1,62 @@ +# Youtube Channel Transcript Loader + +This loader fetches the text transcript of all YouTube videos for a given YouTube channel. + +It is based on the [Youtube Transcript Loader](https://llamahub.ai/l/youtube_transcript) + +## Requirements + +### Google API Credentials +To use this loader, you'll need Google API credentials. 
+ +You need to set up a Google Cloud Platform (GCP) project, enable the YouTube Data API v3, +and obtain API credentials before proceeding. + +1. **Set Up a GCP Project**: + * Go to the [Google Cloud Console](https://console.cloud.google.com/). + * Create a new project or select an existing one. +2. **Enable the YouTube Data API v3**: + * In the Google Cloud Console, navigate to the [APIs & Services](https://console.cloud.google.com/apis) > [Library](https://console.cloud.google.com/apis/library) page. + * Search for "YouTube Data API v3" and enable it for your project. +3. **Create API Credentials**: + * Still in the [APIs & Services](https://console.cloud.google.com/apis) section, navigate to [Credentials](https://console.cloud.google.com/apis/credentials). + * Create API credentials by clicking on "Create Credentials" and selecting "API Key." + +### Python packages + +You will then need to install `youtube_transcript_api` and `google-api-python-client`: + +``` +pip install youtube_transcript_api +pip install google-api-python-client +``` + + +## Usage + +You instantiate the loader and then pass the Google API key and YouTube channel id into `load_data`: + +```python +from llama_hub.youtube_channel_transcript import YoutubeChannelTranscriptReader + +loader = YoutubeChannelTranscriptReader() +documents = loader.load_data(google_api_key='YOUR_API_KEY', yt_channel_id='UCeRjipR4_SsCddq9VZ2AeKg') +``` + +If the channel contains any YouTube videos that do not have a transcript, the loader skips them and logs a warning for each. 
+ + +### Old Usage + +Use this syntax for earlier versions of llama_index, in which llama_hub loaders were loaded via a separate download process: + +```python +from llama_index import download_loader + +YoutubeChannelTranscriptReader = download_loader("YoutubeChannelTranscriptReader") + +loader = YoutubeChannelTranscriptReader() +documents = loader.load_data(google_api_key='YOUR_API_KEY', yt_channel_id='UCeRjipR4_SsCddq9VZ2AeKg') +``` + +This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/run-llama/llama-hub/tree/main) for examples. diff --git a/llama_hub/youtube_channel_transcript/__init__.py b/llama_hub/youtube_channel_transcript/__init__.py new file mode 100644 index 0000000000..3170089e62 --- /dev/null +++ b/llama_hub/youtube_channel_transcript/__init__.py @@ -0,0 +1,6 @@ +"""Init file.""" +from llama_hub.youtube_channel_transcript.base import ( + YoutubeChannelTranscriptReader +) + +__all__ = ["YoutubeChannelTranscriptReader"] diff --git a/llama_hub/youtube_channel_transcript/base.py b/llama_hub/youtube_channel_transcript/base.py new file mode 100644 index 0000000000..37b7356efa --- /dev/null +++ b/llama_hub/youtube_channel_transcript/base.py @@ -0,0 +1,75 @@ +"""Simple Reader that reads transcripts of all YouTube videos for a YouTube channel.""" +import logging +from importlib.util import find_spec +from typing import List + +from llama_index import download_loader +from llama_index.readers.base import BaseReader +from llama_index.readers.schema.base import Document + + +class YoutubeChannelTranscriptReader(BaseReader): + """YouTube channel reader. Reads transcripts for all videos from a YouTube channel. 
+ """ + def __init__(self) -> None: + if find_spec("googleapiclient") is None: + raise ImportError( + "Missing package: googleapiclient.\n" + "Please `pip install google-api-python-client` to use this Reader" + ) + + try: + from llama_hub.utils import import_loader + + YoutubeTranscriptReader = import_loader("YoutubeTranscriptReader") + except ImportError: + YoutubeTranscriptReader = download_loader("YoutubeTranscriptReader") + + self._youtube_transcript_loader = YoutubeTranscriptReader() + + def load_data(self, google_api_key: str, yt_channel_id: str) -> List[Document]: + """Load data for all videos in the YouTube channel. + + Args: + google_api_key (str): Google API key. + yt_channel_id (str): YouTube channel ID. + + """ + documents = [] + yt_links = self.get_channel_video_links(google_api_key=google_api_key, yt_channel_id=yt_channel_id) + logging.info("Found %s YouTube videos from the channel", len(yt_links)) + + # loading documents for one video at a time because it might fail for videos without transcripts + for yt_link in yt_links: + try: + link_documents = self._youtube_transcript_loader.load_data(ytlinks=[yt_link]) + documents.extend(link_documents) + except Exception: + logging.warning("Failed to load data for video: %s", yt_link, exc_info=True) + + logging.info("Loaded %s documents from the channel", len(documents)) + return documents + + @staticmethod + def get_channel_video_links(google_api_key: str, yt_channel_id: str) -> List[str]: + import googleapiclient.discovery + + youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=google_api_key) + playlists_response = youtube.channels().list(part='contentDetails', id=yt_channel_id).execute() + playlist_id = playlists_response['items'][0]['contentDetails']['relatedPlaylists']['uploads'] + logging.info("Found playlist id: %s", playlist_id) + + next_page_token = None + item_ids = [] + + while True: + items_response = youtube.playlistItems().list(part='contentDetails', playlistId=playlist_id, 
maxResults=50, + pageToken=next_page_token).execute() + items = items_response['items'] + video_ids = [item['contentDetails']['videoId'] for item in items] + item_ids.extend(video_ids) + next_page_token = items_response.get('nextPageToken') + if not next_page_token: + break + + return ['https://www.youtube.com/watch?v=' + item_id for item_id in item_ids] diff --git a/llama_hub/youtube_channel_transcript/requirements.txt b/llama_hub/youtube_channel_transcript/requirements.txt new file mode 100644 index 0000000000..720425a1ed --- /dev/null +++ b/llama_hub/youtube_channel_transcript/requirements.txt @@ -0,0 +1,2 @@ +youtube_transcript_api~=0.5.0 +google-api-python-client>=2.108.0 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a868dc3ffa..f40389ca3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ atlassian-python-api = "*" html2text = "*" psutil = "*" retrying = "*" +google-api-python-client = ">=2.108.0" [tool.poetry.dev-dependencies] pytest = "7.2.1" diff --git a/test_requirements.txt b/test_requirements.txt index 215dfaa2a4..7e7e3851fb 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -13,6 +13,7 @@ llama-index>=0.6.9 atlassian-python-api html2text olefile +google-api-python-client>=2.108.0 # hotfix psutil diff --git a/tests/tests_youtube_channel_transcript/__init__.py b/tests/tests_youtube_channel_transcript/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/tests_youtube_channel_transcript/test_base.py b/tests/tests_youtube_channel_transcript/test_base.py new file mode 100644 index 0000000000..f5bc9339a2 --- /dev/null +++ b/tests/tests_youtube_channel_transcript/test_base.py @@ -0,0 +1,142 @@ +import unittest +from importlib.util import find_spec +from unittest.mock import patch, call + +import pytest +from llama_index.readers.schema.base import Document + +from llama_hub.youtube_channel_transcript import YoutubeChannelTranscriptReader + +dependencies_available = 
((find_spec("googleapiclient") is not None) and + (find_spec("youtube_transcript_api") is not None)) + +CHANNELS_RESPONSE = { + "items": [ + { + "contentDetails": { + "relatedPlaylists": { + "uploads": "test_playlist_id" + } + } + } + ] +} + +PLAYLISTS_RESPONSE = { + "items": [ + { + "contentDetails": { + "videoId": "test_video_id_1" + } + }, + { + "contentDetails": { + "videoId": "test_video_id_2" + } + } + ] +} + + +def dummy_load_pages(ytlinks: list[str]): + documents = [] + for ytlink in ytlinks: + documents.append(Document(text=f"Transcript for {ytlink}")) + return documents + + +def dummy_load_pages_with_exception(ytlinks: list[str]): + documents = [] + for ytlink in ytlinks: + if ytlink == "https://www.youtube.com/watch?v=test_video_id_2": + documents.append(Document(text=f"Transcript for {ytlink}")) + else: + raise Exception("Failed to load transcript") + return documents + + +class TestYoutubeChannelTranscriptReader(unittest.TestCase): + + @pytest.mark.skipif( + not dependencies_available, + reason="Skipping since google-api-python-client or youtube_transcript_api is not installed", + ) + def test_yt_channel_transcript_reader_init(self): + # test w/o args + YoutubeChannelTranscriptReader() + + @pytest.mark.skipif( + not dependencies_available, + reason="Skipping since google-api-python-client or youtube_transcript_api is not installed", + ) + def test_yt_channel_transcript_reader_load_data_invalid_args(self): + youtube_channel_transcript_reader = YoutubeChannelTranscriptReader() + + with pytest.raises( + TypeError, + match="missing 2 required positional arguments: 'google_api_key' and 'yt_channel_id'", + ): + youtube_channel_transcript_reader.load_data() + + @pytest.mark.skipif( + not dependencies_available, + reason="Skipping since google-api-python-client or youtube_transcript_api is not installed", + ) + @patch("llama_hub.youtube_transcript.base.YoutubeTranscriptReader.load_data") + def test_yt_channel_transcript_reader_load_data(self, mock_load_data): 
+ with patch("googleapiclient.discovery") as mock_discovery: + youtube_channel_transcript_reader = YoutubeChannelTranscriptReader() + + mock_build = mock_discovery.build.return_value + mock_build.channels.return_value.list.return_value.execute.return_value = CHANNELS_RESPONSE + mock_build.playlistItems.return_value.list.return_value.execute.return_value = PLAYLISTS_RESPONSE + + mock_load_data.side_effect = dummy_load_pages + + documents = youtube_channel_transcript_reader.load_data(google_api_key="test_key", + yt_channel_id="test_channel_id") + + mock_discovery.build.assert_called_once_with('youtube', 'v3', developerKey='test_key') + mock_discovery.build.return_value.channels.assert_called_once() + mock_discovery.build.return_value.channels.return_value.list.assert_called_once_with( + part='contentDetails', id='test_channel_id') + mock_discovery.build.return_value.channels.return_value.list.return_value.execute.assert_called_once() + mock_discovery.build.return_value.playlistItems.assert_called_once() + mock_discovery.build.return_value.playlistItems.return_value.list.assert_called_once_with( + part='contentDetails', playlistId='test_playlist_id', maxResults=50, pageToken=None) + mock_discovery.build.return_value.playlistItems.return_value.list.return_value.execute.assert_called_once() + + assert mock_load_data.call_count == 2 + mock_load_data.assert_has_calls([ + call(ytlinks=['https://www.youtube.com/watch?v=test_video_id_1']), + call(ytlinks=['https://www.youtube.com/watch?v=test_video_id_2'])]) + + assert len(documents) == 2 + assert documents[0].text == "Transcript for https://www.youtube.com/watch?v=test_video_id_1" + assert documents[1].text == "Transcript for https://www.youtube.com/watch?v=test_video_id_2" + + @pytest.mark.skipif( + not dependencies_available, + reason="Skipping since google-api-python-client or youtube_transcript_api is not installed", + ) + @patch("llama_hub.youtube_transcript.base.YoutubeTranscriptReader.load_data") + def 
test_yt_channel_transcript_reader_load_data_with_exceptions(self, mock_load_data): + with patch("googleapiclient.discovery") as mock_discovery: + youtube_channel_transcript_reader = YoutubeChannelTranscriptReader() + + mock_build = mock_discovery.build.return_value + mock_build.channels.return_value.list.return_value.execute.return_value = CHANNELS_RESPONSE + mock_build.playlistItems.return_value.list.return_value.execute.return_value = PLAYLISTS_RESPONSE + + mock_load_data.side_effect = dummy_load_pages_with_exception + + documents = youtube_channel_transcript_reader.load_data(google_api_key="test_key", + yt_channel_id="test_channel_id") + + assert mock_load_data.call_count == 2 + mock_load_data.assert_has_calls([ + call(ytlinks=['https://www.youtube.com/watch?v=test_video_id_1']), + call(ytlinks=['https://www.youtube.com/watch?v=test_video_id_2'])]) + + assert len(documents) == 1 + assert documents[0].text == "Transcript for https://www.youtube.com/watch?v=test_video_id_2"