Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

YoutubeChannelTranscriptReader implementation #643

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,14 @@
"video"
]
},
"YoutubeChannelTranscriptReader": {
"id": "youtube_channel_transcript",
"author": "kapil-malik",
"keywords": [
"video",
"youtube"
]
},
"MakeWrapper": {
"id": "make_com"
},
Expand Down
62 changes: 62 additions & 0 deletions llama_hub/youtube_channel_transcript/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Youtube Channel Transcript Loader

This loader fetches the text transcript of all YouTube videos for a given YouTube channel.

It is based on the [Youtube Transcript Loader](https://llamahub.ai/l/youtube_transcript)

## Requirements

### Google API Credentials
To use this loader, you'll need Google API credentials.

You need to set up a Google Cloud Platform (GCP) project, enable the YouTube Data API v3,
and obtain API credentials before proceeding.

1. **Set Up a GCP Project**:
* Go to the [Google Cloud Console](https://console.cloud.google.com/).
* Create a new project or select an existing one.
2. **Enable the YouTube Data API v3**:
* In the Google Cloud Console, navigate to the [APIs & Services](https://console.cloud.google.com/apis) > [Library](https://console.cloud.google.com/apis/library) page.
* Search for "YouTube Data API v3" and enable it for your project.
3. **Create API Credentials**:
* Still in the [APIs & Services](https://console.cloud.google.com/apis) section, navigate to [Credentials](https://console.cloud.google.com/apis/credentials).
* Create API credentials by clicking on "Create Credentials" and selecting "API Key."

### Python packages

You will then need to install `youtube_transcript_api` and `google-api-python-client`:

```
pip install youtube_transcript_api
pip install google-api-python-client
```


## Usage

You instantiate the loader and then pass the Google API key and YouTube channel id into `load_data`:

```python
from llama_hub.youtube_channel_transcript import YoutubeChannelTranscriptReader

loader = YoutubeChannelTranscriptReader()
documents = loader.load_data(google_api_key='YOUR_API_KEY', yt_channel_id='UCeRjipR4_SsCddq9VZ2AeKg')
```

If the channel contains any YouTube videos that do not have a transcript, the loader skips them and logs a warning for each one.


### Old Usage

Use this syntax for earlier versions of llama_index, where llama_hub loaders were loaded via a separate download process:

```python
from llama_index import download_loader

YoutubeChannelTranscriptReader = download_loader("YoutubeChannelTranscriptReader")

loader = YoutubeChannelTranscriptReader()
documents = loader.load_data(google_api_key='YOUR_API_KEY', yt_channel_id='UCeRjipR4_SsCddq9VZ2AeKg')
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/run-llama/llama-hub/tree/main) for examples.
6 changes: 6 additions & 0 deletions llama_hub/youtube_channel_transcript/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Init file."""
from llama_hub.youtube_channel_transcript.base import (
YoutubeChannelTranscriptReader
)

__all__ = ["YoutubeChannelTranscriptReader"]
75 changes: 75 additions & 0 deletions llama_hub/youtube_channel_transcript/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Simple Reader that reads transcripts of all YouTube videos for a YouTube channel."""
import logging
from importlib.util import find_spec
from typing import List

from llama_index import download_loader
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class YoutubeChannelTranscriptReader(BaseReader):
    """YouTube channel reader.

    Reads transcripts for all videos in the "uploads" playlist of a YouTube
    channel. Video links are discovered through the YouTube Data API v3 and
    each link is then handed to ``YoutubeTranscriptReader``.
    """

    def __init__(self) -> None:
        """Check that the Google API client is installed and build the transcript loader.

        Raises:
            ImportError: If the ``googleapiclient`` package cannot be found.
        """
        if find_spec("googleapiclient") is None:
            raise ImportError(
                "Missing package: googleapiclient.\n"
                "Please `pip install google-api-python-client` to use this Reader"
            )

        # Prefer the loader shipped with the installed llama_hub package; fall
        # back to llama_index's download_loader when that import path fails.
        try:
            from llama_hub.utils import import_loader

            YoutubeTranscriptReader = import_loader("YoutubeTranscriptReader")
        except ImportError:
            YoutubeTranscriptReader = download_loader("YoutubeTranscriptReader")

        self._youtube_transcript_loader = YoutubeTranscriptReader()

    def load_data(self, google_api_key: str, yt_channel_id: str) -> List[Document]:
        """Load data for all videos in the YouTube channel.

        Args:
            google_api_key (str): Google API key.
            yt_channel_id (str): YouTube channel ID.

        Returns:
            List[Document]: Documents returned by the transcript loader for
                every video that could be loaded. Videos whose load raises
                are logged as warnings and skipped.

        Raises:
            ValueError: If the channel lookup returns no channel.
        """
        # Use a module-named logger so this library does not log through (and
        # implicitly configure) the root logger.
        logger = logging.getLogger(__name__)

        yt_links = self.get_channel_video_links(
            google_api_key=google_api_key, yt_channel_id=yt_channel_id
        )
        logger.info("Found %s YouTube videos from the channel", len(yt_links))

        documents = []
        # Load one video at a time: a single video without a transcript must
        # not prevent the remaining videos from being loaded.
        for yt_link in yt_links:
            try:
                link_documents = self._youtube_transcript_loader.load_data(
                    ytlinks=[yt_link]
                )
            except Exception:
                # Deliberate best-effort: record the failure and keep going.
                logger.warning(
                    "Failed to load data for video: %s", yt_link, exc_info=True
                )
            else:
                documents.extend(link_documents)

        logger.info("Loaded %s documents from the channel", len(documents))
        return documents

    @staticmethod
    def get_channel_video_links(google_api_key: str, yt_channel_id: str) -> List[str]:
        """Return watch URLs for every video in the channel's uploads playlist.

        Args:
            google_api_key (str): Google API key.
            yt_channel_id (str): YouTube channel ID.

        Returns:
            List[str]: ``https://www.youtube.com/watch?v=<videoId>`` links in
                the order the playlist pages return them.

        Raises:
            ValueError: If the channel lookup response contains no items.
        """
        import googleapiclient.discovery

        logger = logging.getLogger(__name__)

        youtube = googleapiclient.discovery.build(
            "youtube", "v3", developerKey=google_api_key
        )
        channels_response = (
            youtube.channels().list(part="contentDetails", id=yt_channel_id).execute()
        )

        # Guard against a missing or empty "items" list so an unknown channel
        # id yields a clear error instead of a bare KeyError/IndexError.
        channels = channels_response.get("items") or []
        if not channels:
            raise ValueError(
                f"No YouTube channel found for channel id: {yt_channel_id}"
            )

        playlist_id = channels[0]["contentDetails"]["relatedPlaylists"]["uploads"]
        logger.info("Found playlist id: %s", playlist_id)

        next_page_token = None
        item_ids = []

        # Follow nextPageToken until the API stops returning one.
        while True:
            items_response = (
                youtube.playlistItems()
                .list(
                    part="contentDetails",
                    playlistId=playlist_id,
                    maxResults=50,
                    pageToken=next_page_token,
                )
                .execute()
            )
            item_ids.extend(
                item["contentDetails"]["videoId"]
                for item in items_response.get("items", [])
            )
            next_page_token = items_response.get("nextPageToken")
            if not next_page_token:
                break

        return ["https://www.youtube.com/watch?v=" + item_id for item_id in item_ids]
2 changes: 2 additions & 0 deletions llama_hub/youtube_channel_transcript/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
youtube_transcript_api~=0.5.0
google-api-python-client>=2.108.0
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ atlassian-python-api = "*"
html2text = "*"
psutil = "*"
retrying = "*"
google-api-python-client = ">=2.108.0"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can remove the dependency from pyproject


[tool.poetry.dev-dependencies]
pytest = "7.2.1"
Expand Down
1 change: 1 addition & 0 deletions test_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ llama-index>=0.6.9
atlassian-python-api
html2text
olefile
google-api-python-client>=2.108.0

# hotfix
psutil
Expand Down
Empty file.
142 changes: 142 additions & 0 deletions tests/tests_youtube_channel_transcript/test_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import unittest
from importlib.util import find_spec
from unittest.mock import patch, call

import pytest
from llama_index.readers.schema.base import Document

from llama_hub.youtube_channel_transcript import YoutubeChannelTranscriptReader

# True only when both optional third-party packages used by the reader are importable.
dependencies_available = all(
    find_spec(package) is not None
    for package in ("googleapiclient", "youtube_transcript_api")
)

# Canned channels().list() payload: a single channel exposing its uploads playlist id.
CHANNELS_RESPONSE = {
    "items": [
        {"contentDetails": {"relatedPlaylists": {"uploads": "test_playlist_id"}}},
    ]
}

# Canned playlistItems().list() payload: one page with two videos and no nextPageToken.
PLAYLISTS_RESPONSE = {
    "items": [
        {"contentDetails": {"videoId": video_id}}
        for video_id in ("test_video_id_1", "test_video_id_2")
    ]
}


def dummy_load_pages(ytlinks: list[str]):
    """Stand-in for the transcript loader: one fake transcript Document per link."""
    return [Document(text=f"Transcript for {link}") for link in ytlinks]


def dummy_load_pages_with_exception(ytlinks: list[str]):
    """Stand-in for the transcript loader that only succeeds for the second test video.

    Any other link raises, mimicking a video whose transcript cannot be loaded.
    """
    loadable_link = "https://www.youtube.com/watch?v=test_video_id_2"
    loaded = []
    for link in ytlinks:
        if link != loadable_link:
            raise Exception("Failed to load transcript")
        loaded.append(Document(text=f"Transcript for {link}"))
    return loaded


class TestYoutubeChannelTranscriptReader(unittest.TestCase):
    """Tests for YoutubeChannelTranscriptReader with the Google client and transcript loader mocked."""

    # Shared skip marker: every test needs both optional packages installed.
    _requires_dependencies = pytest.mark.skipif(
        not dependencies_available,
        reason="Skipping since google-api-python-client or youtube_transcript_api is not installed",
    )

    # Patch target for the per-video transcript loading call.
    _LOAD_DATA_TARGET = "llama_hub.youtube_transcript.base.YoutubeTranscriptReader.load_data"

    # Watch URLs the reader is expected to build from PLAYLISTS_RESPONSE.
    _VIDEO_URL_1 = "https://www.youtube.com/watch?v=test_video_id_1"
    _VIDEO_URL_2 = "https://www.youtube.com/watch?v=test_video_id_2"

    @staticmethod
    def _stub_youtube_client(mock_discovery):
        """Wire the canned channel/playlist payloads into the mocked client and return it."""
        client = mock_discovery.build.return_value
        client.channels.return_value.list.return_value.execute.return_value = CHANNELS_RESPONSE
        client.playlistItems.return_value.list.return_value.execute.return_value = PLAYLISTS_RESPONSE
        return client

    @_requires_dependencies
    def test_yt_channel_transcript_reader_init(self):
        # The reader takes no constructor arguments.
        YoutubeChannelTranscriptReader()

    @_requires_dependencies
    def test_yt_channel_transcript_reader_load_data_invalid_args(self):
        reader = YoutubeChannelTranscriptReader()

        expected_message = (
            "missing 2 required positional arguments: 'google_api_key' and 'yt_channel_id'"
        )
        with pytest.raises(TypeError, match=expected_message):
            reader.load_data()

    @_requires_dependencies
    @patch(_LOAD_DATA_TARGET)
    def test_yt_channel_transcript_reader_load_data(self, mock_load_data):
        with patch("googleapiclient.discovery") as mock_discovery:
            reader = YoutubeChannelTranscriptReader()
            youtube = self._stub_youtube_client(mock_discovery)
            mock_load_data.side_effect = dummy_load_pages

            documents = reader.load_data(
                google_api_key="test_key", yt_channel_id="test_channel_id"
            )

            # The client is built once, with the supplied API key.
            mock_discovery.build.assert_called_once_with('youtube', 'v3', developerKey='test_key')

            # Channel lookup: a single request for the channel's contentDetails.
            channels = youtube.channels
            channels.assert_called_once()
            channels.return_value.list.assert_called_once_with(
                part='contentDetails', id='test_channel_id')
            channels.return_value.list.return_value.execute.assert_called_once()

            # Playlist listing: a single page, so exactly one request with no page token.
            playlist_items = youtube.playlistItems
            playlist_items.assert_called_once()
            playlist_items.return_value.list.assert_called_once_with(
                part='contentDetails', playlistId='test_playlist_id', maxResults=50, pageToken=None)
            playlist_items.return_value.list.return_value.execute.assert_called_once()

            # Each video is loaded with its own call.
            assert mock_load_data.call_count == 2
            mock_load_data.assert_has_calls([
                call(ytlinks=[self._VIDEO_URL_1]),
                call(ytlinks=[self._VIDEO_URL_2]),
            ])

            assert len(documents) == 2
            first_document, second_document = documents
            assert first_document.text == "Transcript for https://www.youtube.com/watch?v=test_video_id_1"
            assert second_document.text == "Transcript for https://www.youtube.com/watch?v=test_video_id_2"

    @_requires_dependencies
    @patch(_LOAD_DATA_TARGET)
    def test_yt_channel_transcript_reader_load_data_with_exceptions(self, mock_load_data):
        with patch("googleapiclient.discovery") as mock_discovery:
            reader = YoutubeChannelTranscriptReader()
            self._stub_youtube_client(mock_discovery)
            mock_load_data.side_effect = dummy_load_pages_with_exception

            documents = reader.load_data(
                google_api_key="test_key", yt_channel_id="test_channel_id"
            )

            # Both videos are attempted even though the first one fails.
            assert mock_load_data.call_count == 2
            mock_load_data.assert_has_calls([
                call(ytlinks=[self._VIDEO_URL_1]),
                call(ytlinks=[self._VIDEO_URL_2]),
            ])

            # Only the video whose transcript loaded contributes a document.
            assert len(documents) == 1
            assert documents[0].text == "Transcript for https://www.youtube.com/watch?v=test_video_id_2"
Loading