diff --git a/.gitignore b/.gitignore
index 8f7679b..ff3b4eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,3 +163,6 @@ cython_debug/
 # Generated audio files
 *.wav
+
+# VS Code files
+.vscode
diff --git a/README.md b/README.md
index abc2d3a..0e27c83 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,100 @@
-
+[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+
-# Blueprint Title
+# Document-to-podcast: a Blueprint by Mozilla.ai for generating podcasts from documents using local AI
 
-This blueprint guides you to ...
+This blueprint demonstrates how you can use open-source models & tools to convert input documents into a podcast featuring two speakers.
+It is designed to work on most local setups or with [GitHub Codespaces](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=888426876&skip_quickstart=true&machine=standardLinux32gb), meaning no external API calls or GPU access is required. This makes it more accessible and privacy-friendly by keeping everything local.
 
-![Blueprint Diagram](./images/blueprint-diagram.png)
+### 👉 📖 For more detailed guidance on using this project, please visit our [Docs](https://mozilla-ai.github.io/document-to-podcast/).
 
+## Quick-start
+
+Get started with Document-to-Podcast using one of the two options below: **GitHub Codespaces** for a hassle-free setup or **Local Installation** for running on your own machine.
+
+---
+
+### **Option 1: GitHub Codespaces**
+
+The fastest way to get started. Click the button below to launch the project directly in GitHub Codespaces:
+
+[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=888426876&skip_quickstart=true&machine=standardLinux32gb)
+
+Once the Codespaces environment launches, follow these steps:
+
+1. **Install Dependencies**
+   Inside the Codespaces terminal, run:
+   ```bash
+   pip install -e . --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+   ```
+2. **Run the Demo**
+   Inside the Codespaces terminal, start the Streamlit demo by running:
+   ```bash
+   python -m streamlit run demo/app.py
+   ```
+
+### **Option 2: Local Installation**
+
+1. **Clone the Repository**
+   In a terminal, run:
+   ```bash
+   git clone https://github.com/mozilla-ai/document-to-podcast.git
+   cd document-to-podcast
+   ```
+
+2. **Install Dependencies**
+   Inside the terminal, run:
+   ```bash
+   pip install -e . --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+   ```
+3. **Run the Demo**
+   Inside the terminal, start the Streamlit demo by running:
+   ```bash
+   python -m streamlit run demo/app.py
+   ```
+
+## How it Works
+
+
+1. **Document Upload**
+   Start by uploading a document in a supported format (e.g., PDF, .txt, or .docx).
+
+2. **Document Pre-Processing**
+   The uploaded document is processed to extract and clean the text. This involves:
+   - Extracting readable text from the document.
+   - Removing noise such as URLs, email addresses, and special characters to ensure the text is clean and structured.
+
+3. **Script Generation**
+   The cleaned text is passed to a language model to generate a podcast transcript in the form of a conversation between two speakers.
+   - **Model Loading**: The system selects and loads a pre-trained LLM optimized for running locally, using the llama_cpp library. This enables the model to run efficiently on CPUs, making it more accessible and suitable for local setups.
+   - **Customizable Prompt**: A user-defined "system prompt" guides the LLM in shaping the conversation, specifying tone, content, speaker interaction, and format.
+   - **Output Transcript**: The model generates a podcast script in a structured format, with each speaker's dialogue clearly labeled.
+     Example output:
+     ```json
+     {
+       "Speaker 1": "Welcome to the podcast on AI advancements.",
+       "Speaker 2": "Thank you! So, what's new this week in AI?",
+       "Speaker 1": "Where should I start... Lots has been happening!",
+       ...
+     }
+     ```
+     This step ensures that the podcast script is engaging, relevant, and ready for audio conversion.
+
+4. **Audio Generation**
+   - The generated transcript is converted into audio using a Text-to-Speech (TTS) model.
+   - Each speaker is assigned a distinct voice.
+   - The final output is saved as an audio file in formats like MP3 or WAV.
 
 ## Pre-requisites
 
 - **System requirements**:
   - OS: Windows, macOS, or Linux
   - Python 3.10 or higher
-  - Minimum RAM: 4 GB
-  - Disk space: 1 GB minimum
+  - Minimum RAM: 16 GB
+  - Disk space: 32 GB minimum
 
 - **Dependencies**:
-  - Dependencies listed in `requirements.txt`
-
-## Installation
-
----
-
-## Quick-start
-
----
+  - Dependencies listed in `pyproject.toml`
 
 ## License
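The four steps in the README above map onto the package's public modules (the same ones the demo imports; see the `demo/app.py` diff below). The following is a rough sketch of how they could fit together outside Streamlit. The function signatures here are assumptions inferred from the demo code, so treat the API reference as authoritative:

```python
from pathlib import Path

from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
from document_to_podcast.inference.model_loaders import (
    load_llama_cpp_model,
    load_parler_tts_model_and_tokenizer,
)
from document_to_podcast.inference.text_to_text import text_to_text
from document_to_podcast.inference.text_to_speech import text_to_speech

# Steps 1-2: load and clean the document. DATA_LOADERS and DATA_CLEANERS are
# assumed to be dicts keyed by file suffix (".pdf", ".txt", ".docx", ...).
path = Path("paper.pdf")
raw_text = DATA_LOADERS[path.suffix](str(path))
clean_text = DATA_CLEANERS[path.suffix](raw_text)

# Step 3: generate the script with a local GGUF model via llama-cpp-python.
text_model = load_llama_cpp_model(
    model_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
)
system_prompt = "Convert this text into a two-speaker podcast script."  # shortened stand-in for PODCAST_PROMPT
script = text_to_text(clean_text, text_model, system_prompt=system_prompt)  # signature is an assumption

# Step 4: synthesize one line of dialogue with a Parler TTS voice.
speech_model, speech_tokenizer = load_parler_tts_model_and_tokenizer(
    "parler-tts/parler-tts-mini-expresso", "cpu"
)
waveform = text_to_speech(  # argument order assumed; the demo also passes a speaker description
    "Welcome to the podcast on AI advancements.",
    speech_model,
    speech_tokenizer,
    "A calm and deep voice, speaking with authority and warmth.",
)
```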
diff --git a/demo/app.py b/demo/app.py
index ef0550f..4e67464 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -3,13 +3,13 @@
 import streamlit as st
 
-from opennotebookllm.preprocessing import DATA_LOADERS, DATA_CLEANERS
-from opennotebookllm.inference.model_loaders import (
+from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
+from document_to_podcast.inference.model_loaders import (
     load_llama_cpp_model,
     load_parler_tts_model_and_tokenizer,
 )
-from opennotebookllm.inference.text_to_speech import _speech_generation_parler
-from opennotebookllm.inference.text_to_text import text_to_text_stream
+from document_to_podcast.inference.text_to_speech import text_to_speech
+from document_to_podcast.inference.text_to_text import text_to_text_stream
 
 PODCAST_PROMPT = """
@@ -112,7 +112,7 @@ def load_text_to_speech_model_and_tokenizer():
             st.write(text)
             speaker_id = re.search(r"Speaker (\d+)", text).group(1)
             with st.spinner("Generating Audio..."):
-                speech = _speech_generation_parler(
+                speech = text_to_speech(
                     text.split(f'"Speaker {speaker_id}":')[-1],
                     speech_model,
                     speech_tokenizer,
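The second hunk above shows how the demo routes each generated script line to the right voice: it extracts the speaker id with a regex, then strips the `"Speaker N":` label so only the dialogue reaches the TTS model. A standalone illustration of that parsing step (the regex and `split` call are copied from `demo/app.py`; the sample line is made up):

```python
import re

# A single line of the generated transcript, as the demo receives it.
script_line = '"Speaker 1": "Welcome to the podcast on AI advancements."'

# Extract the speaker id ("1") so a distinct voice can be assigned to it.
speaker_id = re.search(r"Speaker (\d+)", script_line).group(1)

# Keep only the dialogue itself; the label is not sent to the TTS model.
dialogue = script_line.split(f'"Speaker {speaker_id}":')[-1].strip()

print(speaker_id, dialogue)  # 1 "Welcome to the podcast on AI advancements."
```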
diff --git a/docs/api.md b/docs/api.md
index a7c1c65..c6da42c 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -1,7 +1,7 @@
 # API Reference
 
-::: opennotebookllm.preprocessing.data_cleaners
+::: document_to_podcast.preprocessing.data_cleaners
 
-::: opennotebookllm.inference.model_loaders
+::: document_to_podcast.inference.model_loaders
 
-::: opennotebookllm.inference.text_to_text
+::: document_to_podcast.inference.text_to_text
diff --git a/docs/assets/custom.css b/docs/assets/custom.css
new file mode 100644
index 0000000..c3dc2ff
--- /dev/null
+++ b/docs/assets/custom.css
@@ -0,0 +1,7 @@
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+
+:root {
+  --md-default-font: "Inter", sans-serif;
+  --md-code-font: "Fira Code", monospace;
+  --md-primary-font: "Inter", sans-serif;
+}
diff --git a/docs/customization.md b/docs/customization.md
new file mode 100644
index 0000000..92b68b3
--- /dev/null
+++ b/docs/customization.md
@@ -0,0 +1,89 @@
+# 🎨 **Customization Guide**
+
+The Document-to-Podcast Blueprint is designed to be flexible and easily adaptable to your specific needs. This guide will walk you through some key areas you can customize to make the Blueprint your own.
+
+---
+
+## 🧠 **Changing the Text-to-Text Model**
+You can swap the language model used for generating podcast scripts to suit your needs, such as using a smaller model for faster processing or a larger one for higher-quality output.
+
+Customizing the app:
+
+1. Open the `app.py` file.
+2. Locate the `load_text_to_text_model` function.
+3. Replace the `model_id` with the ID of your desired model from a supported repository (e.g., Hugging Face). Note: the model repository must be in GGUF format, for example: `Qwen/Qwen2.5-1.5B-Instruct-GGUF`.
+
+Example:
+
+```python
+@st.cache_resource
+def load_text_to_text_model():
+    return load_llama_cpp_model(
+        model_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
+    )
+```
+
+## 📝 **Modifying the Text Generation Prompt**
+The system prompt defines the structure and tone of the generated script. Customizing it allows you to generate conversations that align with your project's needs.
+
+Customizing the app:
+
+1. Open the `app.py` file.
+2. Locate the `PODCAST_PROMPT` variable.
+3. Edit the instructions to suit your desired conversation style.
+
+Example:
+
+```python
+PODCAST_PROMPT = """
+You are a radio show scriptwriter generating lively and humorous dialogues.
+Speaker 1: A comedian who is interested in learning new things.
+Speaker 2: A scientist explaining concepts in a fun way.
+"""
+```
+
+## 🎙️ **Customizing Speaker Descriptions**
+Adjusting the speaker profiles allows you to create distinct and engaging voices for your podcast.
+
+Customizing the app:
+
+1. Open the `app.py` file.
+2. Locate the `SPEAKER_DESCRIPTIONS` dictionary.
+3. Update the descriptions to define new voice characteristics for each speaker.
+
+Example:
+
+```python
+SPEAKER_DESCRIPTIONS = {
+    "1": "A cheerful and animated voice with a fast-paced delivery.",
+    "2": "A calm and deep voice, speaking with authority and warmth."
+}
+```
+
+## 🧠 **Changing the Text-to-Speech Model**
+You can use a different TTS model to achieve specific voice styles or improve performance.
+
+Customizing the app:
+
+1. Open the `app.py` file.
+2. Locate the `load_text_to_speech_model_and_tokenizer` function.
+3. Replace the `model_id` with your preferred TTS model.
+
+Example:
+
+```python
+@st.cache_resource
+def load_text_to_speech_model_and_tokenizer():
+    return load_parler_tts_model_and_tokenizer(
+        "parler-tts/parler-tts-mini-expresso", "cpu"
+    )
+```
+
+## 💡 **Other Customization Ideas**
+
+- Add Multiple Speakers: Modify `script_to_audio.py` to include additional speakers in your podcast (see the sketch below).
+
+## 🤝 **Contributing to the Blueprint**
+
+Want to help improve or extend this Blueprint? Check out the **[Future Features & Contributions Guide](../future-features-contributions)** to see how you can contribute your ideas, code, or feedback to make this Blueprint even better!
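For the "Add Multiple Speakers" idea in the customization guide above, here is a minimal sketch of what the change could look like, assuming the app keeps its `PODCAST_PROMPT` and `SPEAKER_DESCRIPTIONS` pattern. The third speaker's role and voice description are placeholders:

```python
# Introduce a third participant in the system prompt...
PODCAST_PROMPT = """
You are a podcast scriptwriter generating a lively three-way conversation.
Speaker 1: The host, guiding the discussion.
Speaker 2: An expert explaining the topic.
Speaker 3: A curious listener asking clarifying questions.
"""

# ...and give the new speaker a distinct voice description, so the
# "Speaker N" label parsed from each script line maps to a third voice.
SPEAKER_DESCRIPTIONS = {
    "1": "A cheerful and animated voice with a fast-paced delivery.",
    "2": "A calm and deep voice, speaking with authority and warmth.",
    "3": "A bright, youthful voice with an inquisitive tone.",
}
```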
diff --git a/docs/future-features-contributions.md b/docs/future-features-contributions.md
new file mode 100644
index 0000000..9223ae7
--- /dev/null
+++ b/docs/future-features-contributions.md
@@ -0,0 +1,30 @@
+# 🚀 **Future Features & Contributions**
+
+The Document-to-Podcast Blueprint is an evolving project designed to grow with the help of the open-source community. Whether you're an experienced developer or just starting out, there are many ways you can contribute and help shape the future of this tool.
+
+---
+
+## 🛠️ **This Page is Evolving**
+As the community grows, we'll use this space to highlight contributions, showcase new ideas, and share guidance on expanding the Blueprint ecosystem.
+
+We have some ideas for how this Blueprint can be extended and improved, and we will be sharing these ideas and requests for contributions shortly.
+
+---
+
+## 🌟 **How You Can Contribute**
+
+### 💡 **Share Your Ideas**
+Got a vision for how this Blueprint could be improved? Share your suggestions through [GitHub Discussions](https://github.com/mozilla-ai/document-to-podcast/discussions). Your insights can help inspire new directions for the project.
+
+### 🛠️ **Enhance the Code**
+Dive into the codebase and contribute enhancements, optimizations, or bug fixes. Whether it's a small tweak or a big feature, every contribution helps! Start by checking our Contribution Guide (coming soon).
+
+### 🌍 **Build New Blueprints**
+This project is part of a larger initiative to create a collection of reusable starter code solutions that use open-source AI tools. If you're inspired to create your own Blueprint, we'd love to see it!
+
+---
+
+## 🤝 **Get Involved**
+- Visit our [GitHub Discussions](https://github.com/mozilla-ai/document-to-podcast/discussions) to explore ongoing conversations and share your thoughts.
+
+Your contributions help make this Blueprint better for everyone. Thank you for being part of the journey! 🎉
diff --git a/docs/getting-started.md b/docs/getting-started.md
new file mode 100644
index 0000000..08a89b9
--- /dev/null
+++ b/docs/getting-started.md
@@ -0,0 +1,49 @@
+Get started with Document-to-Podcast using one of the two options below: **GitHub Codespaces** for a hassle-free setup or **Local Installation** for running on your own machine.
+
+---
+
+### ☁️ **Option 1: GitHub Codespaces**
+
+The fastest way to get started. Click the button below to launch the project directly in GitHub Codespaces:
+
+[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=888426876&skip_quickstart=true&machine=standardLinux32gb)
+
+Once the Codespaces environment launches, follow these steps:
+
+1. **Install Dependencies**
+   Inside the Codespaces terminal, run:
+```bash
+pip install -e . --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+```
+2. **Run the Demo**
+   Inside the Codespaces terminal, start the Streamlit demo by running:
+```bash
+python -m streamlit run demo/app.py
+```
+
+### 💻 **Option 2: Local Installation**
+
+1. **Clone the Repository**
+   In a terminal, run:
+
+```bash
+git clone https://github.com/mozilla-ai/document-to-podcast.git
+cd document-to-podcast
+```
+
+2. **Install Dependencies**
+   Inside the terminal, run:
+
+```bash
+pip install -e . --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+```
+
+3. **Run the Demo**
+   Inside the terminal, start the Streamlit demo by running:
+
+```bash
+python -m streamlit run demo/app.py
+```
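The demo plays audio in the browser. If you want to write a generated clip to disk instead, the sketch below uses the upstream `parler_tts` package and `soundfile` directly, bypassing the Blueprint's own helpers. The model id and voice description are only examples:

```python
import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-expresso")
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-expresso")

description = "A calm and deep voice, speaking with authority and warmth."
text = "Welcome to the podcast on AI advancements."

# The voice description conditions the model; the prompt is the text to speak.
input_ids = tokenizer(description, return_tensors="pt").input_ids
prompt_ids = tokenizer(text, return_tensors="pt").input_ids

with torch.no_grad():
    audio = model.generate(input_ids=input_ids, prompt_input_ids=prompt_ids)

# Write a mono WAV at the model's native sampling rate.
sf.write("podcast_line.wav", audio.cpu().numpy().squeeze(), model.config.sampling_rate)
```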
"") @@ -67,7 +67,7 @@ def clean_markdown(text: str) -> str: This function removes: - markdown images - In addition, it calls [clean_with_regex][opennotebookllm.preprocessing.data_cleaners.clean_with_regex]. + In addition, it calls [clean_with_regex][document_to_podcast.preprocessing.data_cleaners.clean_with_regex]. Examples: >>> clean_markdown('# Title with image ![alt text](image.jpg "Image Title")') diff --git a/src/opennotebookllm/preprocessing/data_loaders.py b/src/document_to_podcast/preprocessing/data_loaders.py similarity index 100% rename from src/opennotebookllm/preprocessing/data_loaders.py rename to src/document_to_podcast/preprocessing/data_loaders.py diff --git a/tests/integration/test_data_load_and_clean.py b/tests/integration/test_data_load_and_clean.py index 9322753..eecae93 100644 --- a/tests/integration/test_data_load_and_clean.py +++ b/tests/integration/test_data_load_and_clean.py @@ -1,9 +1,9 @@ -from opennotebookllm.preprocessing.data_cleaners import ( +from document_to_podcast.preprocessing.data_cleaners import ( clean_html, clean_with_regex, clean_markdown, ) -from opennotebookllm.preprocessing.data_loaders import load_pdf, load_txt +from document_to_podcast.preprocessing.data_loaders import load_pdf, load_txt def test_load_and_clean_pdf(example_data): diff --git a/tests/integration/test_model_load_and_inference.py b/tests/integration/test_model_load_and_inference.py index ecdba1e..50a523a 100644 --- a/tests/integration/test_model_load_and_inference.py +++ b/tests/integration/test_model_load_and_inference.py @@ -3,8 +3,8 @@ import pytest -from opennotebookllm.inference.model_loaders import load_llama_cpp_model -from opennotebookllm.inference.text_to_text import text_to_text, text_to_text_stream +from document_to_podcast.inference.model_loaders import load_llama_cpp_model +from document_to_podcast.inference.text_to_text import text_to_text, text_to_text_stream def test_model_load_and_inference_text_to_text(): diff --git a/tests/unit/inference/test_model_loaders.py b/tests/unit/inference/test_model_loaders.py index ff1f5cd..950b62d 100644 --- a/tests/unit/inference/test_model_loaders.py +++ b/tests/unit/inference/test_model_loaders.py @@ -1,9 +1,9 @@ from llama_cpp import Llama -from opennotebookllm.inference.model_loaders import load_llama_cpp_model +from document_to_podcast.inference.model_loaders import load_llama_cpp_model from transformers import PreTrainedModel, PreTrainedTokenizerBase -from opennotebookllm.inference.model_loaders import ( +from document_to_podcast.inference.model_loaders import ( load_parler_tts_model_and_tokenizer, ) diff --git a/tests/unit/preprocessing/test_data_cleaners.py b/tests/unit/preprocessing/test_data_cleaners.py index ce11020..1aecb2f 100644 --- a/tests/unit/preprocessing/test_data_cleaners.py +++ b/tests/unit/preprocessing/test_data_cleaners.py @@ -1,4 +1,4 @@ -from opennotebookllm.preprocessing.data_cleaners import ( +from document_to_podcast.preprocessing.data_cleaners import ( clean_html, clean_with_regex, clean_markdown, diff --git a/tests/unit/preprocessing/test_data_loaders.py b/tests/unit/preprocessing/test_data_loaders.py index 6c3bada..1e445a8 100644 --- a/tests/unit/preprocessing/test_data_loaders.py +++ b/tests/unit/preprocessing/test_data_loaders.py @@ -1,4 +1,4 @@ -from opennotebookllm.preprocessing.data_loaders import load_pdf, load_txt, load_docx +from document_to_podcast.preprocessing.data_loaders import load_pdf, load_txt, load_docx def test_load_pdf(example_data):