From 5791a951e4859dffe9002f0794dcab2f89927f5d Mon Sep 17 00:00:00 2001 From: David de la Iglesia Castro Date: Wed, 4 Dec 2024 12:40:41 +0100 Subject: [PATCH] fix(docs): Update broken links. (#36) * fix(docs): Update broken links. * Add search function in docs * Add basic docstring to config.py * Add docstring to save_waveform_as_file * Add missing references and fix a typo --------- Co-authored-by: Kostis-S-Z --- docs/api.md | 4 ++++ docs/customization.md | 2 +- docs/index.md | 2 +- docs/step-by-step-guide.md | 18 +++++++++--------- mkdocs.yml | 1 + .../podcast_maker/config.py | 9 +++++++++ .../podcast_maker/script_to_audio.py | 9 +++++++++ 7 files changed, 34 insertions(+), 11 deletions(-) diff --git a/docs/api.md b/docs/api.md index 2995f23..c781ee2 100644 --- a/docs/api.md +++ b/docs/api.md @@ -7,3 +7,7 @@ ::: document_to_podcast.inference.text_to_text ::: document_to_podcast.inference.text_to_speech + +::: document_to_podcast.podcast_maker.script_to_audio + +::: document_to_podcast.podcast_maker.config diff --git a/docs/customization.md b/docs/customization.md index 92b68b3..8cab6e0 100644 --- a/docs/customization.md +++ b/docs/customization.md @@ -86,4 +86,4 @@ def load_text_to_speech_model_and_tokenizer(): ## 🤝 **Contributing to the Blueprint** -Want to help improve or extend this Blueprint? Check out the **[Future Features & Contributions Guide](../future-features-contributions)** to see how you can contribute your ideas, code, or feedback to make this Blueprint even better! +Want to help improve or extend this Blueprint? Check out the **[Future Features & Contributions Guide](future-features-contributions.md)** to see how you can contribute your ideas, code, or feedback to make this Blueprint even better! diff --git a/docs/index.md b/docs/index.md index b5345e2..e0ea982 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,7 +1,7 @@ # **Document-to-Podcast Blueprint**
- Project Logo + Project Logo
Blueprints empower developers to easily integrate AI capabilities into their projects using open-source models and tools. diff --git a/docs/step-by-step-guide.md b/docs/step-by-step-guide.md index 1fc3bbe..a84b10a 100644 --- a/docs/step-by-step-guide.md +++ b/docs/step-by-step-guide.md @@ -1,6 +1,6 @@ # **Step-by-Step Guide: How the Document-to-Podcast Blueprint Works** -Transforming static documents into engaging podcast episodes involves a integration of pre-processing, LLM-powered transcript generation, and text-to-speech generation. Here's how it all works under the hood: +Transforming static documents into engaging podcast episodes involves an integration of pre-processing, LLM-powered transcript generation, and text-to-speech generation. Here's how it all works under the hood: --- @@ -33,7 +33,7 @@ Cleaner input data ensures that the model works with reliable and consistent inf ### ⚙️ **Key Components in this Doc Pre-Processing** **1 - File Loading** - - Uses functions defined in `data_loaders.py` + - Uses functions defined in [`data_loaders.py`](api.md/#document_to_podcast.preprocessing.data_loaders) - Supports `.html`, `.pdf`, `.txt`, and `.docx` formats. @@ -41,7 +41,7 @@ Cleaner input data ensures that the model works with reliable and consistent inf **2 - Text Cleaning** - - Uses functions defined in [`data_cleaners.py`](../api/#document_to_podcast.inference.data_cleaners) + - Uses functions defined in [`data_cleaners.py`](api.md/#document_to_podcast.preprocessing.data_cleaners) - Removes unwanted elements like URLs, email addresses, and special characters using Python's `re` library, which leverages **Regular Expressions** (regex) to identify and manipulate specific patterns in text. 
@@ -55,7 +55,7 @@ In this step, the pre-processed text is transformed into a conversational podcas **1 - Model Loading** - - The [`model_loader.py`](../api/#document_to_podcast.inference.model_loaders) script is responsible for loading GGUF-type models using the `llama_cpp` library. + - The [`model_loader.py`](api.md/#document_to_podcast.inference.model_loaders) script is responsible for loading GGUF-type models using the `llama_cpp` library. - The function `load_llama_cpp_model` takes a model ID in the format `{org}/{repo}/{filename}` and loads the specified model. @@ -63,7 +63,7 @@ In this step, the pre-processed text is transformed into a conversational podcas **2 - Text-to-Text Generation** - - The [`text_to_text.py`](../api/#document_to_podcast.inference.text_to_text) script manages the interaction with the language model, converting input text into a structured conversational podcast script. + - The [`text_to_text.py`](api.md/#document_to_podcast.inference.text_to_text) script manages the interaction with the language model, converting input text into a structured conversational podcast script. - It uses the `chat_completion` function to process the input text and a customizable system prompt, guiding the language to generate a text output (e.g. a coherent podcast script between speakers). @@ -80,7 +80,7 @@ In this final step, the generated podcast transcript is brought to life as an au **1 - Text-to-Speech Audio Generation** - - The `text_to_speech.py` script converts text into audio using a specified TTS model and tokenizer. + - The [`text_to_speech.py`](api.md/#document_to_podcast.inference.text_to_speech) script converts text into audio using a specified TTS model and tokenizer. - A **speaker profile** defines the voice characteristics (e.g., tone, speed, clarity) for each speaker. 
@@ -88,7 +88,7 @@ In this final step, the generated podcast transcript is brought to life as an au **2 - Parsing and Combining Voices** -- The `script_to_audio.py` script ensures each speaker’s dialogue is spoken in their unique voice. +- The [`script_to_audio.py`](api.md/#document_to_podcast.podcast_maker.script_to_audio) script ensures each speaker’s dialogue is spoken in their unique voice. - The function `parse_script_to_waveform` splits the dialogue script by speakers and uses `text_to_speech` to generate audio for each speaker, stitching them together into a full podcast. @@ -145,8 +145,8 @@ This demo uses [Streamlit](https://streamlit.io/), an open-source Python framewo ## 🎨 **Customizing the Blueprint** -To better understand how you can tailor this Blueprint to suit your specific needs, please visit the **[Customization Guide](../customization)**. +To better understand how you can tailor this Blueprint to suit your specific needs, please visit the **[Customization Guide](customization.md)**. ## 🤝 **Contributing to the Blueprint** -Want to help improve or extend this Blueprint? Check out the **[Future Features & Contributions Guide](../future-features-contributions)** to see how you can contribute your ideas, code, or feedback to make this Blueprint even better! +Want to help improve or extend this Blueprint? Check out the **[Future Features & Contributions Guide](future-features-contributions.md)** to see how you can contribute your ideas, code, or feedback to make this Blueprint even better! 
diff --git a/mkdocs.yml b/mkdocs.yml index 3d3940c..78e501d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -34,6 +34,7 @@ markdown_extensions: - pymdownx.superfences plugins: +- search - mkdocstrings: handlers: python: diff --git a/src/document_to_podcast/podcast_maker/config.py b/src/document_to_podcast/podcast_maker/config.py index 47cd1b7..50e737a 100644 --- a/src/document_to_podcast/podcast_maker/config.py +++ b/src/document_to_podcast/podcast_maker/config.py @@ -4,6 +4,10 @@ class SpeakerConfig(BaseModel): + """ + Pydantic model that stores configuration of an individual speaker for the TTS model. + """ + model_config = ConfigDict(arbitrary_types_allowed=True) model: PreTrainedModel @@ -16,5 +20,10 @@ class SpeakerConfig(BaseModel): class PodcastConfig(BaseModel): + """ + Pydantic model that stores configuration of all the speakers for the TTS model. This allows different speakers to + use different models and configurations. + """ + speakers: Dict[str, SpeakerConfig] sampling_rate: int = 44_100 diff --git a/src/document_to_podcast/podcast_maker/script_to_audio.py b/src/document_to_podcast/podcast_maker/script_to_audio.py index 403ca00..ef63e94 100644 --- a/src/document_to_podcast/podcast_maker/script_to_audio.py +++ b/src/document_to_podcast/podcast_maker/script_to_audio.py @@ -40,6 +40,15 @@ def parse_script_to_waveform(script: str, podcast_config: PodcastConfig): def save_waveform_as_file( waveform: np.ndarray, sampling_rate: int, filename: str ) -> None: + """ + Save the output of the TTS (a numpy waveform) to a .wav file using the soundfile library. + + Args: + waveform: 2D numpy array of a waveform + sampling_rate: Usually 44100 Hz, but check the specifications of the TTS model you are using. + filename: the destination filename to save the audio + + """ sf.write(filename, waveform, sampling_rate)