Multilingual support #72

Status: Open. Wants to merge 24 commits into mozilla-ai:main from multilingual-support.

Commits (24):
- 286a93c: [WIP] Add bark and parler multi support (Kostis-S-Z, Dec 17, 2024)
- 14b69bf: Add config files for other models to easily test across models (Kostis-S-Z, Dec 17, 2024)
- 20ab8e9: Use model loading wrapper function for download_models.py (Kostis-S-Z, Dec 17, 2024)
- ee38e10: Make sure transformers>4.31.0 (required for bark model) (Kostis-S-Z, Dec 17, 2024)
- 890c684: Add parler dependency (Kostis-S-Z, Dec 17, 2024)
- 8cc7b0d: Use TTSModelWrapper for demo code (Kostis-S-Z, Dec 17, 2024)
- dcbb254: Use TTSModelWrapper for cli (Kostis-S-Z, Dec 17, 2024)
- b0d40bc: Add outetts_language attribute (Kostis-S-Z, Dec 17, 2024)
- 5e47b1e: Add TTSModelWrapper (Kostis-S-Z, Dec 17, 2024)
- 945c44f: Update text_to_speech.py (Kostis-S-Z, Dec 17, 2024)
- 4565fb8: Pass model-specific variables as **kwargs (Kostis-S-Z, Dec 18, 2024)
- 01d0e7a: Rename TTSModelWrapper to TTSInterface (Kostis-S-Z, Dec 18, 2024)
- 5af3e72: Update language argument to kwargs (Kostis-S-Z, Dec 18, 2024)
- e3a3f17: Remove parler from dependencies (Kostis-S-Z, Dec 18, 2024)
- a918574: Merge branch 'mozilla-ai:main' into multilingual-support (Kostis-S-Z, Dec 18, 2024)
- fb814fa: Separate inference from TTSModel (Kostis-S-Z, Dec 19, 2024)
- 672c0e0: Make sure config model is properly registered (Kostis-S-Z, Dec 19, 2024)
- 28b02b8: Decouple loading & inference of TTS model (Kostis-S-Z, Dec 19, 2024)
- b489e0d: Decouple loading & inference of TTS model (Kostis-S-Z, Dec 19, 2024)
- dc89668: Enable user to exit podcast generation gracefully (Kostis-S-Z, Dec 19, 2024)
- 0d143eb: Add Q2 Oute version to TTS_LOADERS (Kostis-S-Z, Dec 19, 2024)
- e9ca498: Add comment for support in TTS_INFERENCE (Kostis-S-Z, Dec 19, 2024)
- 47112a0: Update test_model_loaders.py (Kostis-S-Z, Dec 19, 2024)
- ec0fe5a: Update test_text_to_speech.py (Kostis-S-Z, Dec 19, 2024)
10 changes: 5 additions & 5 deletions demo/app.py
@@ -5,13 +5,13 @@
import soundfile as sf
import streamlit as st

from document_to_podcast.inference.text_to_speech import text_to_speech
from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
from document_to_podcast.inference.model_loaders import (
load_llama_cpp_model,
load_outetts_model,
load_tts_model,
)
from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
from document_to_podcast.inference.text_to_speech import text_to_speech
from document_to_podcast.inference.text_to_text import text_to_text_stream


@@ -24,7 +24,7 @@ def load_text_to_text_model():

@st.cache_resource
def load_text_to_speech_model():
return load_outetts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")


script = "script"
@@ -153,7 +153,7 @@ def gen_button_clicked():
speech_model,
voice_profile,
)
st.audio(speech, sample_rate=speech_model.audio_codec.sr)
st.audio(speech, sample_rate=speech_model.sample_rate)

st.session_state.audio.append(speech)
text = ""
@@ -164,7 +164,7 @@ def gen_button_clicked():
sf.write(
"podcast.wav",
st.session_state.audio,
samplerate=speech_model.audio_codec.sr,
samplerate=speech_model.sample_rate,
)
st.markdown("Podcast saved to disk!")

4 changes: 2 additions & 2 deletions demo/download_models.py
@@ -4,10 +4,10 @@

from document_to_podcast.inference.model_loaders import (
load_llama_cpp_model,
load_outetts_model,
load_tts_model,
)

load_llama_cpp_model(
"allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
)
load_outetts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
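The hunks above replace the per-family loaders (`load_outetts_model`, `load_parler_tts_model_and_tokenizer`) with a single `load_tts_model` entry point. A minimal sketch of how such a dispatcher could be structured; the registry contents, the `TTSModel` fields, and the loader body here are illustrative assumptions, not the PR's exact code:

```python
from dataclasses import dataclass, field
from typing import Any, Callable


@dataclass
class TTSModel:
    """Uniform wrapper so callers never touch model-specific attributes."""
    model_id: str
    model: Any
    sample_rate: int
    custom_args: dict = field(default_factory=dict)


def _load_oute(model_id: str, **kwargs) -> TTSModel:
    # Stand-in for the real OuteTTS loading logic; 24 kHz is assumed.
    return TTSModel(model_id=model_id, model=object(), sample_rate=24_000)


# Maps model ids to loader callables; supporting a new model only
# requires registering a new entry, not editing call sites.
TTS_LOADERS: dict[str, Callable[..., TTSModel]] = {
    "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf": _load_oute,
}


def load_tts_model(model_id: str, **kwargs) -> TTSModel:
    return TTS_LOADERS[model_id](model_id, **kwargs)
```

Callers then depend only on the wrapper's uniform attributes (e.g. `speech_model.sample_rate`), which is exactly what the demo/app.py and cli.py hunks switch to.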
31 changes: 31 additions & 0 deletions example_data/config_bark.yaml
Contributor comment:

Not sure if we want to add multiple configs here; I guess this is a discussion to have with the developer hub. All these seem like potential "Use Case" / "Customization" examples.
@@ -0,0 +1,31 @@
input_file: "example_data/a.md"
output_folder: "example_data/bark/"
text_to_text_model: "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
text_to_speech_model: "suno/bark"
text_to_text_prompt: |
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format.
The script features the following speakers:
{SPEAKERS}
Instructions:
- Write dynamic, easy-to-follow dialogue.
- Include natural interruptions and interjections.
- Avoid repetitive phrasing between speakers.
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi! I'm excited to hear about this. Can you explain...",
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, that's cool! But how does..."
}
speakers:
- id: 1
name: Laura
description: The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
voice_profile: "v2/en_speaker_0"

- id: 2
name: Daniel
description: The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.
voice_profile: "v2/en_speaker_1"
outetts_language: "en" # Supported languages in version 0.2-500M: en, zh, ja, ko.
Kostis-S-Z marked this conversation as resolved.
31 changes: 31 additions & 0 deletions example_data/config_parler.yaml
@@ -0,0 +1,31 @@
input_file: "example_data/a.md"
output_folder: "example_data/parler/"
text_to_text_model: "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
text_to_speech_model: "parler-tts/parler-tts-mini-v1.1"
text_to_text_prompt: |
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format.
The script features the following speakers:
{SPEAKERS}
Instructions:
- Write dynamic, easy-to-follow dialogue.
- Include natural interruptions and interjections.
- Avoid repetitive phrasing between speakers.
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi! I'm excited to hear about this. Can you explain...",
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, that's cool! But how does..."
}
speakers:
- id: 1
name: Laura
description: The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
voice_profile: Laura's voice is calm and slow in delivery, with no background noise.

- id: 2
name: Daniel
description: The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.
voice_profile: Daniel's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise.
outetts_language: "en" # Supported languages in version 0.2-500M: en, zh, ja, ko.
31 changes: 31 additions & 0 deletions example_data/config_parler_multi.yaml
@@ -0,0 +1,31 @@
input_file: "example_data/a.md"
output_folder: "example_data/parler_multi/"
text_to_text_model: "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
text_to_speech_model: "parler-tts/parler-tts-mini-multilingual-v1.1"
text_to_text_prompt: |
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format.
The script features the following speakers:
{SPEAKERS}
Instructions:
- Write dynamic, easy-to-follow dialogue.
- Include natural interruptions and interjections.
- Avoid repetitive phrasing between speakers.
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi! I'm excited to hear about this. Can you explain...",
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, that's cool! But how does..."
}
speakers:
- id: 1
name: Laura
description: The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
voice_profile: Laura's voice is calm and slow in delivery, with no background noise.

- id: 2
name: Daniel
description: The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.
voice_profile: Daniel's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise.
outetts_language: "en" # Supported languages in version 0.2-500M: en, zh, ja, ko.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -18,6 +18,7 @@ dependencies = [
"pydantic",
"PyPDF2[crypto]",
"python-docx",
"transformers>4.31.0",
"streamlit",
]

74 changes: 36 additions & 38 deletions src/document_to_podcast/cli.py
@@ -12,15 +12,14 @@
Speaker,
DEFAULT_PROMPT,
DEFAULT_SPEAKERS,
SUPPORTED_TTS_MODELS,
TTS_LOADERS,
)
from document_to_podcast.inference.model_loaders import (
load_llama_cpp_model,
load_outetts_model,
load_parler_tts_model_and_tokenizer,
load_tts_model,
)
from document_to_podcast.inference.text_to_text import text_to_text_stream
from document_to_podcast.inference.text_to_speech import text_to_speech
from document_to_podcast.inference.text_to_text import text_to_text_stream
from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS


@@ -30,8 +29,9 @@ def document_to_podcast(
output_folder: str | None = None,
text_to_text_model: str = "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
text_to_text_prompt: str = DEFAULT_PROMPT,
text_to_speech_model: SUPPORTED_TTS_MODELS = "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
text_to_speech_model: TTS_LOADERS = "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
speakers: list[Speaker] | None = None,
outetts_language: str = "en", # Only applicable to OuteTTS models
from_config: str | None = None,
):
"""
@@ -70,8 +70,10 @@
speakers (list[Speaker] | None, optional): The speakers for the podcast.
Defaults to DEFAULT_SPEAKERS.

from_config (str, optional): The path to the config file. Defaults to None.
outetts_language (str): For OuteTTS models we need to specify which language to use.
Supported languages in 0.2-500M: en, zh, ja, ko. More info: https://github.com/edwko/OuteTTS

from_config (str, optional): The path to the config file. Defaults to None.

If provided, all other arguments will be ignored.
"""
@@ -86,6 +88,7 @@
text_to_text_prompt=text_to_text_prompt,
text_to_speech_model=text_to_speech_model,
speakers=[Speaker.model_validate(speaker) for speaker in speakers],
outetts_language=outetts_language,
)

output_folder = Path(config.output_folder)
@@ -106,15 +109,9 @@
text_model = load_llama_cpp_model(model_id=config.text_to_text_model)

logger.info(f"Loading {config.text_to_speech_model}")
if "oute" in config.text_to_speech_model.lower():
speech_model = load_outetts_model(model_id=config.text_to_speech_model)
speech_tokenizer = None
sample_rate = speech_model.audio_codec.sr
else:
speech_model, speech_tokenizer = load_parler_tts_model_and_tokenizer(
model_id=config.text_to_speech_model
)
sample_rate = speech_model.config.sampling_rate
speech_model = load_tts_model(
model_id=config.text_to_speech_model, outetts_language=outetts_language
)

# ~4 characters per token is considered a reasonable default.
max_characters = text_model.n_ctx() * 4
@@ -133,33 +130,34 @@
system_prompt = system_prompt.replace(
"{SPEAKERS}", "\n".join(str(speaker) for speaker in config.speakers)
)
for chunk in text_to_text_stream(
clean_text, text_model, system_prompt=system_prompt
):
text += chunk
podcast_script += chunk
if text.endswith("\n") and "Speaker" in text:
logger.debug(text)
speaker_id = re.search(r"Speaker (\d+)", text).group(1)
voice_profile = next(
speaker.voice_profile
for speaker in config.speakers
if speaker.id == int(speaker_id)
)
speech = text_to_speech(
text.split(f'"Speaker {speaker_id}":')[-1],
speech_model,
voice_profile,
tokenizer=speech_tokenizer, # Applicable only for parler models
)
podcast_audio.append(speech)
text = ""

try:
for chunk in text_to_text_stream(
clean_text, text_model, system_prompt=system_prompt
):
text += chunk
podcast_script += chunk
if text.endswith("\n") and "Speaker" in text:
logger.debug(text)
speaker_id = re.search(r"Speaker (\d+)", text).group(1)
voice_profile = next(
speaker.voice_profile
for speaker in config.speakers
if speaker.id == int(speaker_id)
)
speech = text_to_speech(
text.split(f'"Speaker {speaker_id}":')[-1],
speech_model,
voice_profile,
)
podcast_audio.append(speech)
text = ""
except KeyboardInterrupt:
logger.warning("Podcast generation stopped by user.")
logger.info("Saving Podcast...")
sf.write(
str(output_folder / "podcast.wav"),
np.concatenate(podcast_audio),
samplerate=sample_rate,
samplerate=speech_model.sample_rate,
)
(output_folder / "podcast.txt").write_text(podcast_script)
logger.success("Done!")
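The try/except added in the cli.py hunk above (commit dc89668, "Enable user to exit podcast generation gracefully") lets Ctrl-C stop generation while still writing whatever audio was produced so far. The pattern reduced to its essentials, with a stand-in chunk source:

```python
def generate_with_graceful_exit(chunks):
    """Collect chunks until the source is exhausted or the user interrupts.

    Always returns the partial work collected so far, so the caller can
    still save it (as cli.py does with sf.write).
    """
    collected = []
    try:
        for chunk in chunks:
            collected.append(chunk)
    except KeyboardInterrupt:
        # Ctrl-C: stop generating, but keep everything collected so far.
        pass
    return collected
```

Because `KeyboardInterrupt` derives from `BaseException`, a bare `except Exception` would not catch it; the PR's explicit `except KeyboardInterrupt` is the right shape for this.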
26 changes: 16 additions & 10 deletions src/document_to_podcast/config.py
@@ -1,10 +1,11 @@
from pathlib import Path
from typing import Literal
from typing_extensions import Annotated

from pydantic import BaseModel, FilePath
from pydantic.functional_validators import AfterValidator

from document_to_podcast.inference.model_loaders import TTS_LOADERS
from document_to_podcast.inference.text_to_speech import TTS_INFERENCE
from document_to_podcast.preprocessing import DATA_LOADERS


@@ -41,14 +42,6 @@
},
]

SUPPORTED_TTS_MODELS = Literal[
"OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
"OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf",
"parler-tts/parler-tts-large-v1",
"parler-tts/parler-tts-mini-v1",
"parler-tts/parler-tts-mini-v1.1",
]


def validate_input_file(value):
if Path(value).suffix not in DATA_LOADERS:
@@ -73,6 +66,18 @@ def validate_text_to_text_prompt(value):
return value


def validate_text_to_speech_model(value):
if value not in TTS_LOADERS:
raise ValueError(
f"Model {value} is missing a loading function. Please define it under model_loaders.py"
)
if value not in TTS_INFERENCE:
raise ValueError(
f"Model {value} is missing an inference function. Please define it under text_to_speech.py"
)
return value


class Speaker(BaseModel):
id: int
name: str
@@ -88,5 +93,6 @@ class Config(BaseModel):
output_folder: str
text_to_text_model: Annotated[str, AfterValidator(validate_text_to_text_model)]
text_to_text_prompt: Annotated[str, AfterValidator(validate_text_to_text_prompt)]
text_to_speech_model: SUPPORTED_TTS_MODELS
text_to_speech_model: Annotated[str, AfterValidator(validate_text_to_speech_model)]
speakers: list[Speaker]
outetts_language: str = "en"
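With the `SUPPORTED_TTS_MODELS` Literal removed, a "supported" model is simply one that has an entry in both registries. A sketch of that validation idea, using hypothetical registry contents (in the PR, the real dicts live in model_loaders.py and text_to_speech.py):

```python
# Hypothetical registry contents for illustration only.
TTS_LOADERS = {"suno/bark": lambda model_id, **kw: object()}
TTS_INFERENCE = {"suno/bark": lambda text, model, profile, **kw: b""}


def validate_text_to_speech_model(value: str) -> str:
    # A model must be registered for both loading and inference;
    # each check points the developer at the file to extend.
    if value not in TTS_LOADERS:
        raise ValueError(
            f"Model {value} is missing a loading function. "
            "Please define it under model_loaders.py"
        )
    if value not in TTS_INFERENCE:
        raise ValueError(
            f"Model {value} is missing an inference function. "
            "Please define it under text_to_speech.py"
        )
    return value
```

Wired into pydantic via `Annotated[str, AfterValidator(validate_text_to_speech_model)]`, as the config.py hunk shows, this keeps the config model open to new TTS backends without touching the config schema itself.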