Skip to content

Commit

Permalink
Updates to demo to include audio part (#26)
Browse files Browse the repository at this point in the history
* Updates to generate audio in chunks

* Update spinner

* WIP: demo structure

* Use forked parler-tts. Use setup.sh

* Update demo

* Fix input_text

* Use cache_resource

* Drop print

* Add dividers

* Lint
  • Loading branch information
daavoo authored Dec 3, 2024
1 parent dae21b3 commit be82900
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 94 deletions.
5 changes: 5 additions & 0 deletions .github/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CI setup: CPU-only torch, audiotools built from source, then this project.
# Abort on the first failing step instead of continuing with a broken env.
set -e

python -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
git clone https://github.com/descriptinc/audiotools
# Must be "./audiotools": pip only treats an argument as a local path when it
# contains a path separator, so a bare "audiotools" would resolve against PyPI
# (a different package) instead of the checkout cloned above.
python -m pip install ./audiotools
python -m pip install -e .
rm -rf audiotools
168 changes: 76 additions & 92 deletions demo/app.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
import re
from pathlib import Path

import streamlit as st
from huggingface_hub import list_repo_files

from opennotebookllm.podcast_maker.config import PodcastConfig, SpeakerConfig
from opennotebookllm.preprocessing import DATA_LOADERS, DATA_CLEANERS
from opennotebookllm.inference.model_loaders import (
load_llama_cpp_model,
load_parler_tts_model_and_tokenizer,
)
from opennotebookllm.inference.text_to_speech import _speech_generation_parler
from opennotebookllm.inference.text_to_text import text_to_text_stream
from opennotebookllm.podcast_maker.script_to_audio import (
parse_script_to_waveform,
save_waveform_as_file,
)


PODCAST_PROMPT = """
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. The script features two speakers:
Expand All @@ -26,113 +23,100 @@
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, were exploring...",
"Speaker 2": "Hi Laura! Im excited to hear about this. Can you explain...",
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi Laura! I'm excited to hear about this. Can you explain...",
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, thats cool! But how does..."
"Speaker 2": "Oh, that's cool! But how does..."
}
"""

SPEAKER_1_DESC = "Laura's voice is exciting and fast in delivery with very clear audio and no background noise."
SPEAKER_2_DESC = "Jon's voice is calm with very clear audio and no background noise."
# Parler-TTS voice prompts keyed by speaker id, matching the "Speaker <n>"
# labels the LLM emits in the generated script (see the regex lookup below).
SPEAKER_DESCRIPTIONS = {
    "1": "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.",
    "2": "Jon's voice is calm with very clear audio and no background noise.",
}


@st.cache_resource
def load_text_to_text_model():
    """Load the default llama.cpp text-to-text model (cached across reruns)."""
    # Fixed model choice for the demo; cached so Streamlit reruns reuse it.
    model_id = (
        "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
    )
    return load_llama_cpp_model(model_id=model_id)


# Hugging Face repos offered in the demo's model selectbox; each hosts GGUF
# quantizations loadable by llama.cpp.
CURATED_REPOS = [
    "allenai/OLMoE-1B-7B-0924-Instruct-GGUF",
    "MaziyarPanahi/SmolLM2-1.7B-Instruct-GGUF",
    # system prompt seems to be ignored for this model.
    # "microsoft/Phi-3-mini-4k-instruct-gguf",
    "HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
    "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
    "Qwen/Qwen2.5-3B-Instruct-GGUF",
]
@st.cache_resource
def load_text_to_speech_model_and_tokenizer():
    """Load the Parler-TTS model and tokenizer on CPU (cached across reruns)."""
    repo_id = "parler-tts/parler-tts-mini-v1"
    return load_parler_tts_model_and_tokenizer(repo_id, "cpu")


st.title("Document To Podcast")

st.header("Uploading Data")

uploaded_file = st.file_uploader(
"Choose a file", type=["pdf", "html", "txt", "docx", "md"]
)


if uploaded_file is not None:
st.divider()
st.header("Loading and Cleaning Data")
st.markdown(
"[API Reference for data_cleaners](https://mozilla-ai.github.io/document-to-podcast/api/#opennotebookllm.preprocessing.data_cleaners)"
)

extension = Path(uploaded_file.name).suffix

col1, col2 = st.columns(2)

raw_text = DATA_LOADERS[extension](uploaded_file)
with col1:
st.title("Raw Text")
st.subheader("Raw Text")
st.text_area(f"Total Length: {len(raw_text)}", f"{raw_text[:500]} . . .")

clean_text = DATA_CLEANERS[extension](raw_text)
with col2:
st.title("Cleaned Text")
st.subheader("Cleaned Text")
st.text_area(f"Total Length: {len(clean_text)}", f"{clean_text[:500]} . . .")

repo_name = st.selectbox("Select Repo", CURATED_REPOS)
model_name = st.selectbox(
"Select Model",
[
x
for x in list_repo_files(repo_name)
if ".gguf" in x.lower() and ("q8" in x.lower() or "fp16" in x.lower())
],
index=None,
st.divider()
st.header("Downloading and Loading models")
st.markdown(
"[API Reference for model_loaders](https://mozilla-ai.github.io/document-to-podcast/api/#opennotebookllm.inference.model_loaders)"
)
if model_name:
with st.spinner("Downloading and Loading Model..."):
model = load_llama_cpp_model(model_id=f"{repo_name}/{model_name}")

# ~4 characters per token is considered a reasonable default.
max_characters = model.n_ctx() * 4
if len(clean_text) > max_characters:
st.warning(
f"Input text is too big ({len(clean_text)})."
f" Using only a subset of it ({max_characters})."
)
clean_text = clean_text[:max_characters]

system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)

if st.button("Generate Podcast"):
final_script = ""
with st.spinner("Generating Podcast Script..."):
text = ""
for chunk in text_to_text_stream(
clean_text, model, system_prompt=system_prompt.strip()
):
text += chunk
final_script += chunk
if text.endswith("\n"):
st.write(text)
text = ""

if final_script:
model.close() # Free up memory in order to load the TTS model

filename = "demo_podcast.wav"

with st.spinner("Downloading and Loading TTS Model..."):
tts_model, tokenizer = load_parler_tts_model_and_tokenizer(
"parler-tts/parler-tts-mini-v1", "cpu"
)
speaker_1 = SpeakerConfig(
model=tts_model,
speaker_id="1",
tokenizer=tokenizer,
speaker_description=SPEAKER_1_DESC,
)
speaker_2 = SpeakerConfig(
model=tts_model,
speaker_id="2",
tokenizer=tokenizer,
speaker_description=SPEAKER_2_DESC,
)
demo_podcast_config = PodcastConfig(
speakers={s.speaker_id: s for s in [speaker_1, speaker_2]}
)

with st.spinner("Generating Audio..."):
waveform = parse_script_to_waveform(
final_script, demo_podcast_config
)
save_waveform_as_file(
waveform, demo_podcast_config.sampling_rate, filename
)
st.audio(filename)

text_model = load_text_to_text_model()
speech_model, speech_tokenizer = load_text_to_speech_model_and_tokenizer()

# ~4 characters per token is considered a reasonable default.
max_characters = text_model.n_ctx() * 4
if len(clean_text) > max_characters:
st.warning(
f"Input text is too big ({len(clean_text)})."
f" Using only a subset of it ({max_characters})."
)
clean_text = clean_text[:max_characters]

st.divider()
st.header("Podcast generation")

system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)

if st.button("Generate Podcast"):
with st.spinner("Generating Podcast..."):
text = ""
for chunk in text_to_text_stream(
clean_text, text_model, system_prompt=system_prompt.strip()
):
text += chunk
if text.endswith("\n") and "Speaker" in text:
st.write(text)
speaker_id = re.search(r"Speaker (\d+)", text).group(1)
with st.spinner("Generating Audio..."):
speech = _speech_generation_parler(
text.split(f'"Speaker {speaker_id}":')[-1],
speech_model,
speech_tokenizer,
SPEAKER_DESCRIPTIONS[speaker_id],
)
st.audio(speech, sample_rate=44_100)
text = ""
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ dependencies = [
"huggingface-hub",
"llama-cpp-python",
"loguru",
"parler_tts @ git+https://github.com/huggingface/parler-tts.git",
"parler_tts @ git+https://github.com/daavoo/parler-tts.git",
"pydantic",
"PyPDF2[crypto]",
"python-docx",
Expand Down
2 changes: 1 addition & 1 deletion src/opennotebookllm/podcast_maker/script_to_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def parse_script_to_waveform(script: str, podcast_config: PodcastConfig):
podcast_waveform = []
for part in parts:
if ":" in part:
speaker_id, speaker_text = part.replace("\"", "").split(":")
speaker_id, speaker_text = part.replace('"', "").split(":")
speaker_model = podcast_config.speakers[speaker_id].model
speaker_tokenizer = podcast_config.speakers[speaker_id].tokenizer
speaker_description = podcast_config.speakers[
Expand Down

0 comments on commit be82900

Please sign in to comment.