Skip to content

Commit

Permalink
Updates to demo to include audio part (#26)
Browse files Browse the repository at this point in the history
* Updates to generate audio in chunks

* Update spinner

* WIP: demo structure

* Use forked parler-tts. Use setup.sh

* Update demo

* Fix input_text

* Use cache_resource

* Drop print

* Add dividers

* Lint
  • Loading branch information
daavoo authored Dec 3, 2024
1 parent dae21b3 commit be82900
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 94 deletions.
5 changes: 5 additions & 0 deletions .github/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CI setup: CPU-only torch, audiotools built from source, then this project.
# Abort on the first failing step instead of continuing with a broken env.
set -e

python -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
git clone https://github.com/descriptinc/audiotools
# Must be "./audiotools": pip only treats an argument as a local path when it
# contains a path separator, so a bare "audiotools" would resolve against PyPI
# (a different package) instead of the checkout cloned above.
python -m pip install ./audiotools
python -m pip install -e .
rm -rf audiotools
168 changes: 76 additions & 92 deletions demo/app.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
import re
from pathlib import Path

import streamlit as st
from huggingface_hub import list_repo_files

from opennotebookllm.podcast_maker.config import PodcastConfig, SpeakerConfig
from opennotebookllm.preprocessing import DATA_LOADERS, DATA_CLEANERS
from opennotebookllm.inference.model_loaders import (
load_llama_cpp_model,
load_parler_tts_model_and_tokenizer,
)
from opennotebookllm.inference.text_to_speech import _speech_generation_parler
from opennotebookllm.inference.text_to_text import text_to_text_stream
from opennotebookllm.podcast_maker.script_to_audio import (
parse_script_to_waveform,
save_waveform_as_file,
)


PODCAST_PROMPT = """
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. The script features two speakers:
Expand All @@ -26,113 +23,100 @@
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, were exploring...",
"Speaker 2": "Hi Laura! Im excited to hear about this. Can you explain...",
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi Laura! I'm excited to hear about this. Can you explain...",
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, thats cool! But how does..."
"Speaker 2": "Oh, that's cool! But how does..."
}
"""

SPEAKER_1_DESC = "Laura's voice is exciting and fast in delivery with very clear audio and no background noise."
SPEAKER_2_DESC = "Jon's voice is calm with very clear audio and no background noise."
# Parler-TTS voice prompts keyed by speaker id, matching the "Speaker <n>"
# labels the LLM emits in the generated script (see the regex lookup below).
SPEAKER_DESCRIPTIONS = {
    "1": "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.",
    "2": "Jon's voice is calm with very clear audio and no background noise.",
}


@st.cache_resource
def load_text_to_text_model():
    """Load the default llama.cpp text-to-text model (cached across reruns)."""
    # Fixed model choice for the demo; cached so Streamlit reruns reuse it.
    model_id = (
        "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
    )
    return load_llama_cpp_model(model_id=model_id)


# Hugging Face repos offered in the demo's model selectbox; each hosts GGUF
# quantizations loadable by llama.cpp.
CURATED_REPOS = [
    "allenai/OLMoE-1B-7B-0924-Instruct-GGUF",
    "MaziyarPanahi/SmolLM2-1.7B-Instruct-GGUF",
    # system prompt seems to be ignored for this model.
    # "microsoft/Phi-3-mini-4k-instruct-gguf",
    "HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
    "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
    "Qwen/Qwen2.5-3B-Instruct-GGUF",
]
@st.cache_resource
def load_text_to_speech_model_and_tokenizer():
    """Load the Parler-TTS model and tokenizer on CPU (cached across reruns)."""
    repo_id = "parler-tts/parler-tts-mini-v1"
    return load_parler_tts_model_and_tokenizer(repo_id, "cpu")


st.title("Document To Podcast")

st.header("Uploading Data")

uploaded_file = st.file_uploader(
"Choose a file", type=["pdf", "html", "txt", "docx", "md"]
)


if uploaded_file is not None:
st.divider()
st.header("Loading and Cleaning Data")
st.markdown(
"[API Reference for data_cleaners](https://mozilla-ai.github.io/document-to-podcast/api/#opennotebookllm.preprocessing.data_cleaners)"
)

extension = Path(uploaded_file.name).suffix

col1, col2 = st.columns(2)

raw_text = DATA_LOADERS[extension](uploaded_file)
with col1:
st.title("Raw Text")
st.subheader("Raw Text")
st.text_area(f"Total Length: {len(raw_text)}", f"{raw_text[:500]} . . .")

clean_text = DATA_CLEANERS[extension](raw_text)
with col2:
st.title("Cleaned Text")
st.subheader("Cleaned Text")
st.text_area(f"Total Length: {len(clean_text)}", f"{clean_text[:500]} . . .")

repo_name = st.selectbox("Select Repo", CURATED_REPOS)
model_name = st.selectbox(
"Select Model",
[
x
for x in list_repo_files(repo_name)
if ".gguf" in x.lower() and ("q8" in x.lower() or "fp16" in x.lower())
],
index=None,
st.divider()
st.header("Downloading and Loading models")
st.markdown(
"[API Reference for model_loaders](https://mozilla-ai.github.io/document-to-podcast/api/#opennotebookllm.inference.model_loaders)"
)
if model_name:
with st.spinner("Downloading and Loading Model..."):
model = load_llama_cpp_model(model_id=f"{repo_name}/{model_name}")

# ~4 characters per token is considered a reasonable default.
max_characters = model.n_ctx() * 4
if len(clean_text) > max_characters:
st.warning(
f"Input text is too big ({len(clean_text)})."
f" Using only a subset of it ({max_characters})."
)
clean_text = clean_text[:max_characters]

system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)

if st.button("Generate Podcast"):
final_script = ""
with st.spinner("Generating Podcast Script..."):
text = ""
for chunk in text_to_text_stream(
clean_text, model, system_prompt=system_prompt.strip()
):
text += chunk
final_script += chunk
if text.endswith("\n"):
st.write(text)
text = ""

if final_script:
model.close() # Free up memory in order to load the TTS model

filename = "demo_podcast.wav"

with st.spinner("Downloading and Loading TTS Model..."):
tts_model, tokenizer = load_parler_tts_model_and_tokenizer(
"parler-tts/parler-tts-mini-v1", "cpu"
)
speaker_1 = SpeakerConfig(
model=tts_model,
speaker_id="1",
tokenizer=tokenizer,
speaker_description=SPEAKER_1_DESC,
)
speaker_2 = SpeakerConfig(
model=tts_model,
speaker_id="2",
tokenizer=tokenizer,
speaker_description=SPEAKER_2_DESC,
)
demo_podcast_config = PodcastConfig(
speakers={s.speaker_id: s for s in [speaker_1, speaker_2]}
)

with st.spinner("Generating Audio..."):
waveform = parse_script_to_waveform(
final_script, demo_podcast_config
)
save_waveform_as_file(
waveform, demo_podcast_config.sampling_rate, filename
)
st.audio(filename)

text_model = load_text_to_text_model()
speech_model, speech_tokenizer = load_text_to_speech_model_and_tokenizer()

# ~4 characters per token is considered a reasonable default.
max_characters = text_model.n_ctx() * 4
if len(clean_text) > max_characters:
st.warning(
f"Input text is too big ({len(clean_text)})."
f" Using only a subset of it ({max_characters})."
)
clean_text = clean_text[:max_characters]

st.divider()
st.header("Podcast generation")

system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)

if st.button("Generate Podcast"):
with st.spinner("Generating Podcast..."):
text = ""
for chunk in text_to_text_stream(
clean_text, text_model, system_prompt=system_prompt.strip()
):
text += chunk
if text.endswith("\n") and "Speaker" in text:
st.write(text)
speaker_id = re.search(r"Speaker (\d+)", text).group(1)
with st.spinner("Generating Audio..."):
speech = _speech_generation_parler(
text.split(f'"Speaker {speaker_id}":')[-1],
speech_model,
speech_tokenizer,
SPEAKER_DESCRIPTIONS[speaker_id],
)
st.audio(speech, sample_rate=44_100)
text = ""
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ dependencies = [
"huggingface-hub",
"llama-cpp-python",
"loguru",
"parler_tts @ git+https://github.com/huggingface/parler-tts.git",
"parler_tts @ git+https://github.com/daavoo/parler-tts.git",
"pydantic",
"PyPDF2[crypto]",
"python-docx",
Expand Down
2 changes: 1 addition & 1 deletion src/opennotebookllm/podcast_maker/script_to_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def parse_script_to_waveform(script: str, podcast_config: PodcastConfig):
podcast_waveform = []
for part in parts:
if ":" in part:
speaker_id, speaker_text = part.replace("\"", "").split(":")
speaker_id, speaker_text = part.replace('"', "").split(":")
speaker_model = podcast_config.speakers[speaker_id].model
speaker_tokenizer = podcast_config.speakers[speaker_id].tokenizer
speaker_description = podcast_config.speakers[
Expand Down

0 comments on commit be82900

Please sign in to comment.