diff --git a/app.py b/app.py
index 13754bd..8cf6e56 100644
--- a/app.py
+++ b/app.py
@@ -104,7 +104,7 @@ def callback(indata, frames, time, status):
 
                 with console.status("Generating response...", spinner="earth"):
                     response = get_llm_response(text)
-                    sample_rate, audio_array = tts.synthesize(response)
+                    sample_rate, audio_array = tts.long_form_synthesize(response)
 
                 console.print(f"[cyan]Assistant: {response}")
                 sd.play(audio_array, sample_rate)
diff --git a/pyproject.toml b/pyproject.toml
index 24de13f..c3d5667 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ speechrecognition = "^3.10.1"
 pyaudio = "^0.2.14"
 langchain = "^0.1.13"
 vocos = "^0.1.0"
+nltk = "^3.8.1"
 
 [tool.poetry.group.dev.dependencies]
 
diff --git a/tts.py b/tts.py
index fe5c866..bb7e4dc 100644
--- a/tts.py
+++ b/tts.py
@@ -1,5 +1,7 @@
+import nltk
 import torch
 import warnings
+import numpy as np
 from transformers import AutoProcessor, BarkModel
 
 warnings.filterwarnings(
@@ -25,3 +27,34 @@ def synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
         audio_array = audio_array.cpu().numpy().squeeze()
         sample_rate = self.model.generation_config.sample_rate
         return sample_rate, audio_array
+
+    def long_form_synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
+        """Synthesize text sentence by sentence and join the clips.
+
+        The text is split with ``nltk.sent_tokenize``, each sentence is passed
+        to ``synthesize``, and 0.25 s of silence is appended after every clip.
+
+        Args:
+            text: Text to speak; may contain several sentences.
+            voice_preset: Voice preset forwarded to ``synthesize``.
+
+        Returns:
+            A ``(sample_rate, audio_array)`` tuple; ``audio_array`` is empty
+            when ``text`` yields no sentences.
+        """
+        # NOTE(review): sent_tokenize needs the NLTK Punkt data installed;
+        # confirm it is provisioned wherever this runs.
+        sample_rate = self.model.generation_config.sample_rate
+        sentences = nltk.sent_tokenize(text)
+        if not sentences:
+            # np.concatenate raises ValueError when given no arrays.
+            return sample_rate, np.zeros(0, dtype=np.float32)
+
+        pieces = []
+        for sent in sentences:
+            _, audio_array = self.synthesize(sent, voice_preset)
+            # Match the clip dtype so concatenation does not upcast the audio.
+            silence = np.zeros(int(0.25 * sample_rate), dtype=audio_array.dtype)
+            pieces += [audio_array, silence]
+
+        return sample_rate, np.concatenate(pieces)