Skip to content

Commit

Permalink
feat: add bark long form speech synthesize
Browse files Browse the repository at this point in the history
  • Loading branch information
vndee committed Mar 28, 2024
1 parent 1b39489 commit 1645470
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 2 deletions.
2 changes: 1 addition & 1 deletion app.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def callback(indata, frames, time, status):

with console.status("Generating response...", spinner="earth"):
response = get_llm_response(text)
sample_rate, audio_array = tts.synthesize(response)
sample_rate, audio_array = tts.long_form_synthesize(response)

console.print(f"[cyan]Assistant: {response}")
sd.play(audio_array, sample_rate)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ speechrecognition = "^3.10.1"
pyaudio = "^0.2.14"
langchain = "^0.1.13"
vocos = "^0.1.0"
nltk = "^3.8.1"


[tool.poetry.group.dev.dependencies]
Expand Down
15 changes: 14 additions & 1 deletion tts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import nltk
import torch
import warnings
import numpy as np
from transformers import AutoProcessor, BarkModel

warnings.filterwarnings(
Expand All @@ -15,7 +17,7 @@ def __init__(self, device: str = "cuda" if torch.cuda.is_available() else "cpu")
self.model = BarkModel.from_pretrained("suno/bark-small")
self.model.to(self.device)

def synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
def synthesize(self, text: str, voice_preset: str):
inputs = self.processor(text, voice_preset=voice_preset, return_tensors="pt")
inputs = {k: v.to(self.device) for k, v in inputs.items()}

Expand All @@ -25,3 +27,14 @@ def synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
audio_array = audio_array.cpu().numpy().squeeze()
sample_rate = self.model.generation_config.sample_rate
return sample_rate, audio_array

def long_form_synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
    """Synthesize multi-sentence text one sentence at a time.

    The text is split with ``nltk.sent_tokenize``; each sentence is passed to
    ``self.synthesize`` and the resulting clips are joined, with 0.25 s of
    silence appended after every sentence (including the last one).

    Args:
        text: The text to speak. May be empty or whitespace-only.
        voice_preset: Voice preset forwarded unchanged to ``self.synthesize``.

    Returns:
        A ``(sample_rate, audio_array)`` tuple, where ``sample_rate`` is read
        from ``self.model.generation_config`` and ``audio_array`` is a 1-D
        NumPy array. For input with no sentences the array is empty.

    NOTE(review): ``nltk.sent_tokenize`` needs NLTK's sentence tokenizer data
    to be installed at runtime -- confirm the app downloads it on setup.
    """
    sample_rate = self.model.generation_config.sample_rate

    # Blank input has no sentences; np.concatenate([]) would raise ValueError,
    # so return an empty clip instead of crashing the caller.
    # float32 is assumed to match the model's output dtype -- TODO confirm.
    if not text or not text.strip():
        return sample_rate, np.zeros(0, dtype=np.float32)

    pieces = []
    silence = None
    for sent in nltk.sent_tokenize(text):
        _, audio_array = self.synthesize(sent, voice_preset)
        # synthesize() squeezes its output, which can leave a 0-d array that
        # np.concatenate rejects; promote it to 1-D.
        audio_array = np.atleast_1d(audio_array)
        if silence is None:
            # Build the pause in the clips' own dtype so concatenation does
            # not silently upcast the whole waveform to float64.
            silence = np.zeros(int(0.25 * sample_rate), dtype=audio_array.dtype)
        # np.concatenate copies its inputs, so one shared buffer is enough.
        pieces += [audio_array, silence]

    # Defensive: the tokenizer produced no sentences for non-blank input.
    if not pieces:
        return sample_rate, np.zeros(0, dtype=np.float32)

    return sample_rate, np.concatenate(pieces)

0 comments on commit 1645470

Please sign in to comment.