Skip to content

Commit

Permalink
feat: add tts docstring
Browse files: browse the repository at this point in the history
  • Loading branch information
vndee committed Mar 29, 2024
1 parent eb8e890 commit 1af6236
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 5 deletions.
8 changes: 4 additions & 4 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,14 @@ def callback(indata, frames, time, status):


if __name__ == "__main__":
console.print(
"[cyan]Assistant started! Press Ctrl+C to exit."
)
console.print("[cyan]Assistant started! Press Ctrl+C to exit.")

try:
while True:
# Wait for the user to press Enter to start recording
console.input("Press Enter to start recording, then press Enter again to stop.")
console.input(
"Press Enter to start recording, then press Enter again to stop."
)

data_queue = Queue() # type: ignore[var-annotated]
stop_event = threading.Event()
Expand Down
29 changes: 28 additions & 1 deletion tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,29 @@

class TextToSpeechService:
def __init__(self, device: str = "cuda" if torch.cuda.is_available() else "cpu"):
    """
    Load the Bark processor and model and move the model onto ``device``.

    Args:
        device (str, optional): Device string passed to ``model.to``. The default is computed once,
            when the method is defined: "cuda" if ``torch.cuda.is_available()`` returns True, otherwise "cpu".
    """
    # Both the processor and the model are loaded from the same checkpoint name.
    checkpoint = "suno/bark-small"

    self.device = device
    self.processor = AutoProcessor.from_pretrained(checkpoint)
    self.model = BarkModel.from_pretrained(checkpoint)
    self.model.to(device)

def synthesize(self, text: str, voice_preset: str):
def synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
"""
Synthesizes audio from the given text using the specified voice preset.
Args:
text (str): The input text to be synthesized.
voice_preset (str, optional): The voice preset to be used for the synthesis. Defaults to "v2/en_speaker_1".
Returns:
tuple: A tuple containing the sample rate and the generated audio array.
"""
inputs = self.processor(text, voice_preset=voice_preset, return_tensors="pt")
inputs = {k: v.to(self.device) for k, v in inputs.items()}

Expand All @@ -29,6 +46,16 @@ def synthesize(self, text: str, voice_preset: str):
return sample_rate, audio_array

def long_form_synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
"""
Synthesizes audio from the given long-form text using the specified voice preset.
Args:
text (str): The input text to be synthesized.
voice_preset (str, optional): The voice preset to be used for the synthesis. Defaults to "v2/en_speaker_1".
Returns:
tuple: A tuple containing the sample rate and the generated audio array.
"""
pieces = []
sentences = nltk.sent_tokenize(text)
silence = np.zeros(int(0.25 * self.model.generation_config.sample_rate))
Expand Down

0 comments on commit 1af6236

Please sign in to comment.