diff --git a/app.py b/app.py
index c6643db..830ba3d 100644
--- a/app.py
+++ b/app.py
@@ -62,14 +62,14 @@ def callback(indata, frames, time, status):
 
 
 if __name__ == "__main__":
-    console.print(
-        "[cyan]Assistant started! Press Ctrl+C to exit."
-    )
+    console.print("[cyan]Assistant started! Press Ctrl+C to exit.")
 
     try:
         while True:
             # Wait for the user to press Enter to start recording
-            console.input("Press Enter to start recording, then press Enter again to stop.")
+            console.input(
+                "Press Enter to start recording, then press Enter again to stop."
+            )
 
             data_queue = Queue()  # type: ignore[var-annotated]
             stop_event = threading.Event()
diff --git a/tts.py b/tts.py
index bb7e4dc..13b9613 100644
--- a/tts.py
+++ b/tts.py
@@ -12,12 +12,29 @@ class TextToSpeechService:
     def __init__(self, device: str = "cuda" if torch.cuda.is_available() else "cpu"):
+        """
+        Initializes the TextToSpeechService class.
+
+        Args:
+            device (str, optional): The device to be used for the model, either "cuda" if a GPU is available or "cpu".
+                Defaults to "cuda" if available, otherwise "cpu".
+        """
         self.device = device
         self.processor = AutoProcessor.from_pretrained("suno/bark-small")
         self.model = BarkModel.from_pretrained("suno/bark-small")
         self.model.to(self.device)
 
-    def synthesize(self, text: str, voice_preset: str):
+    def synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
+        """
+        Synthesizes audio from the given text using the specified voice preset.
+
+        Args:
+            text (str): The input text to be synthesized.
+            voice_preset (str, optional): The voice preset to be used for the synthesis. Defaults to "v2/en_speaker_1".
+
+        Returns:
+            tuple: A tuple containing the sample rate and the generated audio array.
+        """
         inputs = self.processor(text, voice_preset=voice_preset, return_tensors="pt")
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
@@ -29,6 +46,16 @@ def synthesize(self, text: str, voice_preset: str):
         return sample_rate, audio_array
 
     def long_form_synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
+        """
+        Synthesizes audio from the given long-form text using the specified voice preset.
+
+        Args:
+            text (str): The input text to be synthesized.
+            voice_preset (str, optional): The voice preset to be used for the synthesis. Defaults to "v2/en_speaker_1".
+
+        Returns:
+            tuple: A tuple containing the sample rate and the generated audio array.
+        """
         pieces = []
         sentences = nltk.sent_tokenize(text)
         silence = np.zeros(int(0.25 * self.model.generation_config.sample_rate))
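For reference, a minimal usage sketch of the API documented by the new docstrings. It assumes tts.py is importable as `tts`, that `sounddevice` is installed, and that nltk's `punkt` tokenizer data has been downloaded; none of these details are part of the diff itself, only the `TextToSpeechService` interface is.

```python
# Minimal sketch: exercise the documented TextToSpeechService API.
# Assumptions: tts.py is on the import path, sounddevice is installed,
# and nltk.download("punkt") has been run (sent_tokenize needs it).
import sounddevice as sd

from tts import TextToSpeechService

tts_service = TextToSpeechService()  # picks "cuda" when available, else "cpu"

# long_form_synthesize splits the text into sentences and joins the generated
# chunks with short silences; it returns (sample_rate, audio_array).
sample_rate, audio_array = tts_service.long_form_synthesize(
    "Hello! This is a quick test of the Bark text-to-speech service."
)

sd.play(audio_array, sample_rate)  # play the synthesized audio
sd.wait()                          # block until playback finishes
```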