feat: add tts docstring

sj80231 · Mar 29, 2024 · 1af6236 · 1af6236
1 parent eb8e890
commit 1af6236
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 5 deletions.
diff --git a/app.py b/app.py
@@ -62,14 +62,14 @@ def callback(indata, frames, time, status):
 
 
 if __name__ == "__main__":
-    console.print(
-        "[cyan]Assistant started! Press Ctrl+C to exit."
-    )
+    console.print("[cyan]Assistant started! Press Ctrl+C to exit.")
 
     try:
         while True:
             # Wait for the user to press Enter to start recording
-            console.input("Press Enter to start recording, then press Enter again to stop.")
+            console.input(
+                "Press Enter to start recording, then press Enter again to stop."
+            )
 
             data_queue = Queue()  # type: ignore[var-annotated]
             stop_event = threading.Event()

diff --git a/tts.py b/tts.py
@@ -12,12 +12,29 @@
 
 class TextToSpeechService:
     def __init__(self, device: str = "cuda" if torch.cuda.is_available() else "cpu"):
+        """
+        Initializes the TextToSpeechService class.
+
+        Args:
+            device (str, optional): The device the model is moved to, such as "cuda" or "cpu".
+                Defaults to "cuda" if a GPU is available, otherwise "cpu".
+        """
         self.device = device
         self.processor = AutoProcessor.from_pretrained("suno/bark-small")
         self.model = BarkModel.from_pretrained("suno/bark-small")
         self.model.to(self.device)
 
-    def synthesize(self, text: str, voice_preset: str):
+    def synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
+        """
+        Synthesizes audio from the given text using the specified voice preset.
+
+        Args:
+            text (str): The input text to be synthesized.
+            voice_preset (str, optional): The voice preset to be used for the synthesis. Defaults to "v2/en_speaker_1".
+
+        Returns:
+            tuple: A tuple containing the sample rate and the generated audio array.
+        """
         inputs = self.processor(text, voice_preset=voice_preset, return_tensors="pt")
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
@@ -29,6 +46,16 @@ def synthesize(self, text: str, voice_preset: str):
         return sample_rate, audio_array
 
     def long_form_synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
+        """
+        Synthesizes audio from the given long-form text using the specified voice preset.
+
+        Args:
+            text (str): The input text to be synthesized.
+            voice_preset (str, optional): The voice preset to be used for the synthesis. Defaults to "v2/en_speaker_1".
+
+        Returns:
+            tuple: A tuple containing the sample rate and the generated audio array.
+        """
         pieces = []
         sentences = nltk.sent_tokenize(text)
         silence = np.zeros(int(0.25 * self.model.generation_config.sample_rate))