Separate recording_interval from frame_duration.

ionic-bond · Dec 16, 2024 · 0410b3e · 0410b3e
1 parent 5022e80
commit 0410b3e
Show file tree

Hide file tree

Showing 4 changed files with 17 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -152,6 +152,8 @@ python3 ./stream-translator-gpt/translator.py
 | `--cookies`                        |                           | Used to open member-only stream, this parameter will be passed directly to yt-dlp.                                                                                                                       |
 | `--input_proxy`                    |                           | Use the specified HTTP/HTTPS/SOCKS proxy for yt-dlp, e.g. http://127.0.0.1:7890.                                                                                                                         |
 | `--device_index`                   |                           | The index of the device that needs to be recorded. If not set, the system default recording device will be used.                                                                                         |
+| `--print_all_devices`              |                           | Print all audio devices info then exit.                                                                                                                                                                  |
+| `--device_recording_interval`      | 0.5                       | The shorter the recording interval, the lower the latency, but it will increase CPU usage. It is recommended to set it between 0.1 and 1.0.                                                              |
 | **Audio Slicing Options**          |
 | `--frame_duration`                 | 0.1                       | The unit that processes live streaming data in seconds, should be >= 0.03                                                                                                                                |
 | `--continuous_no_speech_threshold` | 0.5                       | Slice if there is no speech for a continuous period in second.                                                                                                                                           |

diff --git a/README_CN.md b/README_CN.md
@@ -150,6 +150,8 @@ python3 ./stream-translator-gpt/translator.py
 | `--cookies`                        |                           | 用于打开仅会员可看的直播流，此参数将直接传递给yt-dlp。                                                                     |
 | `--input_proxy`                    |                           | 为 yt-dlp 使用指定的 HTTP/HTTPS/SOCKS 代理，例如 http://127.0.0.1:7890。                                                   |
 | `--device_index`                   |                           | 音频输入设备的index。如果未设置，则使用系统默认音频输入设备。                                                              |
+| `--print_all_devices`              |                           | 打印所有音频设备信息然后退出。                                                                                             |
+| `--device_recording_interval`      | 0.5                       | 录音间隔越短，延迟越低，但会增加CPU使用率。建议将其设置在0.1到1.0之间。                                                    |
 | **音频切割选项**                   |
 | `--frame_duration`                 | 0.1                       | 处理实时流数据的单位（以秒为单位），需大于等于0.03。                                                                       |
 | `--continuous_no_speech_threshold` | 0.5                       | 如果连续一段时间内没有语音，则进行切片（以秒为单位）。                                                                     |

diff --git a/stream_translator_gpt/audio_getter.py b/stream_translator_gpt/audio_getter.py
@@ -124,19 +124,22 @@ def loop(self, output_queue: queue.SimpleQueue[np.array]):
 
 class DeviceAudioGetter(LoopWorkerBase):
 
-    def __init__(self, device_index: int, frame_duration: float) -> None:
+    def __init__(self, device_index: int, frame_duration: float, recording_interval: float) -> None:
         import sounddevice as sd
         if device_index:
             sd.default.device[0] = device_index
         sd.default.dtype[0] = np.float32
         self.frame_duration = frame_duration
+        self.recording_frame_num = max(1, round(recording_interval / frame_duration))
         print('Recording device: {}'.format(sd.query_devices(sd.default.device[0])['name']))
 
     def loop(self, output_queue: queue.SimpleQueue[np.array]):
         import sounddevice as sd
         while True:
-            audio = sd.rec(frames=round(SAMPLE_RATE * self.frame_duration),
+            audio = sd.rec(frames=round(SAMPLE_RATE * self.frame_duration * self.recording_frame_num),
                            samplerate=SAMPLE_RATE,
                            channels=1,
                            blocking=True).flatten()
-            output_queue.put(audio)
+            split_audios = np.array_split(audio, self.recording_frame_num)
+            for split_audio in split_audios:
+                output_queue.put(split_audio)
diff --git a/stream_translator_gpt/translator.py b/stream_translator_gpt/translator.py
@@ -21,7 +21,7 @@ def _start_daemon_thread(func, *args, **kwargs):
     thread.start()
 
 
-def main(url, format, cookies, input_proxy, device_index, frame_duration,
+def main(url, format, cookies, input_proxy, device_index, device_recording_interval, frame_duration,
          continuous_no_speech_threshold, min_audio_length, max_audio_length,
          prefix_retention_length, vad_threshold, model, language, use_faster_whisper,
          use_whisper_api, whisper_filters, openai_api_key, google_api_key, gpt_translation_prompt,
@@ -141,6 +141,7 @@ def main(url, format, cookies, input_proxy, device_index, frame_duration,
         DeviceAudioGetter.work(
             device_index=device_index,
             frame_duration=frame_duration,
+            recording_interval=device_recording_interval,
             output_queue=getter_to_slicer_queue,
         )
     elif os.path.isabs(url):
@@ -196,6 +197,11 @@ def cli():
     parser.add_argument('--print_all_devices',
                         action='store_true',
                         help='Print all audio devices info then exit.')
+    parser.add_argument('--device_recording_interval',
+                        type=float,
+                        default=0.5,
+                        help='The shorter the recording interval, the lower the latency, '
+                        'but it will increase CPU usage. It is recommended to set it between 0.1 and 1.0.')
     parser.add_argument('--frame_duration',
                         type=float,
                         default=0.1,