From 405e748d736ec9308bc514f486664033458cbc6f Mon Sep 17 00:00:00 2001 From: ionic-bond Date: Mon, 25 Mar 2024 01:33:33 +0800 Subject: [PATCH] Change audio slicing default value --- README.md | 6 +++--- README_CN.md | 6 +++--- stream_translator_gpt/translator.py | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index db559e7..4bfcb44 100644 --- a/README.md +++ b/README.md @@ -148,11 +148,11 @@ python3 ./stream-translator-gpt/translator.py | `--device_index` | | The index of the device that needs to be recorded. If not set, the system default recording device will be used. | | **Audio Slicing Options** | | `--frame_duration` | 0.1 | The unit that processes live streaming data in seconds. | -| `--continuous_no_speech_threshold` | 0.8 | Slice if there is no speech for a continuous period in second. | +| `--continuous_no_speech_threshold` | 0.5 | Slice if there is no speech for a continuous period in second. | | `--min_audio_length` | 3.0 | Minimum slice audio length in seconds. | | `--max_audio_length` | 30.0 | Maximum slice audio length in seconds. | -| `--prefix_retention_length` | 0.8 | The length of the retention prefix audio during slicing. | -| `--vad_threshold` | 0.5 | The threshold of Voice activity detection. if the speech probability of a frame is higher than this value, then this frame is speech. | +| `--prefix_retention_length` | 0.5 | The length of the retention prefix audio during slicing. | +| `--vad_threshold` | 0.35 | The threshold of Voice activity detection. if the speech probability of a frame is higher than this value, then this frame is speech. | | **Transcription Options** | | `--model` | small | Select model size. See [here](https://github.com/openai/whisper#available-models-and-languages) for available models. | | `--language` | auto | Language spoken in the stream. See [here](https://github.com/openai/whisper#available-models-and-languages) for available languages. | diff --git a/README_CN.md b/README_CN.md index 4c6b2ca..6f65170 100644 --- a/README_CN.md +++ b/README_CN.md @@ -146,11 +146,11 @@ python3 ./stream-translator-gpt/translator.py | `--device_index` | | 音频输入设备的index。如果未设置,则使用系统默认音频输入设备。 | | **音频切割选项** | | `--frame_duration` | 0.1 | 处理实时流数据的单位(以秒为单位)。 | -| `--continuous_no_speech_threshold` | 0.8 | 如果连续一段时间内没有语音,则进行切片(以秒为单位)。 | +| `--continuous_no_speech_threshold` | 0.5 | 如果连续一段时间内没有语音,则进行切片(以秒为单位)。 | | `--min_audio_length` | 3.0 | 切片音频的最小长度(以秒为单位)。 | | `--max_audio_length` | 30.0 | 切片音频的最大长度(以秒为单位)。 | -| `--prefix_retention_length` | 0.8 | 在切割过程中保留前缀音频的长度。 | -| `--vad_threshold` | 0.5 | 人声检测阈值。如果一个帧的语音概率高于此值,那么这个帧就是人声。 | +| `--prefix_retention_length` | 0.5 | 在切割过程中保留前缀音频的长度。 | +| `--vad_threshold` | 0.35 | 人声检测阈值。如果一个帧的语音概率高于此值,那么这个帧就是人声。 | | **语音转文字选项** | | `--model` | small | Whisper模型大小。请在[此处](https://github.com/openai/whisper#available-models-and-languages)查看可用模型。 | | `--language` | auto | 直播流中的语言。请在[此处](https://github.com/openai/whisper#available-models-and-languages)查看可用语言。 | diff --git a/stream_translator_gpt/translator.py b/stream_translator_gpt/translator.py index d4b6be6..fc86be0 100644 --- a/stream_translator_gpt/translator.py +++ b/stream_translator_gpt/translator.py @@ -166,7 +166,7 @@ def cli(): help='The unit that processes live streaming data in seconds.') parser.add_argument('--continuous_no_speech_threshold', type=float, - default=0.8, + default=0.5, help='Slice if there is no speech for a continuous period in second.') parser.add_argument('--min_audio_length', type=float, @@ -178,11 +178,11 @@ def cli(): help='Maximum slice audio length in seconds.') parser.add_argument('--prefix_retention_length', type=float, - default=0.8, + default=0.5, help='The length of the retention prefix audio during slicing.') parser.add_argument('--vad_threshold', type=float, - default=0.5, + default=0.35, help='The threshold of Voice activity detection.' 'if the speech probability of a frame is higher than this value, ' 'then this frame is speech.')