From c533f1669fe25db7b9b52858b7a636aab0be012b Mon Sep 17 00:00:00 2001 From: Audran Bert <57795315+AudranBert@users.noreply.github.com> Date: Wed, 17 Apr 2024 12:07:08 +0200 Subject: [PATCH 1/9] Add transcription (offline/streaming) mode in summary of whisper readme --- whisper/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/whisper/README.md b/whisper/README.md index b4f1e3b..fa6c513 100644 --- a/whisper/README.md +++ b/whisper/README.md @@ -2,7 +2,9 @@ LinTO-STT-Whisper is an API for Automatic Speech Recognition (ASR) based on [Whisper models](https://openai.com/research/whisper). -LinTO-STT-Whisper can either be used as a standalone transcription service or deployed within a micro-services infrastructure using a message broker connector. +LinTO-STT-Whisper can either be used as a standalone transcription service or deployed within a micro-services infrastructure using a message broker connector. + +It can be used to do offline or real-time transcriptions. ## Pre-requisites @@ -371,4 +373,4 @@ This project is developped under the AGPLv3 License (see LICENSE). 
* [HuggingFace Transformers](https://github.com/huggingface/transformers) * [SpeechBrain](https://github.com/speechbrain/speechbrain) * [TorchAudio](https://github.com/pytorch/audio) -* [Whisper_Streaming](https://github.com/ufal/whisper_streaming) \ No newline at end of file +* [Whisper_Streaming](https://github.com/ufal/whisper_streaming) From 794fed4a06753d49b149c4ec7b473d59439267c5 Mon Sep 17 00:00:00 2001 From: Audran Bert <57795315+AudranBert@users.noreply.github.com> Date: Wed, 17 Apr 2024 12:08:38 +0200 Subject: [PATCH 2/9] Add transcription (offline/streaming) mode in summary of kaldi readme --- kaldi/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kaldi/README.md b/kaldi/README.md index ee5e222..4389170 100644 --- a/kaldi/README.md +++ b/kaldi/README.md @@ -4,6 +4,8 @@ LinTO-STT-Kaldi is an API for Automatic Speech Recognition (ASR) based on models LinTO-STT-Kaldi can either be used as a standalone transcription service or deployed within a micro-services infrastructure using a message broker connector. +It can be used to do offline or real-time transcriptions. + ## Pre-requisites ### Hardware From 15bf676b8cd5bc4b5f890e82ceca2bf9ce253fdf Mon Sep 17 00:00:00 2001 From: Audran Bert <57795315+AudranBert@users.noreply.github.com> Date: Wed, 17 Apr 2024 12:09:20 +0200 Subject: [PATCH 3/9] Add transcription (offline/streaming) mode in main readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 90c1e36..fd9d143 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ LinTO-STT is an API for Automatic Speech Recognition (ASR). LinTO-STT can either be used as a standalone transcription service or deployed within a micro-services infrastructure using a message broker connector. +It can be used to do offline or real-time transcriptions. 
+ The following families of STT models are currently supported (please refer to respective documentation for more details): * [Kaldi models](kaldi/README.md) * [Whisper models](whisper/README.md) From af62af6737ce319eb5e24f3b2db721c5b4057e7a Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Sun, 21 Apr 2024 19:19:34 +0200 Subject: [PATCH 4/9] Fix possible failure when NUM_THREADS was not specified --- whisper/RELEASE.md | 3 +++ whisper/stt/__init__.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/whisper/RELEASE.md b/whisper/RELEASE.md index 84de80f..78190fb 100644 --- a/whisper/RELEASE.md +++ b/whisper/RELEASE.md @@ -1,3 +1,6 @@ +# 1.0.4 +- Fix NUM_THREADS env variable (don't fail when not specified) + # 1.0.3 - Make Voice Activity Detection (VAD) configurable - Change default VAD from silero (neural approach) to auditok (heuristical approach), because silero can have unpredictable behaviour on different corner cases diff --git a/whisper/stt/__init__.py b/whisper/stt/__init__.py index 8bac458..774fe98 100644 --- a/whisper/stt/__init__.py +++ b/whisper/stt/__init__.py @@ -24,7 +24,6 @@ VAD_MIN_SILENCE_DURATION = float(os.environ.get("VAD_MAX_SILENCE_DURATION", 0.1)) NUM_THREADS = os.environ.get("NUM_THREADS", os.environ.get("OMP_NUM_THREADS")) -NUM_THREADS = int(NUM_THREADS) try: import faster_whisper @@ -55,6 +54,7 @@ def set_num_threads(n): # os.environ["OMP_NUM_THREADS"] = str(n) pass + DEFAULT_NUM_THREADS = None else: import torch DEFAULT_NUM_THREADS = torch.get_num_threads() From 8a3b536f99370c951a1c965a5f405265f428bbf8 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Sun, 21 Apr 2024 19:27:31 +0200 Subject: [PATCH 5/9] Do not enforce NUM_THREADS by default in .envdefault --- whisper/.envdefault | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whisper/.envdefault b/whisper/.envdefault index a8f8794..126fd51 100644 --- a/whisper/.envdefault +++ b/whisper/.envdefault @@ -55,7 +55,7 @@ PROMPT= # CUDA_VISIBLE_DEVICES=0 # Number 
of threads per worker when running on CPU -NUM_THREADS=4 +# NUM_THREADS=4 # Number of workers minus one (all except from the main one) CONCURRENCY=2 From deda3d985d02318cbda6e56885bdce3d84ca436b Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Sun, 21 Apr 2024 19:30:43 +0200 Subject: [PATCH 6/9] Add link to .envdefault file (for doc on DockerHub) --- kaldi/README.md | 2 ++ whisper/README.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/kaldi/README.md b/kaldi/README.md index 4389170..c179d9e 100644 --- a/kaldi/README.md +++ b/kaldi/README.md @@ -50,6 +50,8 @@ Have the acoustic and language model ready at AM_PATH and LM_PATH if you are usi **3- Fill the .env** +An example of .env file is provided in [kaldi/.envdefault](https://github.com/linto-ai/linto-stt/blob/master/kaldi/.envdefault). + ```bash cp kaldi/.envdefault kaldi/.env ``` diff --git a/whisper/README.md b/whisper/README.md index fa6c513..10a752b 100644 --- a/whisper/README.md +++ b/whisper/README.md @@ -110,6 +110,8 @@ docker pull lintoai/linto-stt-whisper ### 2- Fill the .env +An example of .env file is provided in [whisper/.envdefault](https://github.com/linto-ai/linto-stt/blob/master/whisper/.envdefault). + ```bash cp whisper/.envdefault whisper/.env ``` From c4164e8e0217b56a5bfb8b764220f704ccb546c8 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Sun, 21 Apr 2024 19:36:47 +0200 Subject: [PATCH 7/9] Tune doc --- kaldi/README.md | 10 +++------- whisper/README.md | 13 +++++-------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/kaldi/README.md b/kaldi/README.md index c179d9e..c7bd22b 100644 --- a/kaldi/README.md +++ b/kaldi/README.md @@ -48,14 +48,10 @@ docker pull lintoai/linto-stt-kaldi Have the acoustic and language model ready at AM_PATH and LM_PATH if you are using LinTO models. If you are using a Vosk model, have it ready at MODEL. 
-**3- Fill the .env** +**3- Fill the .env file** An example of .env file is provided in [kaldi/.envdefault](https://github.com/linto-ai/linto-stt/blob/master/kaldi/.envdefault). -```bash -cp kaldi/.envdefault kaldi/.env -``` - | PARAMETER | DESCRIPTION | EXEMPLE | |---|---|---| | SERVICE_MODE | STT serving mode see [Serving mode](#serving-mode) | http\|task\|websocket | @@ -89,7 +85,7 @@ docker run --rm \ -p HOST_SERVING_PORT:80 \ -v AM_PATH:/opt/AM \ -v LM_PATH:/opt/LM \ ---env-file kaldi/.env \ +--env-file .env \ linto-stt-kaldi:latest ``` @@ -115,7 +111,7 @@ docker run --rm \ -v AM_PATH:/opt/AM \ -v LM_PATH:/opt/LM \ -v SHARED_AUDIO_FOLDER:/opt/audio \ ---env-file kaldi/.env \ +--env-file .env \ linto-stt-kaldi:latest ``` diff --git a/whisper/README.md b/whisper/README.md index 10a752b..07a1a53 100644 --- a/whisper/README.md +++ b/whisper/README.md @@ -108,14 +108,10 @@ or docker pull lintoai/linto-stt-whisper ``` -### 2- Fill the .env +### 2- Fill the .env file An example of .env file is provided in [whisper/.envdefault](https://github.com/linto-ai/linto-stt/blob/master/whisper/.envdefault). -```bash -cp whisper/.envdefault whisper/.env -``` - | PARAMETER | DESCRIPTION | EXEMPLE | |---|---|---| | SERVICE_MODE | (Required) STT serving mode see [Serving mode](#serving-mode) | `http` \| `task` | @@ -188,7 +184,7 @@ yo(yoruba), zh(chinese) ``` and also `yue(cantonese)` since large-v3. -### Serving mode +#### Serving mode ![Serving Modes](https://i.ibb.co/qrtv3Z6/platform-stt.png) STT can be used in two ways: @@ -199,6 +195,7 @@ Mode is specified using the .env value or environment variable ```SERVING_MODE`` ```bash SERVICE_MODE=http ``` + ### HTTP Server The HTTP serving mode deploys a HTTP server and a swagger-ui to allow transcription request on a dedicated route. @@ -207,7 +204,7 @@ The SERVICE_MODE value in the .env should be set to ```http```. 
```bash docker run --rm \ -p HOST_SERVING_PORT:80 \ ---env-file whisper/.env \ +--env-file .env \ linto-stt-whisper:latest ``` @@ -240,7 +237,7 @@ You need a message broker up and running at MY_SERVICE_BROKER. ```bash docker run --rm \ -v SHARED_AUDIO_FOLDER:/opt/audio \ ---env-file whisper/.env \ +--env-file .env \ linto-stt-whisper:latest ``` From 3b769c54527339a314a7fb816ca822d2780aaa32 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Sun, 21 Apr 2024 19:39:06 +0200 Subject: [PATCH 8/9] move first definition of NUM_THREADS close to post-processing --- whisper/stt/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/whisper/stt/__init__.py b/whisper/stt/__init__.py index 774fe98..767f800 100644 --- a/whisper/stt/__init__.py +++ b/whisper/stt/__init__.py @@ -23,8 +23,6 @@ VAD_MIN_SPEECH_DURATION = float(os.environ.get("VAD_MIN_SPEECH_DURATION", 0.1)) VAD_MIN_SILENCE_DURATION = float(os.environ.get("VAD_MAX_SILENCE_DURATION", 0.1)) -NUM_THREADS = os.environ.get("NUM_THREADS", os.environ.get("OMP_NUM_THREADS")) - try: import faster_whisper @@ -62,6 +60,7 @@ def set_num_threads(n): torch.set_num_threads(n) # Number of CPU threads +NUM_THREADS = os.environ.get("NUM_THREADS", os.environ.get("OMP_NUM_THREADS")) if NUM_THREADS is None: NUM_THREADS = DEFAULT_NUM_THREADS if NUM_THREADS is not None: From e9b0fbcb8b590185751f0deb808a19f650d6f9e3 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Mon, 22 Apr 2024 08:17:16 +0200 Subject: [PATCH 9/9] Do not increase version for hotfix --- whisper/RELEASE.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/whisper/RELEASE.md b/whisper/RELEASE.md index 78190fb..84de80f 100644 --- a/whisper/RELEASE.md +++ b/whisper/RELEASE.md @@ -1,6 +1,3 @@ -# 1.0.4 -- Fix NUM_THREADS env variable (don't fail when not specified) - # 1.0.3 - Make Voice Activity Detection (VAD) configurable - Change default VAD from silero (neural approach) to auditok (heuristical approach), because silero can have unpredictable behaviour 
on different corner cases