linto-ai · Jeronymous · Dec 11, 2024 · Nov 30, 2024 · Nov 30, 2024 · Nov 30, 2024
diff --git a/README.md b/README.md
diff --git a/RELEASE.md b/RELEASE.md
@@ -1,3 +1,8 @@
+# 1.3.0
+ - Add input option "language" that can be passed at each request
+ - Add the result of language detection (or the given language) to the output, for each segment
+ - Add speaker identification ("speakerIdentification" option in "diarizationConfig")
+
 # 1.2.12
  - Do not fail when asking to convert numbers with env. variable LANGUAGE=*
 

diff --git a/transcriptionservice/broker/celeryapp.py b/transcriptionservice/broker/celeryapp.py
@@ -25,8 +25,7 @@
     {
         "task_routes": {
             "transcription_task": {"queue": "{}_requests".format(service_name)},
-            # Not Implemented
-            # "transcription_task_multi": {"queue": "{}_requests".format(service_name)},
+            # Future: "transcription_task_multi": {"queue": "{}_requests".format(service_name)},
         }
     }
 )
diff --git a/transcriptionservice/document/swagger.yaml b/transcriptionservice/document/swagger.yaml
@@ -293,19 +293,36 @@ components:
     transcriptionConfig:
       type: object
       properties:
-        punctuationConfig:
+        language:
+          type: string
+          default: null
+        vadConfig:
           type: object
-          $ref: '#/components/schemas/punctuationConfig'
+          $ref: '#/components/schemas/vadConfig'
         diarizationConfig:
           type: object
           $ref: '#/components/schemas/diarizationConfig'
-    multiTranscriptionConfig:
-      type: object
-      properties:
         punctuationConfig:
           type: object
           $ref: '#/components/schemas/punctuationConfig'
 
+    vadConfig:
+      type: object
+      properties:
+        enableVad:
+          type: boolean
+          default: false
+        methodName:
+          type: string
+          default: "WebRTC"
+        minDuration:
+          type: number
+          default: 0
+        maxDuration:
+          type: number
+          default: 1200
+
+
     diarizationConfig:
       type: object
       properties:

diff --git a/transcriptionservice/server/formating/formatresult.py b/transcriptionservice/server/formating/formatresult.py
@@ -31,6 +31,16 @@ def formatResult(
     """
 
     language = os.environ.get("LANGUAGE", "")
+    # Get the detected language if any
+    if (not language or language == "*") and result.get("segments"):
+        detected_language = result["segments"][0].get("language")
+        if detected_language and detected_language != "*" and not language.startswith(detected_language):
+            language = detected_language
+
+        # If STT is capable of language detection, it is probably Whisper STT, which also returns numbers...
+        # This is an ugly hack to avoid converting numbers to digits for Whisper STT (Tom said it was complicated to send convert_numbers=False...)
+        convert_numbers = False
+
     if convert_numbers:
         fulltext_cleaner = lambda text: textToNum(cleanText(text, language, user_sub), language)
     else:
@@ -57,17 +67,15 @@ def formatResult(
         return final_result
 
     elif return_format == "text/vtt":
-        # TODO: pass "fulltext_cleaner" instead of "convert_numbers"
         t_result = TranscriptionResult.fromDict(result)
         return Subtitles(t_result, language).toVTT(
-            return_raw=raw_return, convert_numbers=convert_numbers, user_sub=user_sub
+            return_raw=raw_return, text_cleaner=fulltext_cleaner, user_sub=user_sub
         )
 
     elif return_format == "text/srt":
-        # TODO: pass "fulltext_cleaner" instead of "convert_numbers"
         t_result = TranscriptionResult.fromDict(result)
         return Subtitles(t_result, language).toSRT(
-            return_raw=raw_return, convert_numbers=convert_numbers, user_sub=user_sub
+            return_raw=raw_return, text_cleaner=fulltext_cleaner, user_sub=user_sub
         )
 
     else:

diff --git a/transcriptionservice/server/formating/normalization.py b/transcriptionservice/server/formating/normalization.py
@@ -42,8 +42,8 @@ def _alpha2digit(text: str, language: str) -> str:
         text_small = text
         if len(text) > 200:
             text_small = text[:100] + "..." + text[-100:]
-        logger.error(f"Error converting '{text_small}' to digits: {err}")
-        raise RuntimeError(f"Error converting '{text}' to digits with {language=}") from err
+        logger.warning(f"Error converting '{text_small}' to digits with {language=} ({err})")
+        return text
 
 def cleanText(text: str, language: str, user_sub: list) -> str:
 

diff --git a/transcriptionservice/server/formating/subtitling.py b/transcriptionservice/server/formating/subtitling.py
@@ -19,18 +19,18 @@ def __init__(self, words: List[Tuple[Word, str]], language: str = ""):
         self.end = max([w.end for w in self.words])
 
     def formatUtterance(
-        self, utterance: str, convert_numbers: bool, user_sub: List[Tuple[str, str]]
+        self, utterance: str, text_cleaner, user_sub: List[Tuple[str, str]]
     ) -> str:
-        if convert_numbers:
-            utterance = textToNum(utterance, self.language)
+        if text_cleaner:
+            utterance = text_cleaner(utterance)
         return cleanText(utterance, self.language, user_sub)
 
     def toSRT(
         self,
         index_start: int = 0,
         max_char_line: int = 40,
         return_raw: bool = False,
-        convert_numbers: bool = False,
+        text_cleaner = lambda x: x,
         user_sub: List[Tuple[str, str]] = [],
         max_lines: int = 2,
         display_spk: bool = False,
@@ -50,7 +50,7 @@ def toSRT(
                 c_l += 1
                 c_w = 0
                 if c_l >= max_lines:
-                    final_item = self.formatUtterance(current_item, convert_numbers, user_sub)
+                    final_item = self.formatUtterance(current_item, text_cleaner, user_sub)
                     output += "{}\n{} --> {}\n{}\n\n".format(
                         index_start + i + 1,
                         self.timeStampSRT(words[0].start),
@@ -65,7 +65,7 @@ def toSRT(
             c_w += len(word.word if return_raw else final_word)
             finals.append(word.word if return_raw else final_word)
         current_item += "{}\n".format(" ".join(finals))
-        final_item = self.formatUtterance(current_item, convert_numbers, user_sub)
+        final_item = self.formatUtterance(current_item, text_cleaner, user_sub)
         output += "{}\n{} --> {}\n{}\n\n".format(
             index_start + i + 1,
             self.timeStampSRT(words[0].start),
@@ -77,7 +77,7 @@ def toSRT(
     def toVTT(
         self,
         return_raw: bool = False,
-        convert_numbers: bool = False,
+        text_cleaner = None,
         user_sub: List[Tuple[str, str]] = [],
         max_char_line: int = 40,
         max_line: int = 2,
@@ -95,7 +95,7 @@ def toVTT(
                         self.timeStampVTT(words[-1].end),
                     )
                     final_item = self.formatUtterance(
-                        "{}\n\n".format(" ".join(finals)), convert_numbers, user_sub
+                        "{}\n\n".format(" ".join(finals)), text_cleaner, user_sub
                     )
                     output += final_item
                     words = []
@@ -108,7 +108,7 @@ def toVTT(
                 self.timeStampVTT(words[0].start), self.timeStampVTT(words[-1].end)
             )
             final_item = self.formatUtterance(
-                "{}\n\n".format(" ".join(finals)), convert_numbers, user_sub
+                "{}\n\n".format(" ".join(finals)), text_cleaner, user_sub
             )
             output += final_item
             return output
@@ -118,11 +118,11 @@ def toVTT(
         if return_raw:
             output += "{}\n\n".format(
                 self.formatUtterance(
-                    " ".join([w.word for w in self.words]), convert_numbers, user_sub
+                    " ".join([w.word for w in self.words]), text_cleaner, user_sub
                 )
             )
         else:
-            output += "{}\n\n".format(self.formatUtterance(str(self), convert_numbers, user_sub))
+            output += "{}\n\n".format(self.formatUtterance(str(self), text_cleaner, user_sub))
         return output
 
     def timeStampSRT(self, t_str) -> str:
@@ -186,7 +186,7 @@ def segmentsToSubtitleItems(
     def toSRT(
         self,
         return_raw: bool = False,
-        convert_numbers: bool = False,
+        text_cleaner = None,
         user_sub: List[Tuple[str, str]] = [],
     ) -> str:
         output = ""
@@ -195,7 +195,7 @@ def toSRT(
             r, n = item.toSRT(
                 i,
                 return_raw=return_raw,
-                convert_numbers=convert_numbers,
+                text_cleaner=text_cleaner,
                 user_sub=user_sub,
             )
             output += r
@@ -205,14 +205,14 @@ def toSRT(
     def toVTT(
         self,
         return_raw: bool = False,
-        convert_numbers: bool = False,
+        text_cleaner = None,
         user_sub: List[Tuple[str, str]] = [],
     ) -> str:
         output = "WEBVTT Kind: captions; Language: {}\n\n".format(self.language)
         for item in self.subtitleItems:
             output += item.toVTT(
                 return_raw=return_raw,
-                convert_numbers=convert_numbers,
+                text_cleaner=text_cleaner,
                 user_sub=user_sub,
             )
         return output