Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add language input option / detected language in output / speaker identification #22

Merged
merged 11 commits into from
Dec 11, 2024
Merged
210 changes: 118 additions & 92 deletions README.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# 1.3.0
- Add input option "language" that can be passed at each request
- Add the detected language (or the given language) to the output, for each segment
- Add speaker identification ("speakerIdentification" option in "diarizationConfig")

# 1.2.12
- Do not fail when asking to convert numbers with env. variable LANGUAGE=*

Expand Down
3 changes: 1 addition & 2 deletions transcriptionservice/broker/celeryapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@
{
"task_routes": {
"transcription_task": {"queue": "{}_requests".format(service_name)},
# Not Implemented
# "transcription_task_multi": {"queue": "{}_requests".format(service_name)},
# Future: "transcription_task_multi": {"queue": "{}_requests".format(service_name)},
}
}
)
27 changes: 22 additions & 5 deletions transcriptionservice/document/swagger.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -293,19 +293,36 @@ components:
transcriptionConfig:
type: object
properties:
punctuationConfig:
language:
type: string
default: null
vadConfig:
type: object
$ref: '#/components/schemas/punctuationConfig'
$ref: '#/components/schemas/vadConfig'
diarizationConfig:
type: object
$ref: '#/components/schemas/diarizationConfig'
multiTranscriptionConfig:
type: object
properties:
punctuationConfig:
type: object
$ref: '#/components/schemas/punctuationConfig'

vadConfig:
type: object
properties:
enableVad:
type: boolean
default: false
methodName:
type: string
default: "WebRTC"
minDuration:
type: number
default: 0
maxDuration:
type: number
default: 1200


diarizationConfig:
type: object
properties:
Expand Down
16 changes: 12 additions & 4 deletions transcriptionservice/server/formating/formatresult.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ def formatResult(
"""

language = os.environ.get("LANGUAGE", "")
# Get the detected language if any
if (not language or language == "*") and result.get("segments"):
detected_language = result["segments"][0].get("language")
if detected_language and detected_language != "*" and not language.startswith(detected_language):
language = detected_language

# If STT is capable of language detection, it is probably Whisper STT, which also returns numbers...
# This is an ugly hack to avoid converting numbers to digits for Whisper STT (Tom said it was complicated to send convert_numbers=False...)
convert_numbers = False

if convert_numbers:
fulltext_cleaner = lambda text: textToNum(cleanText(text, language, user_sub), language)
else:
Expand All @@ -57,17 +67,15 @@ def formatResult(
return final_result

elif return_format == "text/vtt":
# TODO: pass "fulltext_cleaner" instead of "convert_numbers"
t_result = TranscriptionResult.fromDict(result)
return Subtitles(t_result, language).toVTT(
return_raw=raw_return, convert_numbers=convert_numbers, user_sub=user_sub
return_raw=raw_return, text_cleaner=fulltext_cleaner, user_sub=user_sub
)

elif return_format == "text/srt":
# TODO: pass "fulltext_cleaner" instead of "convert_numbers"
t_result = TranscriptionResult.fromDict(result)
return Subtitles(t_result, language).toSRT(
return_raw=raw_return, convert_numbers=convert_numbers, user_sub=user_sub
return_raw=raw_return, text_cleaner=fulltext_cleaner, user_sub=user_sub
)

else:
Expand Down
4 changes: 2 additions & 2 deletions transcriptionservice/server/formating/normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ def _alpha2digit(text: str, language: str) -> str:
text_small = text
if len(text) > 200:
text_small = text[:100] + "..." + text[-100:]
logger.error(f"Error converting '{text_small}' to digits: {err}")
raise RuntimeError(f"Error converting '{text}' to digits with {language=}") from err
logger.warning(f"Error converting '{text_small}' to digits with {language=} ({err})")
return text

def cleanText(text: str, language: str, user_sub: list) -> str:

Expand Down
30 changes: 15 additions & 15 deletions transcriptionservice/server/formating/subtitling.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,18 @@ def __init__(self, words: List[Tuple[Word, str]], language: str = ""):
self.end = max([w.end for w in self.words])

def formatUtterance(
self, utterance: str, convert_numbers: bool, user_sub: List[Tuple[str, str]]
self, utterance: str, text_cleaner, user_sub: List[Tuple[str, str]]
) -> str:
if convert_numbers:
utterance = textToNum(utterance, self.language)
if text_cleaner:
utterance = text_cleaner(utterance)
return cleanText(utterance, self.language, user_sub)

def toSRT(
self,
index_start: int = 0,
max_char_line: int = 40,
return_raw: bool = False,
convert_numbers: bool = False,
text_cleaner = lambda x: x,
user_sub: List[Tuple[str, str]] = [],
max_lines: int = 2,
display_spk: bool = False,
Expand All @@ -50,7 +50,7 @@ def toSRT(
c_l += 1
c_w = 0
if c_l >= max_lines:
final_item = self.formatUtterance(current_item, convert_numbers, user_sub)
final_item = self.formatUtterance(current_item, text_cleaner, user_sub)
output += "{}\n{} --> {}\n{}\n\n".format(
index_start + i + 1,
self.timeStampSRT(words[0].start),
Expand All @@ -65,7 +65,7 @@ def toSRT(
c_w += len(word.word if return_raw else final_word)
finals.append(word.word if return_raw else final_word)
current_item += "{}\n".format(" ".join(finals))
final_item = self.formatUtterance(current_item, convert_numbers, user_sub)
final_item = self.formatUtterance(current_item, text_cleaner, user_sub)
output += "{}\n{} --> {}\n{}\n\n".format(
index_start + i + 1,
self.timeStampSRT(words[0].start),
Expand All @@ -77,7 +77,7 @@ def toSRT(
def toVTT(
self,
return_raw: bool = False,
convert_numbers: bool = False,
text_cleaner = None,
user_sub: List[Tuple[str, str]] = [],
max_char_line: int = 40,
max_line: int = 2,
Expand All @@ -95,7 +95,7 @@ def toVTT(
self.timeStampVTT(words[-1].end),
)
final_item = self.formatUtterance(
"{}\n\n".format(" ".join(finals)), convert_numbers, user_sub
"{}\n\n".format(" ".join(finals)), text_cleaner, user_sub
)
output += final_item
words = []
Expand All @@ -108,7 +108,7 @@ def toVTT(
self.timeStampVTT(words[0].start), self.timeStampVTT(words[-1].end)
)
final_item = self.formatUtterance(
"{}\n\n".format(" ".join(finals)), convert_numbers, user_sub
"{}\n\n".format(" ".join(finals)), text_cleaner, user_sub
)
output += final_item
return output
Expand All @@ -118,11 +118,11 @@ def toVTT(
if return_raw:
output += "{}\n\n".format(
self.formatUtterance(
" ".join([w.word for w in self.words]), convert_numbers, user_sub
" ".join([w.word for w in self.words]), text_cleaner, user_sub
)
)
else:
output += "{}\n\n".format(self.formatUtterance(str(self), convert_numbers, user_sub))
output += "{}\n\n".format(self.formatUtterance(str(self), text_cleaner, user_sub))
return output

def timeStampSRT(self, t_str) -> str:
Expand Down Expand Up @@ -186,7 +186,7 @@ def segmentsToSubtitleItems(
def toSRT(
self,
return_raw: bool = False,
convert_numbers: bool = False,
text_cleaner = None,
user_sub: List[Tuple[str, str]] = [],
) -> str:
output = ""
Expand All @@ -195,7 +195,7 @@ def toSRT(
r, n = item.toSRT(
i,
return_raw=return_raw,
convert_numbers=convert_numbers,
text_cleaner=text_cleaner,
user_sub=user_sub,
)
output += r
Expand All @@ -205,14 +205,14 @@ def toSRT(
def toVTT(
self,
return_raw: bool = False,
convert_numbers: bool = False,
text_cleaner = None,
user_sub: List[Tuple[str, str]] = [],
) -> str:
output = "WEBVTT Kind: captions; Language: {}\n\n".format(self.language)
for item in self.subtitleItems:
output += item.toVTT(
return_raw=return_raw,
convert_numbers=convert_numbers,
text_cleaner=text_cleaner,
user_sub=user_sub,
)
return output
Loading