Skip to content

Commit

Permalink
Factor out text cleaning for subtitle formats (should fix things)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeronymous committed Nov 30, 2024
1 parent 5bfdbea commit 42acc06
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 19 deletions.
6 changes: 2 additions & 4 deletions transcriptionservice/server/formating/formatresult.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,15 @@ def formatResult(
return final_result

elif return_format == "text/vtt":
# TODO: pass "fulltext_cleaner" instead of "convert_numbers"
t_result = TranscriptionResult.fromDict(result)
return Subtitles(t_result, language).toVTT(
return_raw=raw_return, convert_numbers=convert_numbers, user_sub=user_sub
return_raw=raw_return, text_cleaner=fulltext_cleaner, user_sub=user_sub
)

elif return_format == "text/srt":
# TODO: pass "fulltext_cleaner" instead of "convert_numbers"
t_result = TranscriptionResult.fromDict(result)
return Subtitles(t_result, language).toSRT(
return_raw=raw_return, convert_numbers=convert_numbers, user_sub=user_sub
return_raw=raw_return, text_cleaner=fulltext_cleaner, user_sub=user_sub
)

else:
Expand Down
30 changes: 15 additions & 15 deletions transcriptionservice/server/formating/subtitling.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,18 @@ def __init__(self, words: List[Tuple[Word, str]], language: str = ""):
self.end = max([w.end for w in self.words])

def formatUtterance(
self, utterance: str, convert_numbers: bool, user_sub: List[Tuple[str, str]]
self, utterance: str, text_cleaner, user_sub: List[Tuple[str, str]]
) -> str:
if convert_numbers:
utterance = textToNum(utterance, self.language)
if text_cleaner:
utterance = text_cleaner(utterance)
return cleanText(utterance, self.language, user_sub)

def toSRT(
self,
index_start: int = 0,
max_char_line: int = 40,
return_raw: bool = False,
convert_numbers: bool = False,
text_cleaner = lambda x: x,
user_sub: List[Tuple[str, str]] = [],
max_lines: int = 2,
display_spk: bool = False,
Expand All @@ -50,7 +50,7 @@ def toSRT(
c_l += 1
c_w = 0
if c_l >= max_lines:
final_item = self.formatUtterance(current_item, convert_numbers, user_sub)
final_item = self.formatUtterance(current_item, text_cleaner, user_sub)
output += "{}\n{} --> {}\n{}\n\n".format(
index_start + i + 1,
self.timeStampSRT(words[0].start),
Expand All @@ -65,7 +65,7 @@ def toSRT(
c_w += len(word.word if return_raw else final_word)
finals.append(word.word if return_raw else final_word)
current_item += "{}\n".format(" ".join(finals))
final_item = self.formatUtterance(current_item, convert_numbers, user_sub)
final_item = self.formatUtterance(current_item, text_cleaner, user_sub)
output += "{}\n{} --> {}\n{}\n\n".format(
index_start + i + 1,
self.timeStampSRT(words[0].start),
Expand All @@ -77,7 +77,7 @@ def toSRT(
def toVTT(
self,
return_raw: bool = False,
convert_numbers: bool = False,
text_cleaner = None,
user_sub: List[Tuple[str, str]] = [],
max_char_line: int = 40,
max_line: int = 2,
Expand All @@ -95,7 +95,7 @@ def toVTT(
self.timeStampVTT(words[-1].end),
)
final_item = self.formatUtterance(
"{}\n\n".format(" ".join(finals)), convert_numbers, user_sub
"{}\n\n".format(" ".join(finals)), text_cleaner, user_sub
)
output += final_item
words = []
Expand All @@ -108,7 +108,7 @@ def toVTT(
self.timeStampVTT(words[0].start), self.timeStampVTT(words[-1].end)
)
final_item = self.formatUtterance(
"{}\n\n".format(" ".join(finals)), convert_numbers, user_sub
"{}\n\n".format(" ".join(finals)), text_cleaner, user_sub
)
output += final_item
return output
Expand All @@ -118,11 +118,11 @@ def toVTT(
if return_raw:
output += "{}\n\n".format(
self.formatUtterance(
" ".join([w.word for w in self.words]), convert_numbers, user_sub
" ".join([w.word for w in self.words]), text_cleaner, user_sub
)
)
else:
output += "{}\n\n".format(self.formatUtterance(str(self), convert_numbers, user_sub))
output += "{}\n\n".format(self.formatUtterance(str(self), text_cleaner, user_sub))
return output

def timeStampSRT(self, t_str) -> str:
Expand Down Expand Up @@ -186,7 +186,7 @@ def segmentsToSubtitleItems(
def toSRT(
self,
return_raw: bool = False,
convert_numbers: bool = False,
text_cleaner = None,
user_sub: List[Tuple[str, str]] = [],
) -> str:
output = ""
Expand All @@ -195,7 +195,7 @@ def toSRT(
r, n = item.toSRT(
i,
return_raw=return_raw,
convert_numbers=convert_numbers,
text_cleaner=text_cleaner,
user_sub=user_sub,
)
output += r
Expand All @@ -205,14 +205,14 @@ def toSRT(
def toVTT(
self,
return_raw: bool = False,
convert_numbers: bool = False,
text_cleaner = None,
user_sub: List[Tuple[str, str]] = [],
) -> str:
output = "WEBVTT Kind: captions; Language: {}\n\n".format(self.language)
for item in self.subtitleItems:
output += item.toVTT(
return_raw=return_raw,
convert_numbers=convert_numbers,
text_cleaner=text_cleaner,
user_sub=user_sub,
)
return output

0 comments on commit 42acc06

Please sign in to comment.