-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_transcripts.py
63 lines (50 loc) · 2.38 KB
/
clean_transcripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import re
import glob
def clean_transcript_w_timestamps(input_file, output_file):
with open(input_file, "r", encoding="utf-8") as file:
lines = file.readlines()
cleaned_lines = []
timestamp_pattern = re.compile(r"^\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}")
caption_text = ""
current_start_timestamp = None
for line in lines:
line = line.strip()
if timestamp_pattern.match(line):
timestamps = line.split(" align:start position:0%")[0].split(" --> ")
current_start_timestamp, current_end_timestamp = timestamps
elif line and "<c>" not in line and not line.startswith(("WEBVTT", "Kind:", "Language:")):
if caption_text and line == caption_text:
# Update the end time of the last timestamp
previous_timestamps[-1] = current_end_timestamp
else:
if caption_text:
cleaned_lines.append(f"{previous_timestamps[0]} --> {previous_timestamps[-1]}\n{caption_text}\n\n")
caption_text = line
previous_timestamps = [current_start_timestamp, current_end_timestamp]
if caption_text:
cleaned_lines.append(f"{previous_timestamps[0]} --> {previous_timestamps[-1]}\n{caption_text}\n\n")
with open(output_file, "w", encoding="utf-8") as file:
file.writelines(cleaned_lines)
def clean_transcript_wo_timestamps(input_file, output_file):
with open(input_file, "r", encoding="utf-8") as file:
lines = file.readlines()
cleaned_lines = []
timestamp_pattern = re.compile(r"^\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}$")
for line in lines:
line = line.strip()
if not line or timestamp_pattern.match(line):
continue
cleaned_lines.append(line)
with open(output_file, "w", encoding="utf-8") as file:
for line in cleaned_lines:
file.write(line + "\n")
if __name__ == "__main__":
for original_transcript in glob.glob("wikihow_val_transcripts/*.vtt"):
clean_transcript_w_timestamps(
original_transcript,
f"transcripts/w_timestamps/{original_transcript.split('/')[-1]}",
)
clean_transcript_wo_timestamps(
f"transcripts/w_timestamps/{original_transcript.split('/')[-1]}",
f"transcripts/wo_timestamps/{original_transcript.split('/')[-1]}",
)