forked from voidf/bertalign
-
Notifications
You must be signed in to change notification settings - Fork 1
/
g.py
40 lines (31 loc) · 1.18 KB
/
g.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import datetime
import os
from datasets import load_dataset
from bertalign import Bertalign
# Source-language codes; each one is aligned against the English ('en') text
# of a row and is also used as the key to look that text up in the row.
langs = ["zh", "fr", "es", "ru"]
def main(row, id):
    """Align every non-English text of one dataset row with its English text.

    For each language code in the module-level ``langs`` list, ``row[src]``
    is aligned against ``row['en']`` with ``Bertalign`` and the aligned
    segments are written to ``aligned/{src}_en/{id}.txt``, each followed by
    a line of ten ``=`` characters. A language pair whose output file already
    exists is skipped (a ``skip`` message is printed), so an interrupted run
    can be resumed.

    Args:
        row: Mapping that holds an ``'en'`` entry and one entry per code in
            ``langs``; the values must support ``str.replace``.
        id: Value used as the base name of the output file (the row index
            when invoked through ``dataset.map(..., with_indices=True)``).
            The name shadows the builtin ``id`` but is kept so existing
            callers keep working.

    Returns:
        None.
    """
    dst = 'en'
    # Drop the '----' separator lines so only the text itself is aligned.
    dst_text = row[dst].replace('\n----\n', '\n')
    separator = '=' * 10 + '\n'

    for src in langs:
        src_text = row[src].replace('\n----\n', '\n')
        output_dir = f'aligned/{src}_{dst}/'
        # makedirs also creates the missing 'aligned/' parent and does not
        # fail when the directory already exists (no check-then-create race).
        os.makedirs(output_dir, exist_ok=True)

        output_filename = f'{output_dir}{id}.txt'
        if os.path.exists(output_filename):
            print('skip', output_filename)
            continue

        aligner = Bertalign(src_text, dst_text, src_lang=src, tgt_lang=dst)
        aligner.align_sents()

        # Write to a temporary sibling first and publish it with an atomic
        # rename: the existence of the final file is what marks a pair as
        # done, so a crash mid-write must not leave a truncated final file
        # that later runs would silently skip.
        tmp_filename = f'{output_filename}.tmp'
        with open(tmp_filename, 'w', encoding='utf-8') as f:
            for aligned in aligner.yield_sents():
                f.write(aligned + separator)
        os.replace(tmp_filename, output_filename)
if __name__ == "__main__":
    # Script entry point: align every row of the 'randomTest' split and
    # report how long the whole run (dataset loading included) took.
    started_at = datetime.datetime.now()

    rows = load_dataset("ranWang/UN_PDF_TEXT_DATA", split='randomTest')
    # with_indices=True makes ``map`` pass the row index as the second
    # positional argument, which ``main`` receives as ``id``.
    rows.map(main, with_indices=True)

    finished_at = datetime.datetime.now()
    print('Time elapsed:', finished_at - started_at)