#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Tool to extract a subset of the mined bitexts from a tsv.gz file
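#
# Example invocation (a sketch; the file names and language pair below are
# illustrative placeholders, not values required by this script):
#
#   python3 extract.py \
#       --tsv WikiMatrix.en-fr.tsv.gz \
#       --bitext WikiMatrix.en-fr.txt \
#       --src-lang en --trg-lang fr \
#       --threshold 1.04
#
# This would write the accepted pairs to WikiMatrix.en-fr.txt.en and
# WikiMatrix.en-fr.txt.fr, one sentence per line, keeping only pairs whose
# margin score is at least 1.04.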
import gzip
import argparse
###############################################################################
#
# Main
#
###############################################################################
parser = argparse.ArgumentParser(description='Tool to extract bitext from the WikiMatrix')
parser.add_argument('--encoding', default='utf-8',
                    help='Character encoding for input/output')
parser.add_argument('--tsv', type=str, required=True,
                    help='File with mined bitexts')
parser.add_argument('--bitext', type=str, required=True,
                    help='Text file after sentence splitting')
parser.add_argument('--src-lang', type=str, required=True,
                    help='Source language')
parser.add_argument('--trg-lang', type=str, required=True,
                    help='Target language')
parser.add_argument('--threshold', type=float, default=1.05,
                    help='Threshold on margin score')
parser.add_argument('--nb-sents', type=int, default=999999999,
                    help='Maximal number of sentences')
parser.add_argument('--nb-words-src', type=int, default=999999999,
                    help='Maximal number of total words in the source language')
parser.add_argument('--nb-words-trg', type=int, default=999999999,
                    help='Maximal number of total words in the target language')
args = parser.parse_args()
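# Note: the 999999999 defaults effectively mean "no limit"; pass smaller
# values for --nb-sents / --nb-words-src / --nb-words-trg to cap the size
# of the extracted corpus.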
print('Tool to extract bitext from the WikiMatrix')
nl = 0
nw_src = 0
nw_trg = 0
fields = None  # stays None if the input file is empty
print('Processing {}'.format(args.tsv))
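# Each line of the gzipped TSV is assumed to hold three tab-separated fields:
# margin score, source sentence, target sentence, sorted by decreasing score
# (as in the released WikiMatrix tsv.gz files); that ordering is why the loop
# below can stop at the first score under the threshold.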
with gzip.open(args.tsv, 'rt', encoding=args.encoding) as tsv:
    with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc:
        with open(args.bitext + '.' + args.trg_lang, 'wt', encoding=args.encoding) as ftrg:
            while nl < args.nb_sents:
                line = tsv.readline()
                if not line:
                    break
                fields = line.split('\t')
                cur_src = len(fields[1].split())
                cur_trg = len(fields[2].split())
                if float(fields[0]) < args.threshold:
                    break
                if nw_src + cur_src > args.nb_words_src:
                    break
                if nw_trg + cur_trg > args.nb_words_trg:
                    break
                fsrc.write(fields[1].strip() + '\n')
                ftrg.write(fields[2].strip() + '\n')
                nw_src += cur_src
                nw_trg += cur_trg
                nl += 1
                if nl % 100000 == 0:
                    print('\r - {:d} lines read'.format(nl), end='')
print('\r - wrote {:d} lines'.format(nl))
print(' - with {:d} source and {:d} target words'.format(nw_src, nw_trg))
if fields is not None:
    print(' - last threshold is {:.4f}'.format(float(fields[0])))