-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_alignment_features.py
executable file
·55 lines (44 loc) · 2.2 KB
/
extract_alignment_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import argparse
from collections import Counter
from utils import read_file
import pickle
def _top_counts(fertility: Counter, k: int = 3) -> list:
    """Return the *k* largest counts in *fertility*, zero-padded to length *k*."""
    top = [count for _, count in fertility.most_common(k)]
    return top + [0] * (k - len(top))


def get_features(jointfname: str, fwdfname: str, revname: str, outfname: str) -> None:
    """
    Write one line of alignment-derived features per sentence pair.

    The three input files are read in lockstep, one line per sentence pair
    (reading stops at the end of the shortest file):

    * ``jointfname``: text lines of the form ``source ||| target``.
    * ``fwdfname`` / ``revname``: whitespace-separated ``i-j`` integer links.

    For every sentence pair, eight space-separated integers are written to
    ``outfname``:

    1. number of target tokens minus the number of distinct first indices
       (``i``) occurring in the forward links;
    2. number of source tokens minus the number of distinct second indices
       (``j``) occurring in the reverse links;
    3-5. the three highest per-index link counts over the forward links'
       first indices, zero-padded when fewer than three indices occur;
    6-8. the same for the reverse links' second indices.

    Raises:
        IndexError: if a joint line does not contain the `` ||| `` separator.
        ValueError: if a link is not of the form ``int-int``.
    """
    with open(fwdfname, 'rb') as fwdf, open(revname, 'rb') as revf, \
            open(jointfname, 'r', encoding='utf-8') as f, \
            open(outfname, 'w', encoding='utf-8') as outf:
        for line, fwd_line, rev_line in zip(f, fwdf, revf):
            fields = line.strip().split(' ||| ')
            src_sent = fields[0].split()
            trg_sent = fields[1].split()

            # The alignment files are opened in binary mode, hence the bytes
            # separator; int() accepts ASCII digit bytes directly.
            fwd_links = [tuple(map(int, s.split(b'-')))
                         for s in fwd_line.split()]
            rev_links = [tuple(map(int, s.split(b'-')))
                         for s in rev_line.split()]

            fwd_fert = Counter(i for i, j in fwd_links)
            rev_fert = Counter(j for i, j in rev_links)

            # NOTE(review): each count of distinct aligned indices is
            # subtracted from the length of the *other* side's sentence.
            # Kept exactly as in the original -- confirm this is intended.
            features = [
                len(trg_sent) - len(fwd_fert),
                len(src_sent) - len(rev_fert),
            ]

            # Each direction gets its own freshly zero-padded top-3 list.
            # Previously one buffer was reused, so forward counts leaked
            # into the reverse features whenever the reverse alignment had
            # fewer than three distinct indices.
            features.extend(_top_counts(fwd_fert))
            features.extend(_top_counts(rev_fert))

            print(" ".join(map(str, features)), file=outf)
if __name__ == "__main__":
    # Command-line entry point: parse the four required paths and run the
    # feature extraction.
    # The help text is passed as `description`; the first positional
    # parameter of ArgumentParser is `prog`, which would make the whole
    # sentence appear as the program name in usage and error messages.
    parser = argparse.ArgumentParser(description="This extracts alignment features")
    parser.add_argument("-i", "--joint", help="Path of the joint file", required=True)
    parser.add_argument("-f", "--fwd", help="fwd alignment generated by eflomal", required=True)
    parser.add_argument("-r", "--rev", help="rev alignment generated by eflomal", required=True)
    parser.add_argument("-o", "--out", help="output feature fname", required=True)
    args = parser.parse_args()
    get_features(args.joint, args.fwd, args.rev, args.out)