-
Notifications
You must be signed in to change notification settings - Fork 2
/
wordlist2entriesusingcorpus.py
138 lines (122 loc) · 5.04 KB
/
wordlist2entriesusingcorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# wordlist2entriesusingcorpus.py
copyright = """Copyright © 2017, Kimmo Koskenniemi
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at
your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import hfst, sys, argparse, pprint
# Pretty-printer instance; it is not referenced anywhere else in the visible code.
pp = pprint.PrettyPrinter(indent=1, compact=True)

# Command-line interface.  The first positional argument of ArgumentParser is
# the program name shown in the usage line, so it must name this script.
argparser = argparse.ArgumentParser(
    "python3 wordlist2entriesusingcorpus.py",
    description="Produces lexicon entries using also corpus but without interaction")
argparser.add_argument(
    "guesser",
    help="a guesser fst produced by entry-pattern.py")
argparser.add_argument(
    "corpguesses",
    help="fst composed out of a word list fst and a guesser fst")
argparser.add_argument(
    "-u", "--unique", type=int, default=0,
    help="accept an entry which has at least UNIQUE forms in corpus"
    " and the set is not a subset of any other entry")
# NOTE(review): --reject is parsed but args.reject is not consulted anywhere
# in the visible code -- confirm whether the rejection step is still wanted.
argparser.add_argument(
    "-r", "--reject", default=1000000, type=int,
    help="reject candidates with penalty worse than the best by REJECT or more")
argparser.add_argument(
    "-v", "--verbosity", type=int, default=0,
    help="level of diagnostic output")
args = argparser.parse_args()
# Read the guesser transducer named on the command line.  The stream is
# closed in a 'finally' so it is released even when reading fails.
guesser_fil = hfst.HfstInputStream(args.guesser)
try:
    guesser_fst = guesser_fil.read()
finally:
    guesser_fil.close()
# The transducer is inverted before use: every lookup() on it in this script
# is given a word form and the results are treated as (entry, weight) pairs.
guesser_fst.invert()
guesser_fst.minimize()
guesser_fst.lookup_optimize()
def unique_entry(word_form_set, word_form):
    """Return the entries that accept every form in word_form_set and word_form.

    Each form is looked up in the module-level 'guesser_fst'; the result is
    the intersection of the entry sets obtained for the individual forms.
    An empty set means that no single entry accepts all of the forms.
    """
    candidates = None  # None until the first form has been looked up
    for form in word_form_set | {word_form}:
        accepted = {ent for ent, _weight in guesser_fst.lookup(form, output="tuple")}
        candidates = accepted if candidates is None else candidates & accepted
        if not candidates:
            # the intersection can only shrink, so further lookups are pointless
            break
    return candidates
# Read the corpus transducer named on the command line (per the option help,
# a word list fst composed with a guesser fst).  The stream is closed in a
# 'finally' so it is released even when reading fails.
corp_fil = hfst.HfstInputStream(args.corpguesses)
try:
    corp_fst = corp_fil.read()
finally:
    corp_fil.close()
# Used by check_corp(): lookup() is given an entry and the results are
# treated as (word form, weight) pairs.
corp_fst.minimize()
corp_fst.lookup_optimize()
def check_corp(entry):
    """Return the set of corpus word forms which 'entry' would accept.

    The entry is looked up in the module-level 'corp_fst'; the weights that
    come with the results are discarded.
    """
    return {form for form, _weight in corp_fst.lookup(entry, output="tuple")}
def nextline():
    """Return the next line of standard input without surrounding whitespace.

    A blank input line yields an empty string.  When standard input is
    exhausted the program is terminated by raising SystemExit.
    """
    linenl = sys.stdin.readline()
    if not linenl:
        # readline() returns '' only at end of input.  sys.exit() is used
        # instead of the site-provided exit(), which is intended for the
        # interactive shell and is missing when the site module is not loaded.
        sys.exit()
    return linenl.strip()
def comparer(rec):
    """Return the sort key of one candidate record.

    'rec' is a tuple (entry, word_form_set, weight, unique).  Sorting in
    ascending key order puts first the records with the most word forms,
    then those with the smaller penalty weight, and finally those flagged
    unique before those that are not.

    The key is a tuple of numbers rather than a formatted string: a string
    key mis-orders records with more than 100 word forms (the count turns
    negative) and puts larger penalties ahead of smaller ones.
    """
    (_entry, forms, weight, is_unique) = rec
    return (-len(forms), weight, 0 if is_unique else 1)
if args.verbosity >= 1:
    print("\nENTER FORMS OF A WORD:\n")

# Main loop: one word form per input line.  nextline() ends the program at
# end of input, which is the only way out of this loop.
while True:
    word_form = nextline()
    entry_weight_tuple_lst = guesser_fst.lookup(word_form, output="tuple")
    if not entry_weight_tuple_lst:
        # the guesser proposes no entry at all for this word form
        print("!!!", word_form)
        continue

    # For every entry that would accept 'word_form', record its weight and
    # the corpus word forms that the entry accepts.  If an entry occurs more
    # than once in the lookup result, the last weight is the one kept.
    weights = {}
    word_form_sets = {}
    for (entry, weight) in entry_weight_tuple_lst:
        weights[entry] = weight
        word_form_sets[entry] = check_corp(entry)

    # Each record is (entry, corpus forms, weight, unique) where 'unique' is
    # True when the entry is the only one accepting all of its corpus forms
    # together with 'word_form'.  Iterating the dict instead of a set keeps
    # the candidates in lookup order, so ties in the sort below are resolved
    # the same way on every run.
    entry_list = [
        (entry,
         word_form_sets[entry],
         weights[entry],
         len(unique_entry(word_form_sets[entry], word_form)) == 1)
        for entry in weights
    ]
    # Ordering of the candidates is delegated entirely to comparer()
    entry_lst = sorted(entry_list, key=comparer)

    for i, (entry, word_form_set, weight, uniq) in enumerate(entry_lst, start=1):
        # sorted so that the printed list of forms is reproducible
        forms = " ".join(sorted(word_form_set))
        if i == 1 and uniq and len(word_form_set) >= args.unique:
            # the top candidate is unique and well enough attested:
            # accept it and drop all the other candidates
            print("{} ! {} = [{}]".format(entry, word_form, forms))
            break
        elif len(entry_lst) == 1:
            # a single candidate, printed without a leading '!'
            print("{} ! {} : {} = [{}]".format(entry, weight, word_form, forms))
        elif len(word_form_set) >= args.unique:
            print("! {} ! {} : {} [{}]".format(word_form, entry, weight, forms))
        else:
            print("!! {} ! {} : {} [{}]".format(word_form, entry, weight, forms))