NE_Recognizer.py_cap
from helper import *

# Maximum number of tokens considered when scanning forward for a name.
NAME_WINDOW = 5
class Node:
    ''' A single token in the counting trie. '''
    def __init__(self, tok):
        self.tok = tok
        self.count = 0
        self.children = dict()

    def incr(self):
        self.count += 1

    def get_tok(self):
        return self.tok

    def get_count(self):
        return self.count

    def has_child(self, tok):
        return tok in self.children

    def get_child(self, tok):
        if tok not in self.children:
            self.children[tok] = Node(tok)
        return self.children[tok]

    def get_score(self, tok_seq, conditioned_denom):
        if len(tok_seq) == 0:
            ''' This is the destination node. Return score. '''
            return float(self.count) / float(conditioned_denom)
        child = tok_seq.pop(0)
        if child not in self.children:
            return 0
        else:
            node = self.children[child]
            return node.get_score(tok_seq, float(self.count))

    def printme(self, num_tab=0):
        print('\t\t' * num_tab + self.tok + " - " + str(self.count))
        for c in self.children.values():
            c.printme(num_tab + 1)
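
# Note on Node.get_score: the denominator handed down at each recursive step is
# the current node's count, so the value ultimately returned is
#     count(tok_1 ... tok_n) / count(tok_1 ... tok_{n-1})
# i.e. the frequency of the final token conditioned on the preceding path
# through the trie.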

class CounterTrie:
    ''' Corresponds to a candidate word for the start of a name. '''
    def __init__(self, root_token):
        self.root = Node(root_token)
        self.root.incr()

    def invalid_seq(self, seq):
        ''' Check if the sequence is 0-length or is not rooted at the same root
            as this structure. '''
        return len(seq) == 0 or seq[0] != self.root.get_tok()

    def count_seq(self, tok_seq):
        if self.invalid_seq(tok_seq):
            return
        else:
            curr_node = self.root
            curr_node.incr()
            tok_seq.pop(0)
            for tok in tok_seq:
                '''
                Do not allow long uncapitalized words; they are very unlikely
                to be part of a proper name.
                '''
                if not is_token_cap(tok) and len(tok) > 4:
                    return
                curr_node = curr_node.get_child(tok)
                curr_node.incr()

    def get_seq_score(self, seq):
        if self.invalid_seq(seq) or len(seq) == 1:
            ''' A 1-length sequence should not contribute score. '''
            return 0
        else:
            seq.pop(0)
            score = self.root.get_score(seq, self.root.get_count())
            return score

    def printme(self):
        self.root.printme()
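
# Illustrative scoring example (assuming is_token_cap treats tokens that start
# with an uppercase letter as capitalized):
#
#   trie = CounterTrie("John")                   # root count = 1
#   trie.count_seq(["John", "Smith", "said"])    # root -> 2, Smith -> 1, said -> 1
#   trie.count_seq(["John", "Smith"])            # root -> 3, Smith -> 2
#   trie.count_seq(["John", "Doe"])              # root -> 4, Doe -> 1
#
#   trie.get_seq_score(["John", "Smith"])  # count(Smith) / count(John) = 2/4 = 0.5
#   trie.get_seq_score(["John"])           # 0: 1-length sequences never score
#
# Both count_seq and get_seq_score pop tokens from the list they receive, so
# callers should pass a slice or copy if they still need the original list.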


class NE_Recognizer:
    def __init__(self):
        self.cap_token_stats = dict()
        self.trie_dict = dict()

    def train_document(self, doc):
        def count_token(src_doc, counter):
            counter[0] += 1
            counter[1].add(src_doc)
            return counter
        tok_nopunct = doc.tokens_nopunct
        # Collect statistics for only the capitalized tokens
        for i in range(0, len(tok_nopunct)):
            tok = tok_nopunct[i]
            if not is_token_cap(tok):
                continue
            if tok not in self.trie_dict:
                self.trie_dict[tok] = CounterTrie(tok)
            seq = tok_nopunct[i:min(i + NAME_WINDOW, len(tok_nopunct))]
            self.trie_dict[tok].count_seq(seq)

    def extract_names(self, tok_list):
        ''' Wrapper around extract_names_: split the token list on punctuation
            and merge the per-chunk name scores into a single dictionary. '''
        def merge_into_dict(src, dest):
            for i in src:
                name = i[1][1]
                score = i[1][0]
                if name not in dest:
                    dest[name] = score
                else:
                    dest[name] += score
            return dest
        name_dict = dict()
        punct_blacklist = [',', '(', ')', '.', '!', '?']
        # Split the original token list by punctuation
        tok_chunks = []
        agg = []
        for tok in tok_list:
            if tok[len(tok) - 1] in punct_blacklist:
                agg.append(tok[:-1])
                tok_chunks.append(agg)
                agg = []
            else:
                agg.append(tok)
        # Keep the trailing chunk when the list does not end with punctuation.
        if len(agg) > 0:
            tok_chunks.append(agg)
        for seq in tok_chunks:
            if len(seq) == 0:
                continue
            tok_scores = self.extract_names_(seq)
            name_dict = merge_into_dict(tok_scores, name_dict)
        return name_dict
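
    # Chunking example (illustrative): a token list such as
    #   ["President", "Barack", "Obama,", "in", "Washington."]
    # is split at trailing punctuation into
    #   [["President", "Barack", "Obama"], ["in", "Washington"]]
    # and each chunk is scored independently by extract_names_, so candidate
    # names never span a comma or sentence boundary.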

    def extract_names_(self, tok_list):
        ''' Assumes the token list has had punctuation stripped (see extract_names).
            General extraction rules: a name starts with a capitalized word,
            names do not cross punctuation boundaries [',', '.', '(', ')'],
            and the last word of a name is capitalized.
        '''
        result_set = dict()
        for i in range(0, len(tok_list)):
            if is_token_cap(tok_list[i]):
                tok_seq = tok_list[i:min(i + NAME_WINDOW, len(tok_list))]
                found, score = self.extract_names_from_seq(tok_seq)
                # print("Found:" + str(found) + " score:" + str(score))
                for name in found:
                    t = tuple(name)
                    if t not in result_set:
                        result_set[t] = [score, name]
                    else:
                        prev_score = result_set[t][0]
                        result_set[t] = [prev_score + score, name]
        tok_scoring = sorted(result_set.items(), key=lambda x: x[1][0], reverse=True)
        return tok_scoring
        # Debug output:
        # for i in range(len(tok_scoring)):
        #     print(str(tok_scoring[i][1][1]) + " - " + str(tok_scoring[i][1][0]))

    def extract_names_from_seq(self, tok_seq):
        ''' Extract named entities from a sequence of tokens. '''
        def add_score(score_dict, candidate_name, other_names, total_score):
            # print("Score Dict:" + str(score_dict.items()))
            # print("Other Names:" + str(other_names))
            # print("Candidate name:" + str(candidate_name))
            if other_names is None or len(other_names) == 0:
                name_list = [candidate_name]
            else:
                name_list = [candidate_name]
                name_list.extend(other_names)
                name_list = sorted(name_list)
            # print("Name tuple:" + str(tuple(name_list)))
            # Needs to be hashable.
            name_tuple = tuple(name_list)
            if name_tuple in score_dict:
                score_dict[name_tuple] = max(total_score, score_dict[name_tuple])
            else:
                score_dict[name_tuple] = total_score
            return score_dict

        def cut_trailing_lowercase(seq):
            i = len(seq) - 1
            while i > 0 and not is_token_cap(seq[i]):
                i -= 1
            return seq[:i + 1]

        if len(tok_seq) == 0 or tok_seq[0] not in self.trie_dict:
            ''' Score is 0 if the token sequence is empty or the first token
                does not have a trie structure. '''
            return [], 0
        root = tok_seq[0]
        if len(tok_seq) == 1 and root in self.trie_dict:
            ''' Base case for a 1-length word. '''
            return [root], 0
        scores = dict()
        for i in range(1, len(tok_seq) + 1):
            tokens = cut_trailing_lowercase(tok_seq[:i])
            candidate_name = " ".join(tokens)
            # print("---- Entire Tok Seq:" + str(tok_seq))
            # print("i=" + str(i))
            # print("TokSeq:" + str(tokens))
            # print("CandidateName:" + candidate_name)
            candidate_score = self.trie_dict[root].get_seq_score(tokens)
            # print("CandidateScore:" + str(candidate_score))
            remaining = tok_seq[i:]
            # print("Remaining:" + str(remaining))
            rem_names, rem_score = self.extract_names_from_seq(remaining)
            # print("rem names:" + str(rem_names) + " rem score:" + str(rem_score))
            total_score = candidate_score + float(rem_score)
            # print("What's the candidate name now?:" + candidate_name)
            scores = add_score(scores, candidate_name, rem_names, total_score)
        # print("-------SEQ Ret:" + str(max(scores.items(), key=lambda x: x[1])))
        return max(scores.items(), key=lambda x: x[1])
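

# Minimal usage sketch, assuming helper.is_token_cap treats tokens that begin
# with an uppercase letter as capitalized and that training documents expose a
# tokens_nopunct list, as train_document expects. _Doc is a hypothetical
# stand-in for whatever document type the surrounding pipeline provides.
if __name__ == "__main__":
    class _Doc:
        def __init__(self, tokens_nopunct):
            self.tokens_nopunct = tokens_nopunct

    recognizer = NE_Recognizer()
    # Training: every capitalized token seeds a CounterTrie keyed by that token.
    recognizer.train_document(_Doc(["John", "Smith", "met", "Mary", "Jones"]))
    recognizer.train_document(_Doc(["John", "Smith", "said", "hello"]))

    # Extraction takes raw tokens and splits on trailing punctuation itself.
    names = recognizer.extract_names(["Yesterday", "John", "Smith", "spoke."])
    for name, score in sorted(names.items(), key=lambda kv: kv[1], reverse=True):
        print(name, score)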