NE_Recognizer.py_cap
from helper import *

# Maximum number of tokens considered when scanning forward for a name.
NAME_WINDOW = 5
class Node:
    ''' A single token in the counting trie. '''
    def __init__(self, tok):
        self.tok = tok
        self.count = 0
        self.children = dict()

    def incr(self):
        self.count += 1

    def get_tok(self):
        return self.tok

    def get_count(self):
        return self.count

    def has_child(self, tok):
        return tok in self.children

    def get_child(self, tok):
        if tok not in self.children:
            self.children[tok] = Node(tok)
        return self.children[tok]

    def get_score(self, tok_seq, conditioned_denom):
        if len(tok_seq) == 0:
            ''' This is the destination node. Return score. '''
            return float(self.count) / float(conditioned_denom)
        child = tok_seq.pop(0)
        if child not in self.children:
            return 0
        else:
            node = self.children[child]
            return node.get_score(tok_seq, float(self.count))

    def printme(self, num_tab=0):
        print('\t\t' * num_tab + self.tok + " - " + str(self.count))
        for c in self.children.values():
            c.printme(num_tab + 1)
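
# Note on Node.get_score: the denominator handed down at each recursive step is
# the current node's count, so the value ultimately returned is
#     count(tok_1 ... tok_n) / count(tok_1 ... tok_{n-1})
# i.e. the frequency of the final token conditioned on the preceding path
# through the trie.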

class CounterTrie:
    ''' Corresponds to a candidate word for the start of a name. '''
    def __init__(self, root_token):
        self.root = Node(root_token)
        self.root.incr()

    def invalid_seq(self, seq):
        ''' Check if the sequence is 0-length or is not rooted at the same root
            as this structure. '''
        return len(seq) == 0 or seq[0] != self.root.get_tok()

    def count_seq(self, tok_seq):
        if self.invalid_seq(tok_seq):
            return
        else:
            curr_node = self.root
            curr_node.incr()
            tok_seq.pop(0)
            for tok in tok_seq:
                '''
                Do not allow long uncapitalized words; they are very unlikely
                to be part of a proper name.
                '''
                if not is_token_cap(tok) and len(tok) > 4:
                    return
                curr_node = curr_node.get_child(tok)
                curr_node.incr()

    def get_seq_score(self, seq):
        if self.invalid_seq(seq) or len(seq) == 1:
            ''' A 1-length sequence should not contribute score. '''
            return 0
        else:
            seq.pop(0)
            score = self.root.get_score(seq, self.root.get_count())
            return score

    def printme(self):
        self.root.printme()
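
# Illustrative scoring example (assuming is_token_cap treats tokens that start
# with an uppercase letter as capitalized):
#
#   trie = CounterTrie("John")                   # root count = 1
#   trie.count_seq(["John", "Smith", "said"])    # root -> 2, Smith -> 1, said -> 1
#   trie.count_seq(["John", "Smith"])            # root -> 3, Smith -> 2
#   trie.count_seq(["John", "Doe"])              # root -> 4, Doe -> 1
#
#   trie.get_seq_score(["John", "Smith"])  # count(Smith) / count(John) = 2/4 = 0.5
#   trie.get_seq_score(["John"])           # 0: 1-length sequences never score
#
# Both count_seq and get_seq_score pop tokens from the list they receive, so
# callers should pass a slice or copy if they still need the original list.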


class NE_Recognizer:
    def __init__(self):
        self.cap_token_stats = dict()
        self.trie_dict = dict()

    def train_document(self, doc):
        def count_token(src_doc, counter):
            counter[0] += 1
            counter[1].add(src_doc)
            return counter
        tok_nopunct = doc.tokens_nopunct
        # Collect statistics for only the capitalized tokens
        for i in range(0, len(tok_nopunct)):
            tok = tok_nopunct[i]
            if not is_token_cap(tok):
                continue
            if tok not in self.trie_dict:
                self.trie_dict[tok] = CounterTrie(tok)
            seq = tok_nopunct[i:min(i + NAME_WINDOW, len(tok_nopunct))]
            self.trie_dict[tok].count_seq(seq)

    def extract_names(self, tok_list):
        ''' Wrapper around extract_names_: split the token list on punctuation
            and merge the per-chunk name scores into a single dictionary. '''
        def merge_into_dict(src, dest):
            for i in src:
                name = i[1][1]
                score = i[1][0]
                if name not in dest:
                    dest[name] = score
                else:
                    dest[name] += score
            return dest
        name_dict = dict()
        punct_blacklist = [',', '(', ')', '.', '!', '?']
        # Split the original token list by punctuation
        tok_chunks = []
        agg = []
        for tok in tok_list:
            if tok[len(tok) - 1] in punct_blacklist:
                agg.append(tok[:-1])
                tok_chunks.append(agg)
                agg = []
            else:
                agg.append(tok)
        # Keep the trailing chunk when the list does not end with punctuation.
        if len(agg) > 0:
            tok_chunks.append(agg)
        for seq in tok_chunks:
            if len(seq) == 0:
                continue
            tok_scores = self.extract_names_(seq)
            name_dict = merge_into_dict(tok_scores, name_dict)
        return name_dict
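
    # Chunking example (illustrative): a token list such as
    #   ["President", "Barack", "Obama,", "in", "Washington."]
    # is split at trailing punctuation into
    #   [["President", "Barack", "Obama"], ["in", "Washington"]]
    # and each chunk is scored independently by extract_names_, so candidate
    # names never span a comma or sentence boundary.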

    def extract_names_(self, tok_list):
        ''' Assumes the token list has had punctuation stripped (see extract_names).
            General extraction rules: a name starts with a capitalized word,
            names do not cross punctuation boundaries [',', '.', '(', ')'],
            and the last word of a name is capitalized.
        '''
        result_set = dict()
        for i in range(0, len(tok_list)):
            if is_token_cap(tok_list[i]):
                tok_seq = tok_list[i:min(i + NAME_WINDOW, len(tok_list))]
                found, score = self.extract_names_from_seq(tok_seq)
                # print("Found:" + str(found) + " score:" + str(score))
                for name in found:
                    t = tuple(name)
                    if t not in result_set:
                        result_set[t] = [score, name]
                    else:
                        prev_score = result_set[t][0]
                        result_set[t] = [prev_score + score, name]
        tok_scoring = sorted(result_set.items(), key=lambda x: x[1][0], reverse=True)
        return tok_scoring
        # Debug output:
        # for i in range(len(tok_scoring)):
        #     print(str(tok_scoring[i][1][1]) + " - " + str(tok_scoring[i][1][0]))

    def extract_names_from_seq(self, tok_seq):
        ''' Extract named entities from a sequence of tokens. '''
        def add_score(score_dict, candidate_name, other_names, total_score):
            # print("Score Dict:" + str(score_dict.items()))
            # print("Other Names:" + str(other_names))
            # print("Candidate name:" + str(candidate_name))
            if other_names is None or len(other_names) == 0:
                name_list = [candidate_name]
            else:
                name_list = [candidate_name]
                name_list.extend(other_names)
                name_list = sorted(name_list)
            # print("Name tuple:" + str(tuple(name_list)))
            # Needs to be hashable.
            name_tuple = tuple(name_list)
            if name_tuple in score_dict:
                score_dict[name_tuple] = max(total_score, score_dict[name_tuple])
            else:
                score_dict[name_tuple] = total_score
            return score_dict

        def cut_trailing_lowercase(seq):
            i = len(seq) - 1
            while i > 0 and not is_token_cap(seq[i]):
                i -= 1
            return seq[:i + 1]

        if len(tok_seq) == 0 or tok_seq[0] not in self.trie_dict:
            ''' Score is 0 if the token sequence is empty or the first token
                does not have a trie structure. '''
            return [], 0
        root = tok_seq[0]
        if len(tok_seq) == 1 and root in self.trie_dict:
            ''' Base case for a 1-length word. '''
            return [root], 0
        scores = dict()
        for i in range(1, len(tok_seq) + 1):
            tokens = cut_trailing_lowercase(tok_seq[:i])
            candidate_name = " ".join(tokens)
            # print("---- Entire Tok Seq:" + str(tok_seq))
            # print("i=" + str(i))
            # print("TokSeq:" + str(tokens))
            # print("CandidateName:" + candidate_name)
            candidate_score = self.trie_dict[root].get_seq_score(tokens)
            # print("CandidateScore:" + str(candidate_score))
            remaining = tok_seq[i:]
            # print("Remaining:" + str(remaining))
            rem_names, rem_score = self.extract_names_from_seq(remaining)
            # print("rem names:" + str(rem_names) + " rem score:" + str(rem_score))
            total_score = candidate_score + float(rem_score)
            # print("What's the candidate name now?:" + candidate_name)
            scores = add_score(scores, candidate_name, rem_names, total_score)
        # print("-------SEQ Ret:" + str(max(scores.items(), key=lambda x: x[1])))
        return max(scores.items(), key=lambda x: x[1])
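

# Minimal usage sketch, assuming helper.is_token_cap treats tokens that begin
# with an uppercase letter as capitalized and that training documents expose a
# tokens_nopunct list, as train_document expects. _Doc is a hypothetical
# stand-in for whatever document type the surrounding pipeline provides.
if __name__ == "__main__":
    class _Doc:
        def __init__(self, tokens_nopunct):
            self.tokens_nopunct = tokens_nopunct

    recognizer = NE_Recognizer()
    # Training: every capitalized token seeds a CounterTrie keyed by that token.
    recognizer.train_document(_Doc(["John", "Smith", "met", "Mary", "Jones"]))
    recognizer.train_document(_Doc(["John", "Smith", "said", "hello"]))

    # Extraction takes raw tokens and splits on trailing punctuation itself.
    names = recognizer.extract_names(["Yesterday", "John", "Smith", "spoke."])
    for name, score in sorted(names.items(), key=lambda kv: kv[1], reverse=True):
        print(name, score)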