-
Notifications
You must be signed in to change notification settings - Fork 0
/
key_list.py
38 lines (31 loc) · 1.14 KB
/
key_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import nltk
from nltk.tokenize import word_tokenize
def generate_key_list(sentence):
    """Return the noun phrases found in *sentence* as space-joined strings.

    The sentence is tokenized and POS-tagged with NLTK, then chunked with a
    regular-expression grammar whose ``NP`` rule matches an optional
    determiner (``DT``), any number of adjectives (``JJ``) and exactly one
    ``NN``-tagged token.

    Args:
        sentence: The text to analyse.

    Returns:
        A list with one string per ``NP`` subtree, in the order the parse
        tree yields its subtrees. Each string is the chunk's words joined
        by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))
    chunker = nltk.RegexpParser("NP: {<DT>?<JJ>*<NN>}")
    parse_tree = chunker.parse(tagged_tokens)

    phrases = []
    for subtree in parse_tree.subtrees():
        # subtrees() also yields the root node, so keep only the NP chunks.
        if subtree.label() != 'NP':
            continue
        # Leaves are (word, tag) pairs; only the word goes into the phrase.
        words = [leaf[0] for leaf in subtree.leaves()]
        phrases.append(' '.join(words))
    return phrases
def generate_candidate_keys(noun_phrases):
    """Pick one candidate key word from each noun phrase.

    Every phrase is tokenized and POS-tagged on its own. The tagged words
    are then scanned once per tag group (nouns, adjectives, cardinal
    numbers), taking the first word that carries a tag from that group.
    The groups are processed in that order and each hit replaces the
    previous pick, so a cardinal number wins over an adjective, which wins
    over a noun.

    Args:
        noun_phrases: An iterable of phrase strings.

    Returns:
        A list with one entry per input phrase, in input order: the chosen
        word, or ``""`` when no word in the phrase carries any of the tags.
    """
    # NOTE(review): because later groups overwrite earlier ones, the noun is
    # only kept when the phrase has no adjective and no number. Confirm this
    # precedence is intended and not the reverse.
    tag_groups = (
        ('NN', 'NNS', 'NNP', 'NNPS'),
        ('JJ', 'JJR', 'JJS'),
        ('CD',),
    )

    candidate_keys = []
    for phrase in noun_phrases:
        tagged_words = nltk.pos_tag(word_tokenize(phrase))
        chosen = ""
        for group in tag_groups:
            first_hit = next(
                (pair[0] for pair in tagged_words if pair[1] in group),
                None,
            )
            if first_hit is not None:
                chosen = first_hit
        candidate_keys.append(chosen)
    return candidate_keys