NLTKWrapper.py
#!/usr/bin/env python
#file: NLTKWrapper.py
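#Small convenience wrapper around NLTK: sentence splitting, word tokenization,
#and n-gram extraction helpers.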
import nltk
import nltk.data
import porter
punctuations = ".?!:;-()[]'\"/,"
def splitSentence(paragraph):
    #split a paragraph into sentences: first on line breaks, then with NLTK's punkt sentence tokenizer
    sentences = []
    paragraph = paragraph.strip()
    paragraph = paragraph.replace("\r\n", "\n")
    firstSplitSentences = paragraph.split("\n")
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for s in firstSplitSentences:
        secondSplits = tokenizer.tokenize(s)
        sentences = sentences + secondSplits
    return sentences
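#e.g. splitSentence("First sentence. Second one.\nA new line.") returns
#['First sentence.', 'Second one.', 'A new line.']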
def wordtokenizer(s, punct = True):
    if punct:
        return nltk.wordpunct_tokenize(s)
    else:
        return nltk.word_tokenize(s)
def getNgram(sentence, n, punct = True):
    #n is the n-gram order, e.g. n = 1 gives unigrams
    ngrams = []
    tokens = wordtokenizer(sentence, punct)
    N = len(tokens)
    for i in range(N):
        if i+n > N: continue
        ngram = tokens[i:i+n]
        ngrams.append(" ".join(ngram))
    return ngrams
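#e.g. getNgram("the quick brown fox", 2) returns ['the quick', 'quick brown', 'brown fox']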
def getNgramTokened(word, n, tag = None):
    #same as getNgram, but for already tokenized input; optionally builds the matching tag n-grams
    ngram_tags = []
    ngram_words = []
    if tag is not None:
        assert(len(tag) == len(word))
    N = len(word)
    for i in range(N):
        if i+n > N: continue
        if tag is not None:
            ngram_tag = tag[i:i+n]
            ngram_tags.append(" ".join(ngram_tag))
        ngram_word = word[i:i+n]
        ngram_words.append(" ".join(ngram_word))
    if tag is not None:
        return ngram_words, ngram_tags
    return ngram_words
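#e.g. getNgramTokened(['the', 'quick', 'fox'], 2) returns ['the quick', 'quick fox']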
def getWordList(file):
    f = open(file, 'r')
    lines = f.readlines()
    f.close()
    words = []
    for line in lines:
        tokens = wordtokenizer(line)
        words = words + tokens
    return words
if __name__ == '__main__':
    #print splitSentence("[1] I love you. [2] Sent 2. [3] sentence 3")
    import fio
    lines = fio.ReadFile("../../data/test.txt")
    for line in lines:
        print nltk.word_tokenize(line.decode('utf-8'))
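    #A minimal usage sketch (commented out; assumes NLTK and its 'punkt' model are
    #installed, and that the ../../data path exists in the original repository layout):
    #for s in splitSentence("I love you. This is a second sentence."):
    #    print s
    #print getNgram("the quick brown fox", 2)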