import sensplit
import copy
# Bigrammer corpus analyzing program
# by Logan Mitchell, Lindsay McNamara, and Colleen Darling (2011)
#
# This program is designed to analyze the chance of bigrams (word pairs) appearing
# in small to medium sized corpora. It can also pull various statistics about the
# corpora, such as word frequency and likelihood.
#Main functions --------------------------------------------------------------------------------------------------
#Checks for the probability of a given bigram in a known corpus
#(note: this assumes the bigram is two words, not letters/syllables.)
# @corpusFilename The filename of the corpus to load. Usually a txt file.
# @bigramWord1 The first word of the bigram to check
# @bigramWord2 The second word of the bigram to check
def getBigramChance(corpusFilename, bigramWord1, bigramWord2):
    bigramFrequencyList = makeBigramFreqList(corpusFilename) #get frequency of each bigram
    #quick check if we should continue - if the file is not found, don't move on.
    if len(bigramFrequencyList) == 0:
        return
    tokenFrequencyList = makeWordFreqList(corpusFilename) #track frequency of each token
    #now that we have our lists, do the statistical analysis part (as long as we have lists):
    if len(tokenFrequencyList) > 0 and len(bigramFrequencyList) > 0:
        try:
            #get the bigram probability table
            bigramProbabilityTable = getBigramProbTable(bigramFrequencyList, tokenFrequencyList)
        except MemoryError:
            print "Unable to get bigram chance; corpus size was too large. (Out of memory)"
            return
        #lastly, check the user's bigram pair.
        userBigram = bigramWord1.lower() + " " + bigramWord2.lower()
        #once we have the probabilities, check if the input bigram exists in the table.
        #if it exists, print the probability.
        if userBigram in bigramProbabilityTable:
            print "Probability of [" + userBigram + "]: " + str(bigramProbabilityTable[userBigram]*100) + "%"
        else:
            print "Probability of [" + userBigram + "]: 0% (Does not occur)"
#Predicts which word is likely to come next, using a given corpus as training data and a word to follow up on.
# @corpusFilename The filename of the corpus to load. Usually a txt file.
# @predictWord The word which will be checked for the most likely follow-up word, based on the corpus.
def predictNextWord(corpusFilename, predictWord):
    #display an error if the user wants to analyze a stop word.
    if isStopWord(predictWord):
        print "Cannot predict word following '" + predictWord + "'; '" + predictWord + "' is removed in corpus processing."
        return
    bigramFrequencyList = makeBigramFreqList(corpusFilename) #get frequency of each bigram
    tokenFrequencyList = makeWordFreqList(corpusFilename) #track frequency of each token
    if len(tokenFrequencyList) > 0 and len(bigramFrequencyList) > 0:
        try:
            #get the bigram probability table
            bigramProbabilityTable = getBigramProbTable(bigramFrequencyList, tokenFrequencyList)
        except MemoryError:
            print "Unable to predict word; corpus size was too large. (Out of memory)"
            return
        #get the most likely word and print the details about it
        probPair = getLikelyNextWord(bigramProbabilityTable, predictWord)
        if len(probPair[0]) > 0:
            print "Next most likely word following '" + predictWord + "' is '" + probPair[0] + "'"
        else:
            #in cases where the word does not appear in the corpus at all...
            print "Unable to predict word based on chosen corpus."
#Print a likely sentence (minus function words) given a corpus, based on highest probabilities alone.
#finds a probable word, then makes a sentence out of it based on what is most likely to come next.
# @corpusFilename The filename of the corpus to load. Usually a txt file.
def printLikelySen(corpusFilename):
    bigramFrequencyList = makeBigramFreqList(corpusFilename) #get frequency of each bigram
    tokenFrequencyList = makeWordFreqList(corpusFilename) #track frequency of each token
    if len(tokenFrequencyList) > 0 and len(bigramFrequencyList) > 0:
        #get probability tables
        try:
            wordProbabilityTable = getWordProbTable(tokenFrequencyList)
            bigramProbabilityTable = getBigramProbTable(bigramFrequencyList, tokenFrequencyList)
        except MemoryError:
            print "Unable to form sentence; corpus size was too large. (Out of memory)"
            return
        #find the largest probability for a single word
        likelyWord = ''
        likelyProb = 0
        for word in wordProbabilityTable.keys():
            prob = wordProbabilityTable[word]
            if prob > likelyProb:
                likelyProb = prob
                likelyWord = word
        #now that we have the most likely word, make a sentence of 8 or fewer words using the bigram probabilities...
        senLength = 1
        sen = likelyWord
        prevSenWord = likelyWord
        while senLength < 8:
            nextWord = getLikelyNextWord(bigramProbabilityTable, prevSenWord)[0]
            if len(nextWord) == 0:
                sen += "."
                break
            sen += " " + nextWord
            prevSenWord = nextWord
            senLength += 1
            if senLength == 8:
                sen += "."
        print sen
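# Example usage (hypothetical; the sentence is built greedily from the most probable
# word and bigrams, so stop words never appear in it and the output is corpus-specific):
#   printLikelySen("corpus.txt")
#   -> people said time work going home day today.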
#prints the bigram frequency list for a corpus
def printFreqList(filename):
    try:
        bigramFrequencyList = makeBigramFreqList(filename)
    except MemoryError:
        print "Unable to create frequency list; corpus size was too large. (Out of memory)"
        return
    for key in bigramFrequencyList.keys():
        print "Freq [" + key + "]:" + str(bigramFrequencyList[key])
#prints the bigram probability table for a corpus
def printBigramProbTable(filename):
    try:
        bigramFrequencyList = makeBigramFreqList(filename) #get frequency of each bigram
        tokenFrequencyList = makeWordFreqList(filename) #track frequency of each token
    except MemoryError:
        print "Unable to create frequency tables; corpus size was too large. (Out of memory)"
        return
    #now that we have our lists, we do the statistical analysis part (as long as we have lists):
    if len(tokenFrequencyList) > 0 and len(bigramFrequencyList) > 0:
        #get the bigram probability table
        try:
            bigramProbabilityTable = getBigramProbTable(bigramFrequencyList, tokenFrequencyList)
        except MemoryError:
            print "Unable to create bigram probability table; corpus size was too large. (Out of memory)"
            return
        total = 0
        for key in bigramProbabilityTable.keys():
            print "Probability [" + key + "]:" + str(bigramProbabilityTable[key])
            total += bigramProbabilityTable[key]
        print "Total: " + str(total)
#Helper Functions ----------------------------------------------------------------------------------------------
#Gets the most likely word to follow the predictWord given the bigramProbabilityTable
#@return Returns a list of two elements; the first is the likely next word, and the second is the probability of that word (as an integer percentage)
def getLikelyNextWord(bigramProbabilityTable, predictWord):
    returnPair = [] #holds word and predictChance, to be returned.
    predictList = {}
    #cycle through all bigrams and check for ones that start with the input word
    for bigram in bigramProbabilityTable.keys():
        bigramParts = bigram.split()
        if bigramParts[0] == predictWord.lower():
            predictList[bigramParts[1]] = bigramProbabilityTable[bigram]
    if len(predictList) > 0:
        #find the largest-probability bigram of the ones we found, and note its values
        likelyWord = ''
        likelyProb = 0
        totalPredictChance = 0
        for word in predictList.keys():
            prob = predictList[word]
            totalPredictChance += prob
            if prob > likelyProb:
                likelyProb = prob
                likelyWord = word
        returnPair.append(likelyWord)
        #give the probability of this word out of all the recorded bigrams that start with
        #predictWord (totalPredictChance normalizes over just those bigrams), as an integer percentage
        returnPair.append(int(float(likelyProb)/totalPredictChance*100))
        return returnPair
    else:
        #if we find nothing, return an empty word and zero probability.
        return ["", 0]
#Makes and returns the bigram probability table (dictionary) given a bigram frequency list and a token frequency list
def getBigramProbTable(bigramFrequencyList, tokenFrequencyList):
    bigramProbabilityTable = {} #holds the final probability table
    #check that the input lists exist:
    if len(tokenFrequencyList) > 0 and len(bigramFrequencyList) > 0:
        #copy all the bigrams into a new table that will report their probabilities.
        #causes memory issues; ideally we would just reuse bigramFrequencyList, but no time.
        bigramProbabilityTable = copy.deepcopy(bigramFrequencyList)
        #Now calculate the probability of each bigram using conditional probability.
        #This is the maximum-likelihood estimate of seeing word2 given word1:
        #P(word2 | word1) = count(word1 word2) / count(word1)
        for bigram in bigramProbabilityTable.keys():
            #get the two words that make up the bigram
            parts = bigram.split()
            #print "Freq [" + bigram + "] = " + str(bigramFrequencyList[bigram]) + "/" + str(tokenFrequencyList[parts[0]])
            bigramProbabilityTable[bigram] = float(bigramFrequencyList[bigram]) / tokenFrequencyList[parts[0]]
        #lastly, normalize so the table's probabilities sum to 1
        total = 0
        for key in bigramProbabilityTable.keys():
            total += bigramProbabilityTable[key]
        for key in bigramProbabilityTable.keys():
            bigramProbabilityTable[key] = bigramProbabilityTable[key]/total
    return bigramProbabilityTable
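# Worked example (hypothetical counts): if "big dog" occurs 3 times and the token "big"
# occurs 12 times, the raw conditional probability is P(dog | big) = 3/12 = 0.25, which
# is then scaled down by the normalization pass above.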
#Returns the number of bigrams that have frequency x
def getNumBigramsOfFreq(bigramFrequencyList, x):
    occurrences = 0
    for bigram in bigramFrequencyList.keys():
        #get how many times this bigram occurs
        countNum = bigramFrequencyList[bigram]
        if countNum == x:
            occurrences += 1
    #print str(occurrences) + " bigrams exist of frequency " + str(x)
    return occurrences
#finds the maximum frequency out of all bigrams
def getMaxBigramFreq(bigramFrequencyList):
    maxFreq = 0
    for bigram in bigramFrequencyList.keys():
        #get how many times this bigram occurs
        countNum = bigramFrequencyList[bigram]
        if countNum > maxFreq:
            maxFreq = countNum
    return maxFreq
#returns an array with how many bigrams of each frequency occur
def queryBigramStats(bigramFrequencyList):
    statArray = []
    #first check the maximum count that is reached ('C' in Good-Turing discounting)
    maxFreq = getMaxBigramFreq(bigramFrequencyList)
    #go through 0-maxFreq and get how many bigrams of each frequency occur. +1 due to 0-based index.
    for i in range(0, maxFreq+1):
        occurs = getNumBigramsOfFreq(bigramFrequencyList, i)
        statArray.append(occurs)
    #the returned array has C as the index and Nc as the value
    return statArray
#Gets the probability table for single words.
def getWordProbTable(tokenFrequencyList):
    #copy the token counts into a new table that will hold the word probabilities.
    wordProbabilityTable = copy.deepcopy(tokenFrequencyList)
    totalWords = 0
    #get the total number of words (non-stop words)
    for word in tokenFrequencyList.keys():
        totalWords += tokenFrequencyList[word]
    #get the probability of each word
    for word in wordProbabilityTable.keys():
        wordProbabilityTable[word] = float(tokenFrequencyList[word])/totalWords
    return wordProbabilityTable
#checks if a word exists in a text file full of stop words. returns True or False.
def isStopWord(word):
    wordFile = open("stopwords.txt")
    stopWords = wordFile.readlines()
    wordFile.close()
    #check if any of the words in the file equal the input word
    for stopWord in stopWords:
        #strip the trailing newline (this also handles a final line with no newline)
        fixedWord = stopWord.strip()
        #if they match, return true
        if fixedWord == word:
            return True
    #else...
    return False
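# stopwords.txt is assumed to be a plain-text file with one lowercase stop word per line.
# Example: if the file contains the line "the", then isStopWord("the") -> True.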
#checks if a word has punctuation attached to it, and if so, returns the word stripped of punctuation.
def removePunctuation(word):
    punctuationList = [".",",",":",";","!","?", "(", ")", "[", "]", '"', "'", "-"] #maybe more?
    updatedWord = word
    #check whether a piece of punctuation exists anywhere in the updatedWord
    for punct in punctuationList:
        #if it does, remove the punctuation, set the new updatedWord, and start the search over.
        #do this until no punctuation can be found in the word anymore
        while updatedWord.find(punct) >= 0:
            pIndex = updatedWord.find(punct)
            updatedWord = updatedWord[:pIndex] + updatedWord[pIndex+1:]
    #lastly, return the updatedWord
    return updatedWord
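# Examples: removePunctuation("hello,") -> "hello"; removePunctuation("don't!") -> "dont"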
#makes a bigram frequency list for a given corpus
def makeBigramFreqList(filename):
    bigramFrequencyList = {} #track freq of each bigram
    #split the corpus into sentences. (We assume bigrams cannot cross sentence ends.)
    corpusSentences = sensplit.sen_splitter(filename)
    #quick check if we should continue - if the file is not found, don't move on.
    if len(corpusSentences) == 0:
        return {}
    wordFreqList = makeWordFreqList(filename)
    #make a combinatorial list of all bigram pairs first. (extras smoothed later)
    for word1 in wordFreqList:
        for word2 in wordFreqList:
            bigram = word1 + " " + word2
            bigramFrequencyList[bigram] = 0
    #for each sentence...
    for sen in corpusSentences:
        #make tokens (words) from the sentence by splitting on whitespace.
        senTokens = sen.split()
        tokenPair = [] #keep track of our current bigram pair as we go through the sentence
        #for each 'word' in the current sentence...
        for token in senTokens:
            #ignore any stop words (function words, etc).
            #stop words are listed in stopwords.txt
            #first make the token lowercase for easier comparison!
            token = token.lower()
            #remove punctuation if it exists in the current token.
            token = removePunctuation(token)
            #if not a stop word...
            if not isStopWord(token) and len(token.strip()) > 0:
                #add to our current token pair
                tokenPair.append(token)
                #if our token pair is now two words, add the pair to the bigram frequency list by
                # combining the two words as the key; ex: "word1 word2" as a key, separated by a space.
                # Steps: check if the pair exists already; if not, add the pair and set its freq to 1.
                # Ex. bigramFrequencyList["word1 word2"] = 1
                # If the pair exists already, simply increment the frequency for that pair by 1.
                if len(tokenPair) == 2:
                    pairKey = tokenPair[0] + ' ' + tokenPair[1]
                    if pairKey in bigramFrequencyList:
                        bigramFrequencyList[pairKey] += 1
                    else:
                        bigramFrequencyList[pairKey] = 1
                    #once the pair is recorded, clear it and start a new pair
                    # beginning with the current token as the first word.
                    tokenPair = []
                    tokenPair.append(token)
    #SMOOTHING TIME
    #use the Good-Turing discount formula to adjust the frequencies in the bigram table
    #Formula:
    # c* = (c+1) * NumBigramsOfFreq(c+1) / NumBigramsOfFreq(c)
    #NOTE: There is an inherent issue with Good-Turing smoothing when NumBigramsOfFreq(c+1) == 0.
    #This becomes almost a non-issue with any regular-sized corpus, but this smoothing will ruin
    #the frequencies by setting them to 0 if any intermediate frequency counts are 0.
    #Moral: Don't use tiny data sets.
    #Source: http://www.ee.ucla.edu/~weichu/htkbook/node214_mn.html
    #Now, get some data we'll need for our formula...
    #get a list of all frequencies and the number of bigrams at each frequency
    bigramStats = queryBigramStats(bigramFrequencyList)
    #make a new list to hold the new bigram frequencies that will replace the old ones
    newBigramFrequencies = copy.deepcopy(bigramStats)
    #for each frequency... (stop at len-1 because the formula needs c+1, and c is an index)
    for c in range(0, len(bigramStats)-1):
        #avoid a division-by-zero error
        if bigramStats[c] != 0:
            #adjust the counts using the Good-Turing discount formula
            newBigramFrequencies[c] = float((c+1)*bigramStats[c+1])/bigramStats[c]
        else:
            newBigramFrequencies[c] = 0
    #the highest count has no c+1 bucket to discount against, so leave it unadjusted
    newBigramFrequencies[len(bigramStats)-1] = len(bigramStats)-1
    #Then replace the old frequency counts with the new frequency counts.
    #The old frequency is the index into the new array to get the updated count.
    for bigram in bigramFrequencyList.keys():
        oldBigramFreq = bigramFrequencyList[bigram]
        bigramFrequencyList[bigram] = newBigramFrequencies[oldBigramFreq]
    return bigramFrequencyList
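# Good-Turing worked example (hypothetical counts): if 10 bigrams occur once (N1 = 10)
# and 4 bigrams occur twice (N2 = 4), every once-seen bigram's count is discounted to
# c* = (1+1) * N2 / N1 = 2 * 4 / 10 = 0.8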
#makes a unigram (single word) frequency list for a given corpus
def makeWordFreqList(filename):
    tokenFrequencyList = {} #track freq of each token
    #split the corpus into sentences.
    corpusSentences = sensplit.sen_splitter(filename)
    #for each sentence...
    for sen in corpusSentences:
        #make tokens (words) from the sentence by splitting on whitespace.
        senTokens = sen.split()
        #for each 'word' in the current sentence...
        for token in senTokens:
            #ignore any stop words (function words, etc).
            #stop words are listed in stopwords.txt
            #first make the token lowercase for easier comparison!
            token = token.lower()
            #remove punctuation if it exists in the current token.
            token = removePunctuation(token)
            #if not a stop word...
            if not isStopWord(token) and len(token.strip()) > 0:
                #add to the list
                if token in tokenFrequencyList:
                    tokenFrequencyList[token] += 1
                else:
                    tokenFrequencyList[token] = 1
    return tokenFrequencyList
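# A minimal demo of the main entry points; a sketch, assuming a plain-text corpus file
# (the "corpus.txt" name below is a placeholder) sits alongside stopwords.txt.
if __name__ == "__main__":
    demoCorpus = "corpus.txt" #hypothetical corpus filename
    getBigramChance(demoCorpus, "united", "states")
    predictNextWord(demoCorpus, "united")
    printLikelySen(demoCorpus)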