-
Notifications
You must be signed in to change notification settings - Fork 0
/
spellcheck.py
115 lines (99 loc) · 4.86 KB
/
spellcheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import re, collections
import string
import operator
def words(text): return re.findall('[a-z]+', text.lower())
# Training function which adds 1 if not exists
def train(features):
model = collections.defaultdict(lambda: 1)
for f in features:
model[f] += 1
return model
NWORDS = train(words(file('words.txt').read()))
# Definition of alphabet
alphabet = 'abcdefghijklmnopqrstuvwxyz'
# Assumed that if the controlled word is already found,
# number of occurrences will be counted x100 (100%) so that
# its probability will calculated as increased
weight_0away = 100
# Distance weights gathered from
# "Design of an interactive spell checker: optimizing the list of
# offered words" by Garfinkel, Fernandez, Gopal
weight_1away = 95
weight_2away = 4
weight_3away = 1
# 1-away edit function, gathered from the file in COW
def edits1(word):
# Split for helper
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
# Deleting a letter
deletes = [a + b[1:] for a, b in splits if b]
# Transposing 2 letters
transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
# Replacing a letter with another letter
replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
# Inserting a letter into word
inserts = [a + c + b for a, b in splits for c in alphabet]
# Return complete set
return set(deletes + transposes + replaces + inserts)
# 2-away edit function, it uses 1-away edit function twice
def edits2(word):
return set(two_away for one_away in edits1(word) for two_away in edits1(one_away))
# Function for words that are 1-away in distance
# Input: Complete list of 1-away distant words
# Return: Set of words that are already occurring in corpus
# Count of 1-away words is multiplied with "weight_1away" and
# count of original word is multiplied with "weight_0away"
def found_1away_not_sorted(word):
temp2 = dict([(x, NWORDS.get(x)*weight_1away) for x in edits1(word) if x in NWORDS])
if word in NWORDS and word in temp2.keys():
temp2[word] = temp2[word] * weight_0away
return temp2
# Function for words that are 2-away in distance
# Input: Complete list of 2-away distant words
# Return: Set of words that are already occurring in corpus
# Count of 2-away words is multiplied with "weight_2away" and
# count of original word is multiplied with "weight_0away"
def found_2away_not_sorted(word):
temp2 = dict([(x, NWORDS.get(x)*weight_2away) for x in edits2(word) if x in NWORDS])
if word in NWORDS and word in temp2.keys():
temp2[word] = temp2[word] * weight_0away
return temp2
# Function for words that are 3-away in distance
# Return: Set of words that are already occurring in corpus, sorted according to their counts
# Count of 3-away words is multiplied with "weight_3away" and
# count of original word is multiplied with "weight_0away"
def found_3away_with_numbers(word):
temp1 = [three_away for two_away in edits2(word) for three_away in edits1(two_away)]
temp2 = [x for x in temp1 if x in NWORDS]
temp3 = dict([(x, NWORDS.get(x)*weight_3away) for x in temp2])
if word in NWORDS and word in temp3.keys():
temp3[word] = temp3[word] * weight_0away
return sorted(temp3.iteritems(), key=operator.itemgetter(1),reverse=True)
# If words is short (<4) then only consider 1-edit-away, otherwise:
# Firstly, if there are any, combines 1-edit-away and 2-edit-away known words
# Then sorts and returns the top 5 according to their weighted counts
# If there is no 1 or 2 away known words, it returns the 3-edit-away words that already sorted
def found_all(word, limited):
if len(word) < 4 and len(found_1away_not_sorted(word)) != 0:
temp1 = dict(found_1away_not_sorted(word).items())
return sorted(temp1.iteritems(), key=operator.itemgetter(1),reverse=True) [:5]
elif len(found_1away_not_sorted(word)) != 0 and len(found_2away_not_sorted(word)) != 0:
temp1 = dict((found_2away_not_sorted(word).items() + found_1away_not_sorted(word).items()))
return sorted(temp1.iteritems(), key=operator.itemgetter(1),reverse=True) [:5]
elif len(found_2away_not_sorted(word)) != 0:
temp1 = dict(found_2away_not_sorted(word).items())
return sorted(temp1.iteritems(), key=operator.itemgetter(1),reverse=True) [:5]
# Considering MemoryError where 3-edit-away
elif limited < 1:
return found_3away_with_numbers(word) [:5]
else:
return []
# Correction function that returns words and their probabilities
def correct_word(word):
total_counts = sum(j for i, j in found_all(word,0))
return [(x,round((float(y)/float(total_counts)),3)) for x,y in found_all(word,0)] [:5]
# Correction function that returns words and their probabilities
def correct_word_memory_error(word):
total_counts = sum(j for i, j in found_all(word,1))
return [(x,round((float(y)/float(total_counts)),3)) for x,y in found_all(word,1)] [:5]
# End of code