# IBM1.py
"""
Implementation of IBM Model 1 and EM algorithm
"""
def not_converged(p_prob, curr_prob, h_words, e_words):
    """
    Method to check whether the values in the translation table have converged.
    :param p_prob: Previous translation table (dictionary)
    :param curr_prob: Current translation table (dictionary)
    :param h_words: Foreign-language words (list)
    :param e_words: English words (list)
    :return: 1 if the tables have not yet converged, 0 otherwise
    """
    # First call of this function, when the previous table is still empty
    if p_prob == {}:
        return 1
    val = 0
    for ew in e_words:
        for hw in h_words:
            t = (p_prob[ew][hw] - curr_prob[ew][hw]) ** 2
            val += t
    val = val ** 0.5
    # Condition for convergence: the Euclidean distance between the
    # two tables has become small enough
    if val < 0.0005:
        return 0
    else:
        return 1
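
# Illustrative example of the convergence check (hypothetical toy table,
# shown as comments and not executed):
#     tbl = {'house': {'ghar': 0.5, 'makaan': 0.5}}
#     not_converged({}, tbl, ['ghar', 'makaan'], ['house'])   # -> 1 (first call)
#     not_converged(tbl, tbl, ['ghar', 'makaan'], ['house'])  # -> 0 (distance is 0)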

def get_alignments(sen1, sen2, prob):
    """
    Method to get the alignment given two sentences and the
    word-to-word translation probabilities.
    :param sen1: List of words in sentence 1, in order
    :param sen2: List of words in sentence 2, in order
    :param prob: Probabilities of word-to-word translations
    :return: Alignment for the translation of sentence 1 to sentence 2
    """
    alignment = []
    for e in range(len(sen1)):
        # Indices of words in sen2 having the maximum probability
        # for the current word in sen1
        al = []
        p_val = 0
        for f in range(len(sen2)):
            if p_val < prob[sen1[e]][sen2[f]]:
                p_val = prob[sen1[e]][sen2[f]]
                al.clear()
                al.append(f)
            elif p_val == prob[sen1[e]][sen2[f]]:
                al.append(f)
        # Appending the (sen1 index, sen2 index) tuples to the alignment list
        for f in al:
            alignment.append((e, f))
    return alignment
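
# Illustrative usage (hypothetical probabilities, shown as comments and
# not executed):
#     prob = {'I': {'main': 0.9, 'hoon': 0.1},
#             'am': {'main': 0.2, 'hoon': 0.8}}
#     get_alignments(['I', 'am'], ['main', 'hoon'], prob)
#     # -> [(0, 0), (1, 1)]  i.e. (index in sentence 1, index in sentence 2)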

def run(flag):
    """
    :param flag: True if the sentence pairs and alignments should be printed
    :return: List of alignments, one per sentence pair in the corpus
    """
    # Importing data
    translations = []
    try:
        with open("data\\hin.txt", 'r', encoding='utf-8') as file:
            text = file.read()
        translations = text.split("\n")
    except OSError:
        # Dataset could not be read; continue with an empty corpus
        pass
    # Corpus maintains a list of 2 lists (English words list
    # and the corresponding foreign-language words list) for
    # each sentence extracted from the dataset
    corpus = []
    # Unique English words in the dataset
    eng_words = []
    # Unique foreign-language words in the dataset
    hin_words = []
    # List of sentence pairs of English and the foreign language
    sentence_pairs = []
    # Processing data
    for i in translations:
        s = i.split('\t')
        if len(s) < 2:
            continue
        eng = s[0]
        hin = s[1]
        sentence_pairs.append([eng, hin])
        # Keep only the text before the first punctuation mark,
        # then split the remaining clause into words on spaces
        eng = eng.split('.')
        eng = eng[0].split('!')
        eng = eng[0].split(',')
        eng = eng[0].split('?')
        eng = eng[0].split(' ')
        hin = hin.split('?')
        hin = hin[0].split('!')
        hin = hin[0].split(',')
        hin = hin[0].split('।')
        hin = hin[0].split(' ')
        for h in hin:
            if h != '':
                hin_words.append(h)
        hin_words = list(set(hin_words))
        for e in eng:
            if e != '':
                eng_words.append(e)
        eng_words = list(set(eng_words))
        corpus.append([eng, hin])
    # Initializing translation probabilities uniformly
    # (considering all possible alignments)
    t_prob = {}
    for eng in eng_words:
        t_prob[eng] = {hin: 1 / len(eng_words) for hin in hin_words}
    prev_prob = {}
    s_total = {}
    # Iterating the EM updates until the probabilities converge
    while not_converged(prev_prob, t_prob, hin_words, eng_words):
        count = {}
        for eng in eng_words:
            count[eng] = {hin: 0 for hin in hin_words}
        total = {}
        for hin in hin_words:
            total[hin] = 0
        for pair in corpus:
            for e_word in pair[0]:
                s_total[e_word] = 0
                for h_word in pair[1]:
                    s_total[e_word] += t_prob[e_word][h_word]
            for e_word in pair[0]:
                for h_word in pair[1]:
                    count[e_word][h_word] += t_prob[e_word][h_word] / s_total[e_word]
                    total[h_word] += t_prob[e_word][h_word] / s_total[e_word]
        # Keep a copy (not a reference) of the current table so the
        # convergence check compares the old values against the new ones
        prev_prob = {e: dict(h) for e, h in t_prob.items()}
        for hin in hin_words:
            for eng in eng_words:
                t_prob[eng][hin] = count[eng][hin] / total[hin]
    alignments = []
    for c in corpus:
        align = get_alignments(c[0], c[1], t_prob)
        alignments.append(align)
    if flag:
        for i in range(len(alignments)):
            print('English Text : {}'.format(sentence_pairs[i][0]))
            print('Foreign Text : {}'.format(sentence_pairs[i][1]))
            print('Alignment : {}'.format(alignments[i]))
            print()
    return alignments

if __name__ == '__main__':
    run(flag=True)
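
# Example of using the return value programmatically (illustrative):
#     alignments = run(flag=False)
#     # alignments[i] is the list of (english_word_index, foreign_word_index)
#     # tuples for the i-th sentence pair in the corpus.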