unigram_model_classifier.py
#training unigram language models for positive and negative articles
#imports -----------------------------------------------------------------------
import re
import os
import math
from collections import Counter
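#pipeline: build one training corpus per class -> tokenize -> count unigrams
#-> train a naive Bayes model per class (add-k smoothing) -> classify test files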
# create separate corpus/vocab with Pos/Neg articles ---------------------------
corpus_pos_train = ""
corpus_neg_train = ""
#counters for documents in each class & total
N_pos = 0
N_neg = 0
N_doc = 0
for filename in os.listdir('movies/train'):
    filepath = os.path.join('movies/train', filename)
    if os.path.isfile(filepath):
        N_doc += 1
        #if positive, add to the positive corpus
        if re.match(r'P-[a-z]*[0-9]*\.txt', filename):
            with open(filepath, 'r') as doc:
                text = doc.read()
            corpus_pos_train += '<S> ' + text + ' '  #"<S>" marks the start of an article
            N_pos += 1
        #if negative, add to the negative corpus
        elif re.match(r'N-[a-z]*[0-9]*\.txt', filename):
            with open(filepath, 'r') as doc:
                text = doc.read()
            corpus_neg_train += '<S> ' + text + ' '  #"<S>" marks the start of an article
            N_neg += 1
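#training filenames are expected to look like e.g. P-review012.txt or
#N-review012.txt (illustrative names): the class label is the leading letter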
# preprocessing functions -------------------------------------------------------
#function: tokenize with heavy normalization
def heavy_norm_tokenizer(text):
    text = re.sub(r'\n', ' ', text)                #replace newline characters with spaces
    text = re.sub(r"'", " '", text)                #split at apostrophes (e.g. "haven't" -> "haven 't")
    words = re.sub(r"[^\w\s'<>]", ' ', text)       #replace all other punctuation with spaces (keep apostrophes and <S>)
    normalized = words.lower()
    tokens = normalized.split(' ')                 #tokenize on spaces
    tokens = [tok for tok in tokens if tok != '']  #remove empty strings
    return tokens
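#example of the normalization (illustrative input):
#  heavy_norm_tokenizer("<S> Haven't we met?") -> ['<s>', 'haven', "'t", 'we', 'met']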
#function: create n-grams for a given n
def create_ngrams(n, tokens):
    ngram_list = []
    for tok_location in range(len(tokens) - n + 1):  #stop at the last possible start index for an n-gram
        ngram = ' '.join(tokens[tok_location:tok_location + n])
        ngram_list.append(ngram)
    return ngram_list
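#example: create_ngrams(2, ['<s>', 'great', 'movie']) -> ['<s> great', 'great movie']
#with n=1 this simply returns one unigram per token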
# unigram language model --------------------------------------------------------
#training ---------
def train_model(unigrams, Vtot, N_class, N_doc, k):
    #P(c) = logprior is log(docs in class c / total docs) -> MLE!
    logprior = math.log(N_class / N_doc, 2)
    #V = Vtot -> total vocabulary disregarding classes
    #bigdoc[c] = unigrams -> all unigrams from docs in class c
    #smoothing factor k avoids zero probabilities (and log(0)) for unseen words
    counts = Counter(unigrams)   #count all words once instead of calling list.count() per word
    len_class = len(unigrams)    #number of tokens in the class
    len_Vtot = len(Vtot)
    loglikelihoods = {}
    for word in Vtot:            #calculate the P(w|c) terms
        likelihood_w_c = (counts[word] + k) / (len_class + k * len_Vtot)
        loglikelihoods[word] = math.log(likelihood_w_c, 2)
    return logprior, loglikelihoods
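#this is multinomial naive Bayes with add-k smoothing:
#  P(w|c) = (count(w,c) + k) / (token count of class c + k*|V|)
#worked example with illustrative numbers: k=1, count(w,c)=4, 100 tokens in the
#class and |V|=50 gives P(w|c) = (4+1)/(100+50) = 1/30, so log2 P(w|c) ~ -4.91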
#testing -----------
#test_model does NOT compare classes: it returns the log-probability for ONE class
def test_model(testdoc_tokens, model, Vtot):
    logprior, loglikelihoods = model
    prob_modelclass = logprior                       #start from the logprior of the class
    for word in testdoc_tokens:
        if word in Vtot:                             #ignore words not in the trained vocabulary
            prob_modelclass += loglikelihoods[word]  #add the word's loglikelihood under this class
    return prob_modelclass
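#working in log space turns the product of per-word probabilities into a sum,
#which avoids floating-point underflow on long documents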
#this DOES give the most likely class, based on both models
def test_getclass(testdoc_tokens, model_pos, model_neg, Vtot):
    prob_pos = test_model(testdoc_tokens, model_pos, Vtot)
    print('Pos with: ' + str(prob_pos))
    prob_neg = test_model(testdoc_tokens, model_neg, Vtot)
    print('Neg with: ' + str(prob_neg))
    if prob_pos > prob_neg:
        print("Classification: P")
        return "P"
    elif prob_neg > prob_pos:
        print("Classification: N")
        return "N"
    else:
        print("Classes are equally probable")
        return "?"
# actually train the models (training phase) -------------------------------------
#set input parameters:
print("Tokenizing and preparing unigrams ...")
#tokens per class
pos_unigrams = create_ngrams(1, heavy_norm_tokenizer(corpus_pos_train))
neg_unigrams = create_ngrams(1, heavy_norm_tokenizer(corpus_neg_train))
#whole vocabulary (unique tokens)
print("Setting parameters ...")
counts_total = Counter(pos_unigrams + neg_unigrams)
#keep only unigrams that appear at least 25 times in the whole collection:
Vtot = {uni for uni, count in counts_total.items() if count >= 25}
print('vocab: ' + str(len(Vtot)))
pos_unigrams = [uni for uni in pos_unigrams if uni in Vtot]
neg_unigrams = [uni for uni in neg_unigrams if uni in Vtot]
print('positive unigrams: ' + str(len(pos_unigrams)))
print('negative unigrams: ' + str(len(neg_unigrams)))
#smoothing factor
k = 10
print("Training models... k is set to " + str(k))
#train
pos_model = train_model(pos_unigrams, Vtot, N_pos, N_doc, k)
neg_model = train_model(neg_unigrams, Vtot, N_neg, N_doc, k)
print("Models trained!")
# test models -------------------------------------------------------------------
print("Testing models, classifying test data...\n")
correct_classification = 0
false_classification = 0
#open the output document once (truncating any previous run):
f = open("output5_k10.txt", "w")
#get results for each test file (iterate through the test directory):
for filename in os.listdir('movies/test'):
    filepath = os.path.join('movies/test', filename)
    if os.path.isfile(filepath):
        print(filename)
        #get text
        with open(filepath, 'r') as doc:
            test_text = doc.read()
        #normalize/tokenize/n-gram-ize the same way as the training data
        test_unigrams = create_ngrams(1, heavy_norm_tokenizer(test_text))
        #use the models to predict the category!
        c = test_getclass(test_unigrams, pos_model, neg_model, Vtot)
        print('\n')
        #the gold label is the first letter of the filename (P-... or N-...)
        if filename.startswith(c):
            correct_classification += 1
        else:
            false_classification += 1
        #append the prediction to the output file
        f.write(filename[:-4] + '\t' + c + '\n')
f.close()
print("Correctly classified: " + str(correct_classification))
print("Falsely classified: " + str(false_classification))
accuracy = (correct_classification / (correct_classification + false_classification)) * 100
print("Accuracy: " + str(accuracy) + "%")