-
Notifications
You must be signed in to change notification settings - Fork 0
/
combining_classifiers.py
127 lines (103 loc) · 5.08 KB
/
combining_classifiers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os.path
import pickle
import random
from collections import Counter
from statistics import mode

import nltk
from nltk.classify import ClassifierI
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import movie_reviews
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC, NuSVC
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a fixed group of classifiers.

    Every wrapped object must provide ``classify(features)`` and return a
    hashable label. The ensemble's answer is the label that receives the
    most votes; when several labels tie, the one that was voted first
    (i.e. by the earliest classifier in constructor order) wins.
    """

    def __init__(self, *classifiers):
        """Store the classifiers that will take part in every vote."""
        self.classifiers = classifiers

    def _tally(self, features):
        """Ask each classifier for a label and return the vote counts.

        Raises:
            ValueError: if the ensemble holds no classifiers, since there
                is then no vote to count.
        """
        votes = Counter(c.classify(features) for c in self.classifiers)
        if not votes:
            raise ValueError("VoteClassifier has no classifiers to vote with")
        return votes

    def classify(self, features):
        """Return the label chosen by the largest number of classifiers."""
        # Counter.most_common keeps first-seen order among equal counts, so
        # a tied vote yields a deterministic label instead of an error.
        winner, _count = self._tally(features).most_common(1)[0]
        return winner

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winner.

        The result is a float in (0, 1]; 1.0 means a unanimous vote.
        """
        votes = self._tally(features)
        _winner, winning_votes = votes.most_common(1)[0]
        return winning_votes / sum(votes.values())
# One (word list, category) pair per review. The comprehension walks the
# corpus category by category, so the list starts out grouped by label.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
# Shuffle so that the positional train/test split further down does not
# simply cut along the category grouping.
random.shuffle(documents)

# Case-insensitive frequency distribution over every word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Vocabulary used as features: the 3000 most frequent words.
# most_common() ranks by count; the distribution's plain key order is not
# frequency order, so slicing keys() would pick an arbitrary 3000 words.
word_features = [word for word, _count in all_words.most_common(3000)]
def find_features(document, vocabulary=None):
    """Map each vocabulary word to whether it occurs in *document*.

    Args:
        document: iterable of word tokens for a single text.
        vocabulary: iterable of words to use as feature names. When
            omitted, the module-level ``word_features`` list is used.

    Returns:
        A dict ``{word: bool}`` with one entry per vocabulary word, in
        vocabulary order.
    """
    if vocabulary is None:
        vocabulary = word_features
    # Build the set once so each membership test is O(1).
    present = set(document)
    return {word: word in present for word in vocabulary}
# Debug output: the feature dict computed for one specific review file.
print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

# Turn every (words, label) document into a (feature dict, label) pair.
featuresets = [(find_features(words), label) for words, label in documents]

# Positional split: the first 1900 feature sets train, the rest test.
training_set, testing_set = featuresets[:1900], featuresets[1900:]
# Reuse a previously trained Naive Bayes model when one is cached on disk;
# otherwise train a fresh one and cache it.
# NOTE(review): pickle.load can execute arbitrary code - only keep this if
# naivebayes.pickle is always produced by this script.
# NOTE(review): documents are reshuffled on every run, so a cached model
# may have been trained on reviews that now sit in testing_set; the
# accuracy printed below can then be optimistic.
if os.path.exists("naivebayes.pickle"):
    with open("naivebayes.pickle", "rb") as classifier_f:
        classifier = pickle.load(classifier_f)
else:
    # Train before opening the output file: opening in "wb" creates or
    # truncates it immediately, so a failure during training would
    # otherwise leave an empty pickle that breaks every later run.
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    with open("naivebayes.pickle", "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)
print("Original Naive Bayes Algo accuracy perecent: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)
def _train_and_report(report_label, estimator):
    """Wrap *estimator* for NLTK, train it, print its accuracy, return it.

    Training uses the module-level ``training_set``; the printed accuracy
    (as a percentage) is measured on the module-level ``testing_set``.
    """
    wrapped = SklearnClassifier(estimator)
    wrapped.train(training_set)
    print(report_label, (nltk.classify.accuracy(wrapped, testing_set)) * 100)
    return wrapped


MMB_classifier = _train_and_report("MultinomialNB accuracy perecent: ", MultinomialNB())

# GaussianNB stays disabled here; it is not trained or evaluated.
# GMB_classifier = _train_and_report("GaussianNB accuracy perecent: ", GaussianNB())

BernoulliNB_classifier = _train_and_report("BernoulliNB accuracy perecent: ", BernoulliNB())

LogisticRegression_classifier = _train_and_report(
    "LogisticRegression accuracy perecent: ", LogisticRegression())

SGD_classifier = _train_and_report("SGDClassifier accuracy perecent: ", SGDClassifier())

SVC_classifier = _train_and_report("SVC accuracy perecent: ", SVC())

LinearSVC_classifier = _train_and_report("LinearSVC accuracy perecent: ", LinearSVC())

NuSVC_classifier = _train_and_report("NuSVC accuracy perecent: ", NuSVC())
# Combining the classifiers: seven voters in total (SVC_classifier is not
# part of the ensemble).
voted_classifier = VoteClassifier(classifier,
                                  MMB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier,
                                  SGD_classifier,
                                  LinearSVC_classifier,
                                  NuSVC_classifier)
print("voted_classifier accuracy perecent: ", (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)

# Show the vote and its confidence for the first six test samples. The
# same sample feeds both calls, so each printed label belongs to the
# confidence printed next to it.
for sample_features, _label in testing_set[:6]:
    print("Classfication: ", voted_classifier.classify(sample_features), " Confidence %: ",
          voted_classifier.confidence(sample_features) * 100)