-
Notifications
You must be signed in to change notification settings - Fork 40
/
tfidf_test.py
92 lines (68 loc) · 3.06 KB
/
tfidf_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import unittest
from math import log
from tfidf import TfIdf, kUNK, log10
class TestSequenceFunctions(unittest.TestCase):
    """Unit tests for TfIdf: vocabulary lookup, tokenization, tf and idf."""

    def setUp(self):
        """Build a fresh, untrained TfIdf (unk_cutoff=2) before every test."""
        self.unk_cutoff = 2
        self.vocab = TfIdf(unk_cutoff=self.unk_cutoff)

    def _tokens(self, text):
        """Return the tokens that TfIdf.tokenize yields for *text*, as strings."""
        return [str(token) for token in self.vocab.tokenize(text)]

    def test_vocab(self):
        """Words trained once and unseen words share a lookup; frequent ones differ."""
        self.vocab.train_seen("a", 300)
        self.vocab.train_seen("b")
        self.vocab.train_seen("c")
        self.vocab.finalize()

        # Infrequent words should look the same
        self.assertEqual(self.vocab.vocab_lookup("b"),
                         self.vocab.vocab_lookup("c"))

        # Infrequent words should look the same as never seen words
        self.assertEqual(self.vocab.vocab_lookup("b"),
                         self.vocab.vocab_lookup("d"))

        # The frequent word should be different from the infrequent word
        self.assertNotEqual(self.vocab.vocab_lookup("a"),
                            self.vocab.vocab_lookup("b"))

    def test_censor(self):
        """Tokenizing replaces infrequent and unseen words by the same token."""
        self.vocab.train_seen("a", 300)
        self.vocab.train_seen("b")
        self.vocab.train_seen("c")
        self.vocab.finalize()

        censored_a = self._tokens("a b d")
        censored_b = self._tokens("d b a")
        censored_c = self._tokens("a b d")
        censored_d = self._tokens("b d a")

        # Tokenizing the same text twice gives the same result
        self.assertEqual(censored_a, censored_c)
        # Swapping the infrequent "b" with the unseen "d" changes nothing
        self.assertEqual(censored_b, censored_d)

        # Three input words yield exactly three tokens
        self.assertEqual(len(censored_a), 3)
        # The frequent word "a" gets the same token wherever it appears
        self.assertEqual(censored_a[0], censored_b[2])
        # Infrequent "b" and unseen "d" get the same token
        self.assertEqual(censored_a[1], censored_b[0])

    def test_tf(self):
        """Term frequency is the word's share of all tokens added so far."""
        self.vocab.train_seen("a", 300)
        self.vocab.finalize()
        self.vocab.add_document("a a b")

        word_a = self.vocab.vocab_lookup("a")
        word_b = self.vocab.vocab_lookup("b")
        word_c = self.vocab.vocab_lookup("c")

        # "a" is two of the three tokens in the only document
        self.assertAlmostEqual(self.vocab.term_freq(word_a), 2.0 / 3.0)
        # "b" and "c" were never trained; both are expected to report the
        # frequency of the single remaining token
        self.assertAlmostEqual(self.vocab.term_freq(word_b), 1.0 / 3.0)
        self.assertAlmostEqual(self.vocab.term_freq(word_c), 1.0 / 3.0)

    def test_df(self):
        """Inverse document frequency is log10(documents / documents with word)."""
        self.vocab.train_seen("a", 300)
        self.vocab.train_seen("b", 100)
        self.vocab.finalize()

        self.vocab.add_document("a a b")
        self.vocab.add_document("b b c")
        self.vocab.add_document("a a a")
        self.vocab.add_document("a a a")

        word_a = self.vocab.vocab_lookup("a")
        word_b = self.vocab.vocab_lookup("b")
        word_c = self.vocab.vocab_lookup("c")
        word_d = self.vocab.vocab_lookup("d")

        # "a" occurs in three of the four documents, "b" in two
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_a),
                               log10(4.0 / 3.0))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_b), log10(2.0))
        # The untrained "c" occurs in one document; the never-seen "d" is
        # expected to share its value
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_c), log10(4.0))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_d), log10(4.0))
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()