-
Notifications
You must be signed in to change notification settings - Fork 4
/
document.py
58 lines (48 loc) · 1.58 KB
/
document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
__author__ = 'tan'
# A single document
class Document(object):
    """One document of the corpus.

    Attributes:
        words: list of the document's tokens; ``DataSet.load`` appends the
            integer vocabulary id of each token, in reading order.
        length: number of tokens in the document (0 until it is filled in).
    """

    def __init__(self):
        # A new document starts out empty; the loader populates both fields.
        self.length = 0
        self.words = list()
# The whole collection of documents
class DataSet(object):
    """A collection of documents plus the word <-> id vocabulary built from them.

    Attributes:
        M: number of documents held in ``docs``.
        V: number of distinct words in the vocabulary.
        docs: list of ``Document`` objects, in the order they were loaded.
        word2id: dict mapping each word to its integer id.
        id2word: dict mapping each integer id back to its word.
    """

    def __init__(self):
        self.M = 0
        self.V = 0
        self.docs = []
        self.word2id = {}
        self.id2word = {}

    def load(self, filename):
        """Read documents from ``filename`` and convert their words to ids.

        Every non-blank line is one document and its words are separated by
        whitespace. A word receives the next free id the first time it is
        seen. Ids continue from the current vocabulary size, so calling
        ``load`` again on the same instance extends the data set instead of
        handing out ids that are already taken.

        Args:
            filename: path of the text file to read.
        """
        # NOTE(review): the file is opened with the platform default
        # encoding -- confirm that matches how the corpus files are stored.
        with open(filename, "r") as f:
            print("Loading data from " + filename)
            for line in f:
                words = line.split()
                if not words:
                    # Blank or whitespace-only lines are not documents.
                    continue
                doc = Document()
                for word in words:
                    word_id = self.word2id.get(word)
                    if word_id is None:
                        # Ids are dense (0..V-1), so the vocabulary size is
                        # always the next unused id.
                        word_id = len(self.word2id)
                        self.word2id[word] = word_id
                        self.id2word[word_id] = word
                    doc.words.append(word_id)
                doc.length = len(words)
                self.docs.append(doc)
        # Derive the counters from the containers so they stay consistent
        # even after several calls to load().
        self.M = len(self.docs)
        self.V = len(self.word2id)
        print('There are %d documents' % self.M)
        print('There are %d items' % self.V)

    def save_vocabulary(self, filename):
        """Write the vocabulary to ``filename``, one ``word<TAB>id`` per line.

        Args:
            filename: path of the file to create or overwrite.
        """
        with open(filename, 'w') as f:
            for word, word_id in self.word2id.items():
                f.write(word + '\t' + str(word_id) + '\n')