# -*- coding: utf-8 -*-
"""
@author:
Huang Ronggui
Department of Sociology
Fudan University
Note: gensim 3.2.0 does not work with numpy 1.13.3 from anaconda;
numpy needs to be installed with pip instead.
"""
import csv
import itertools
import os
import re
import sys

import jieba
import matplotlib.pyplot as plt
import numpy as np
import zhon.hanzi as hanzi
from gensim import corpora
from gensim import models
from sqlitedict import SqliteDict
os.chdir('南大寒假课程/实例')  # switch into the course data directory
csv.field_size_limit(sys.maxsize)  # allow very long fields in the csv
def word_tokenizer(doc):
    # remove user names (@-mentions) before tokenizing
    pat_uid = re.compile(u"@[a-zA-Z0-9一-龥-_]{2,30}")
    doc = pat_uid.sub('', doc)
    # remove short urls
    doc = re.compile(r"http:[a-zA-Z0-9./]*").sub('', doc)
    tokens = jieba.cut(doc)
    # remove single characters
    tokens = [el for el in tokens if len(el) > 1]
    # remove Chinese punctuation
    tokens = [el for el in tokens if el not in hanzi.punctuation]
    # remove tokens made up only of numbers and letters
    pat_num = re.compile("[0-9a-zA-Z]+")
    return [el for el in tokens if pat_num.sub('', el) != '']
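# Quick sanity check on a made-up post (hypothetical sample, not from the
# course data): the @-mention, the short url, punctuation, single characters,
# and number/letter-only tokens should all be dropped.
print(word_tokenizer(u"@测试用户 转发微博：http://t.cn/abc123 今天天气很好！"))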
def get_perplexity(model, corpus):
    # gensim's log_perplexity returns a per-word likelihood bound;
    # 2 ** (-bound) is the corresponding perplexity
    p = np.exp2(- model.log_perplexity(corpus))
    return p
file_path = "南大课程2018.csv"
file = open(file_path, errors="replace", encoding="utf8")
reader = csv.reader(file)
reader4d, reader4c = itertools.tee(reader, 2) # 2 independent iterators
# build the dictionary from the tokenized documents
corpus4dict = (word_tokenizer(line[0]) for line in reader4d)
dictionary = corpora.Dictionary()
dictionary.add_documents(corpus4dict)
dictionary.filter_extremes(no_below=20, no_above=0.5)
# drop platform stopwords: 微博 (weibo), 转发 (repost), 回复 (reply)
filtered_tokens = ['微博', '转发', '回复']
dictionary.filter_tokens(bad_ids=[dictionary.token2id[k] for k in filtered_tokens])
dictionary.compactify()
print(dictionary)
dictionary.save("NJU_dictionary")
# dictionary = corpora.Dictionary.load("NJU_dictionary")
# serialized corpus
corpus = (bow for bow in
          (dictionary.doc2bow(word_tokenizer(line[0])) for line in reader4c)
          if bow)  # skip documents that end up empty after filtering
corpora.MmCorpus.serialize("南大课程2018.mm", corpus)
# load the serialized corpus
mm_corpus = corpora.MmCorpus("南大课程2018.mm")
print(mm_corpus)
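# MmCorpus streams documents from disk, so unlike the one-shot csv generators
# above it can be iterated over repeatedly without holding everything in memory.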
# LDA topic models with 2 to 15 topics
model_dict = SqliteDict("ladmodels.sqlite", autocommit=True)
for k in range(2, 16):
    model = models.LdaModel(corpus=mm_corpus, num_topics=k, id2word=dictionary)
    # model evaluation: u_mass coherence
    cm = models.CoherenceModel(model=model, corpus=mm_corpus, dictionary=dictionary, coherence="u_mass")
    umass = cm.get_coherence()
    model_dict[k] = (model, umass)
    print(k, "is finished, u_mass is", umass)
# visualization of coherence scores across the number of topics
plt.plot(range(2, 16), [model_dict[e][1] for e in range(2, 16)], "ko-")
plt.show()
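# The trained models persist in ladmodels.sqlite, so a later session could
# reload one without retraining (a sketch, assuming the same working directory):
#   saved = SqliteDict("ladmodels.sqlite")
#   model5, umass5 = saved[5]
#   saved.close()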
# perplexity of each model on the training corpus
perplexity = []
for k in range(2, 16):
    print("calculating perplexity for model with %s topics" % k)
    model = model_dict[k][0]
    p = get_perplexity(model, mm_corpus)
    perplexity.append(p)
plt.plot(range(2, 16), perplexity, "ko-")
plt.show()
# coherence of each topic, for the 5- and 7-topic models
# (top_topics returns (topic, coherence) pairs sorted by coherence)
m5 = model_dict[5][0]
m5t = m5.top_topics(mm_corpus)
print([t[1] for t in m5t])
m7 = model_dict[7][0]
m7t = m7.top_topics(mm_corpus)
print([t[1] for t in m7t])
# choose the final model
model = model_dict[5][0]
# key words of each topic
print(model.show_topics())
# topic probabilities of the first document
doc0 = mm_corpus[0]
print(model[doc0])
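# A possible next step (a sketch): label every document with its most likely
# topic; model[bow] returns (topic_id, probability) pairs.
dominant = [max(model[bow], key=lambda t: t[1])[0] for bow in mm_corpus]
print(dominant[:10])  # dominant topic of the first ten documents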