LDA_Model.py
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 17 17:25:05 2022
@author: Acer
"""
# Number of topics (i.e. dockets) in this corpus: 37
# Tutorial: https://github.com/rsreetech/LDATopicModelling
#           https://www.youtube.com/watch?v=nNvPvvuPnGs
import pandas as pd
import string
import json
import argparse
import pickle
import spacy
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from Data_Preprocessing_for_Topic_Models import (clean_text, read_cases,
                                                 lemmatization,
                                                 create_term_matrix)
def fit_model(dictionary, dataset, output_filename, num_topics):
    """Fit an LDA model on a bag-of-words corpus and save it to disk."""
    model = LdaModel(corpus=dataset,
                     num_topics=num_topics,
                     id2word=dictionary,
                     passes=100,       # training passes over the corpus
                     random_state=1)   # fixed seed for reproducibility
    model.save(output_filename)
    return model


def load_model(filename):
    """Load a previously saved LDA model from disk."""
    return LdaModel.load(filename)
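

# A minimal sketch (an addition, not part of the original pipeline) of the
# input format fit_model() expects: `dictionary` is a gensim
# corpora.Dictionary and `dataset` is its bag-of-words corpus. In this
# project both come from read_cases(); the hypothetical helper below builds
# them from documents that are already tokenised.
def build_corpus(tokenized_docs):
    dictionary = corpora.Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
    return dictionary, corpus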
def output_topics(model, num_topics):
    """Pickle and return the top-10 weighted terms for every topic."""
    # (An earlier, since-disabled variant wrote the top-1000 terms per topic
    # to "lda_model_topics_num_topics=<k>.txt" instead of pickling them.)
    words_per_topic_dict = {}
    for topicno in range(num_topics):
        sorted_topic_terms = model.print_topic(topicno=topicno, topn=10)
        words_per_topic_dict[topicno] = sorted_topic_terms
    # Note: the file is pickled despite the .json extension; the name is kept
    # so downstream scripts that expect this filename still work.
    with open('LDA_words_per_topic_num_topics=' + str(num_topics) + '.json', 'wb') as f:
        pickle.dump(words_per_topic_dict, f)
    # Read the file straight back as a sanity check on the dump.
    with open('LDA_words_per_topic_num_topics=' + str(num_topics) + '.json', 'rb') as f:
        words_per_topic_dict = pickle.load(f)
    return words_per_topic_dict
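

# Hedged sketch (not in the original script): u_mass coherence is one way to
# compare different --num_topics settings; it needs only the bag-of-words
# corpus and dictionary, not the raw texts.
def topic_coherence(model, corpus, dictionary):
    from gensim.models.coherencemodel import CoherenceModel
    cm = CoherenceModel(model=model, corpus=corpus,
                        dictionary=dictionary, coherence='u_mass')
    return cm.get_coherence()  # higher values indicate better coherence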
def output_doc_topic_distribution(model, doc_term_matrix, num_topics):
    """Pickle and return the top-3 topics per document, numbered from 1."""
    # (An earlier, since-disabled variant wrote each document's topics to
    # "lda_model_topics_per_doc_num_topics=<k>.txt" instead of pickling them.)
    all_topics = model.get_document_topics(doc_term_matrix)  # , minimum_probability=0.1)
    topics_per_doc_dict = {}
    case_id = 0
    for doc_topics in all_topics:
        case_id += 1  # documents are numbered from 1
        doc_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
        if len(doc_topics) > 3:
            doc_topics = doc_topics[:3]  # keep only the top 3 topics per document
        print('Document ' + str(case_id) + ' has most likely topics: ' + str(doc_topics) + '\n')
        topics_per_doc_dict[case_id] = doc_topics
    # Pickled despite the .json extension (see output_topics); read back as a
    # sanity check on the dump.
    with open('LDA_topics_per_doc_num_topics=' + str(num_topics) + '.json', 'wb') as f:
        pickle.dump(topics_per_doc_dict, f)
    with open('LDA_topics_per_doc_num_topics=' + str(num_topics) + '.json', 'rb') as f:
        topics_per_doc_dict = pickle.load(f)
    return topics_per_doc_dict
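

# Hedged sketch (an addition): scoring a single unseen document. doc2bow()
# converts a token list into the bag-of-words format get_document_topics()
# expects; `tokens` would come from the same preprocessing pipeline
# (clean_text/lemmatization) as the training corpus.
def topics_for_new_doc(model, dictionary, tokens, top_n=3):
    bow = dictionary.doc2bow(tokens)
    doc_topics = sorted(model.get_document_topics(bow),
                        key=lambda x: x[1], reverse=True)
    return doc_topics[:top_n]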
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fit LDA on court cases")
    parser.add_argument('--limit', type=int, default=-1,
                        help="Limit on the number of cases (-1 = no limit)")
    parser.add_argument('--cases_source', type=str,
                        default="bverfg230107_with_break_noNaN.csv")
    parser.add_argument('--num_topics', type=int, default=200)
    flags = parser.parse_args()

    # --- Save and load the preprocessed data ---
    # Uncomment this section for the first run, to build and pickle the
    # dictionary and cases; comment it out again once they are saved.
    '''
    dictionary, cases = read_cases(flags.cases_source, limit=flags.limit)
    with open('read_cases_dictionary.json', 'wb') as f:
        pickle.dump(dictionary, f)
    with open('read_cases_cases.json', 'wb') as f:
        pickle.dump(cases, f)
    '''
    # On subsequent runs, load the pickled dictionary and cases instead.
    with open('read_cases_dictionary.json', 'rb') as f:
        dictionary = pickle.load(f)
    with open('read_cases_cases.json', 'rb') as f:
        cases = pickle.load(f)

    # Optionally dump the bag-of-words corpus to a text file for inspection:
    '''
    corpora_file = open("corpora.txt", "w")
    n = corpora_file.write(str(cases))
    corpora_file.close()
    '''

    model = fit_model(dictionary, cases,
                      output_filename="lda_model_num_topics=" + str(flags.num_topics) + ".save",
                      num_topics=flags.num_topics)
    # Reload the saved model as a sanity check on the save file.
    model = load_model(filename="lda_model_num_topics=" + str(flags.num_topics) + ".save")
    words_per_topic_dict = output_topics(model, num_topics=flags.num_topics)
    topics_per_doc_dict = output_doc_topic_distribution(model, doc_term_matrix=cases,
                                                        num_topics=flags.num_topics)
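
# Example invocation (assumes read_cases_dictionary.json and
# read_cases_cases.json already exist from a first run):
#   python LDA_Model.py --cases_source bverfg230107_with_break_noNaN.csv --num_topics 37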