-
Notifications
You must be signed in to change notification settings - Fork 1
/
collect_info_for_lda.py
253 lines (225 loc) · 9.41 KB
/
collect_info_for_lda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# -*- coding: utf-8 -*-
from nlp_models import session, HseArticle
from sqlalchemy import distinct
import re
import pymorphy2
import MySQLdb
import shelve
from bs4 import UnicodeDammit
# Stop-token list used to filter tokens before building the LDA corpus:
# digits, single Latin and Cyrillic letters, and common Russian function
# words (prepositions, pronouns, particles, auxiliary verbs).
# NOTE(review): the list contains duplicates ("бы", "быть", "вот", "все",
# "вы", "да", ...) and membership tests against it are O(n) per token;
# a set would be faster and cleaner — left as-is to avoid changing the
# module-level name's type for any external consumer.
words = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "a", "b",
"c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
"o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
"а","б","в","г","д","е", "ё","ж","з","и","л","м","н","о",
"п","р","с","т","у","ф","х","ц","ш","щ","ъ","ь","э","ю","я",
"большой", "бы", "быть", "в", "весь", "вот", "все",
"всей", "вы", "говорить", "год", "да", "для", "до", "еще",
"же", "знать", "и", "из", "к", "как", "который", "мочь",
"мы", "на", "наш", "не", "него", "нее", "нет", "них", "но",
"о", "один", "она", "они", "оно", "оный", "от", "ото", "по",
"с", "свой", "себя", "сказать", "та", "такой", "только", "тот",
"ты", "у", "что", "это", "этот", "я", "без", "более", "больше",
"будет", "будто", "бы", "был", "была", "были", "было", "быть",
"вам", "вас", "ведь", "весь", "вдоль", "вдруг", "вместо",
"вне", "вниз", "внизу", "внутри", "во", "вокруг", "вот",
"впрочем", "все", "всегда", "всего", "всех", "всю", "вы",
"где", "да", "давай", "давать", "даже", "для", "до",
"достаточно", "другой", "его", "ему", "ее", "её", "ей", "если",
"есть", "ещё", "еще", "же", "за", "за исключением", "здесь",
"из", "из-за", "из", "или", "им", "иметь", "иногда", "их",
"как-то", "кто", "когда", "кроме", "кто", "куда", "ли", "либо",
"между", "меня", "мне", "много", "может", "мое", "моё", "мои",
"мой", "мы", "на", "навсегда", "над", "надо", "наконец", "нас",
"наш", "не", "него", "неё", "нее", "ней", "нет", "ни",
"нибудь", "никогда", "ним", "них", "ничего", "но", "ну", "об",
"однако", "он", "она", "они", "оно", "опять", "от", "отчего",
"очень", "перед", "по", "под", "после", "потом", "потому",
"потому что", "почти", "при", "про", "раз", "разве", "свою",
"себя", "сказать", "снова","с", "со", "совсем", "так", "также",
"такие", "такой", "там", "те", "тебя", "тем", "теперь",
"то", "тогда", "того", "тоже", "той", "только", "том", "тот",
"тут", "ты", "уже", "хоть", "хотя", "чего", "чего-то", "чей",
"чем", "через", "что", "что-то", "чтоб", "чтобы", "чуть",
"чьё", "чья", "эта", "эти", "это", "эту", "этого", "этом",
"этот","к"]
# Convert the byte-string literals to unicode (Python 2) so that
# membership tests against unicode tokens compare correctly.
words = [UnicodeDammit(word).unicode_markup for word in words]
#Deprecated functionality
def collectInfo ():
morph = pymorphy2.MorphAnalyzer()
titles = session.query(distinct(HseArticle.title)).all()
print "len of distinct(titles) : " + str(len(titles))
titles_list = [x[0] for x in titles]
titles_dic = {}
isAuthor = re.compile('^Author:\s*', re.IGNORECASE)
isPubList = False
isPub = re.compile('^\thttp://publications.hse.ru/view/.*', re.IGNORECASE)
fileArr = open("logfile2.txt","r").readlines()
author_name = ""
for line in fileArr:
if isAuthor.match(line):
lineArr = line.split(":")[-1].split()
#print lineArr
lineArr = line.split()
authorUri = lineArr[-1]
author_name = ' '.join(lineArr[1:4])
author_name = UnicodeDammit(author_name).unicode_markup
# print author_name
# break
# for l in lineArr:
# if len(l) > 3:
# print l.lower()
# author_name = l.lower()
# break
if author_name == "":
isPubList = False
continue
else:
isPubList = True
elif isPubList:
if isPub.match(line):
print line.strip() + ' ' + author_name
pub = line.strip()
if author_name != "":
if pub not in titles_dic.keys():
titles_dic[pub] = []
if author_name not in titles_dic[pub]:
titles_dic[pub].append(authorUri)
# print UnicodeDammit(author_name).unicode_markup
result_list = []
authors = []
for uri in titles_dic.keys():
#print uri
collected_info = []
article = session.query(HseArticle)\
.filter(HseArticle.uri == uri)\
.first()
stop_string = ":.-()!,[]'\"|"
abstr_list = []
for x in article.abstr.split():
if x not in words:
x = x.strip(stop_string).lower()
abstr_list.append(morph.parse(UnicodeDammit(x).unicode_markup)[0].normal_form)
keyword_list = []
for x in article.keyword.split(";"):
for y in x.split(" "):
if y not in words:
y = y.strip(stop_string).lower()
keyword_list.append(morph.parse(UnicodeDammit(y).unicode_markup)[0].normal_form)
title_list = []
for x in article.title.split():
if x not in words:
x = x.strip(stop_string).lower()
title_list.append(morph.parse(UnicodeDammit(x).unicode_markup)[0].normal_form)
elib_list = []
for x in article.elib.split():
if x not in words:
x = x.strip(stop_string).lower()
elib_list.append(morph.parse(UnicodeDammit(x).unicode_markup)[0].normal_form)
interest_list = []
for x in article.interest.split():
if x not in words:
x = x.strip(stop_string).lower()
interest_list.append(morph.parse(UnicodeDammit(x).unicode_markup)[0].normal_form)
author_list = []
for x in article.authors.split():
if x not in words:
x = x.strip(stop_string).lower()
author_list.append(x)
# author_list.extend(titles_dic[uri])
# session.commit()
# article.authors = " ".join(titles_dic[uri])
# session.commit()
collected_info.extend(abstr_list)
collected_info.extend(keyword_list)
collected_info.extend(title_list)
collected_info.extend(elib_list)
collected_info.extend(interest_list)
collected_info.extend(author_list)
authors.extend(author_list)
result_list.append(collected_info)
return result_list,authors
#print author_list
def collectInfo2 ():
morph = pymorphy2.MorphAnalyzer()
titles = session.query(distinct(HseArticle.title)).all()
d = shelve.open("authors.list")
#titles_list = [x[0] for x in titles]
# titles_dic = {}
# isAuthor = re.compile('^Author:\s*', re.IGNORECASE)
# isPubList = False
# isPub = re.compile('^\thttp://publications.hse.ru/view/.*', re.IGNORECASE)
# fileArr = open("logfile2.txt","r").readlines()
# author_name = ""
# for line in fileArr:
# if isAuthor.match(line):
# lineArr = line.split(":")[-1].split()
# #print lineArr
# lineArr = line.split()
# authorUri = lineArr[-1]
# author_name = ' '.join(lineArr[1:4])
# author_name = UnicodeDammit(author_name).unicode_markup
# if author_name == "":
# isPubList = False
# continue
# else:
# isPubList = True
# elif isPubList:
# if isPub.match(line):
# # print line.strip() + ' ' + author_name
# pub = line.strip()
# if author_name != "":
# if pub not in titles_dic.keys():
# titles_dic[pub] = []
# if author_name not in titles_dic[pub]:
# titles_dic[pub].append(authorUri)
# # print UnicodeDammit(author_name).unicode_markup
# print len(titles_dic.keys())
titles_dic = {}
for authUri in d["authorUri2paper"].keys():
for paperUri in d["authorUri2paper"][authUri]:
if paperUri not in titles_dic.keys():
titles_dic[paperUri] = []
titles_dic[paperUri].append(authUri)
else:
if authUri not in titles_dic[paperUri]:
titles_dic[paperUri].append(authUri)
# print len(titles_dic.keys())
result_list = []
# authors = []
db = MySQLdb.connect(host="localhost", user="root", passwd="pass", db="nlp", charset='utf8')
cursor = db.cursor()
author_list = set([])
for uri in titles_dic.keys():
collected_info = []
sql = """SELECT abstr,keyword,title,elib,interest,authors
FROM hse_article WHERE uri = "{}" """.format(uri)
cursor.execute(sql)
article = cursor.fetchall()
for column in article:
# print column[5].split()
for a in column[5].split():
author_list.add(a)
# print re.split(';|,|\)|\(|"|\]|\[| |',column[5])
for word in column:
w = re.split(';|,|\)|\(|"|\]|\[| |',word)
for i in w:
if i == u"":
continue
i = morph.parse(i)[0].normal_form
i = i.lower()
if i not in words:
# isPub = re.compile('^http://www.hse.ru/org/.*', re.IGNORECASE)
# if isPub.match(i):
# print i
collected_info.append(i)
# else:
# print "Is common word!"
# print collected_info
# break
result_list.append(collected_info)
db.close()
print author_list
print len(author_list)
print result_list
return result_list
if __name__ == "__main__":
    # Script entry point: build the LDA corpus from the MySQL article table.
    collectInfo2()