-
Notifications
You must be signed in to change notification settings - Fork 2
/
recent_script.py
141 lines (116 loc) · 5.39 KB
/
recent_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# -*- coding: utf-8 -*-
### update in wikiwho_api_api.py as well
import sys
reload(sys)
sys.setdefaultencoding("utf-8") # to handle UnicodeDecode errors
import requests
from bs4 import BeautifulSoup
from traceback import format_exc
from os import path, listdir
from math import ceil
import sys
#from collections import OrderedDict
from operator import itemgetter
import pickle
from datetime import datetime # for arithmetic on dates
from numpy import mean, std # for computing standard scores
from pageviews import format_date, article_views
# to access functions from python files in another directory
sys.path.insert(0,'/home/priyanka/Documents/Wikipedia_Accuracy_Review/wikiwho_api/wikiwho_api')
from wikiwho_api_api import mainFunction
# can set parameters such as limit, offset, etc
url = 'https://en.wikipedia.org/w/index.php?title=Special:Search&limit=500&offset=0&profile=default&search=recently' ###
categ = path.basename(url)
recdir = 'PA_records' + path.sep ###
def nextrecord():
try:
records = listdir(recdir)
record = 1+int(max(records)[:9])
### todo: check for improperly named files
return format(record, '09')
except:
return format(1, '09')
if __name__ == '__main__':
try:
r = requests.get(url)
except:
print "Error while querying Special Search" + title
print format_exc()
bs = BeautifulSoup(r.text)
cnt = 0
d = []
results = bs.find('ul', 'mw-search-results')
for entry in results.find_all('li'):
try:
heading = entry.find('div','mw-search-result-heading')
link = "https://en.wikipedia.org" + heading.a.get('href')
title = heading.a.get('title')
if 'recently' in title.lower(): # title should not contain recently
continue
pageviews = article_views(title)
if (pageviews == None):
print 'Error while obtaining pageviews for article ' + title
continue
content = entry.find('div','searchresult')
context = content.contents[0] + '<b>'+content.contents[1].contents[0]+'</b>' + content.contents[2]
date = mainFunction(title) # importing function from wikiwho_api_api.py. This uses the findWord function in Wikiwho_simple.py
if (date == None):
print 'Error while obtaining date for article ' + title
continue
print cnt+1, title, pageviews, date
d.append([title,link, pageviews, context, str(date)]) # date is of type unicode
cnt = cnt+1
# if(cnt==500):
# break
except:
print "Error while parsing article " + title
print format_exc()
pv = []
date = []
for i in d:
pv.append(i[2]) # get all the pageviews
epoch = int(datetime.strptime(i[4], '%Y-%m-%d').strftime('%s'))
date.append(epoch) # get all the dates in epoch format for doing arithmetic operations on them
i.append(epoch) # appending epoch time to i
mean_pv = mean(pv)
std_pv = std(pv)
mean_date = mean(date)
std_date = std(date)
for i in d:
i. append(float(format(float((i[2] - mean_pv)/std_pv) - float(((i[5] - mean_date)/std_date)), '.2f'))) # to rank higher in severity, article should rank high in pageview and low date epoch
# od = OrderedDict(sorted(d.items(), key=lambda t:t[1][1], reverse=True)) # ordered dict in descending order of final score
od = sorted(d, key=itemgetter(6), reverse=True) # ordered list in descending order of pageviews
print '\n\nArticle rankings based on pageviews:\n'
for item in od:
print item
with open('PA_pickles/recently_ranking.pkl', 'wb') as f: ###
pickle.dump(od, f)
# if __name__ == '__main__':
# with open('PA_pickles/recently_ranking.pkl', 'rb') as f: ### use when od has already been created; comment above stuff
# od = pickle.load(f)
cnt = 0
counter = 100 #int(ceil(0.25*len(od))) # top 20% of rankings
#url = 'http://127.0.0.1:5000/ask' # url for POSTing to ask. Replace with Labs/PythonAnywhere instance if needed
for i in od:
# POSTing to ask
# data = {'question':'The article '+i[1]+' has the word 'recently' inserted in '+i[4]+'. Is it still correctly used? The context is:\n'+i[3],
# 'iframeurl':i[1]}
# r = requests.post(url, data=data)
fn = recdir + nextrecord() + 'q'
if path.exists(fn):
print('A billion questions reached! Start answering!')
exit()
with open(fn, 'w') as f:
try:
f.write('The article <a target="_blank" href="' + i[1] + '">' + i[0] +
'</a> has the word \'recently\' inserted in ' + i[4] +
'. Is it still correctly used? The context is:</br><i>' + i[3] +
'</i></br><a target="_blank" style="float:right;" href="'+i[1]+'">'+i[1]+'</a><iframe src="' + i[1] +
'" style="height: 40%; width: 100%;">[Can not display <a target="_blank" href="' +
i[1] + '">' + i[1] + '</a> inline as an iframe here.]</iframe>')
cnt = cnt + 1
except:
print 'Error while creating file'
print format_exc()
if (cnt == counter):
exit()