-
Notifications
You must be signed in to change notification settings - Fork 0
/
arxiv-highlights.py
65 lines (54 loc) · 1.94 KB
/
arxiv-highlights.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python3
"""Python script to scan arxiv papers and create a list of highlights
based on author citation counts.
Usage:
./arxiv-highlights.py [-n ntop] [-o filename] [-a listings]
"""
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from arxiv_crawler.spiders.arxiv_spider import ArxivSpider
import contextlib, os, json, sys, argparse
def top_entries(entries, ntop):
    """Return the ``ntop`` highest-scoring entries, best first.

    ``entries`` is an iterable of mappings that each carry a ``'score'`` key.
    Fewer than ``ntop`` entries are returned when the input is shorter, and
    an empty list is returned when ``ntop`` is zero or negative.
    """
    if ntop <= 0:
        return []
    # Sort ascending and then reverse (rather than reverse=True) so entries
    # with equal scores keep the relative order the reversed walk over the
    # ascending list used to give them.
    ranked = sorted(entries, key=lambda entry: entry['score'])
    ranked.reverse()
    # The previous slice ``[:-ntop:-1]`` stopped one element early and
    # therefore only ever produced ntop - 1 entries.
    return ranked[:ntop]


def format_entry(entry):
    """Return the printable text block for one scraped paper.

    ``entry`` must provide the keys ``'title'``, ``'score'``, ``'authors'``
    (an iterable of strings), ``'subject'`` and ``'abstract'``. The returned
    text has no trailing newline; ``print`` supplies it.
    """
    # The odd spacing around the subject line is kept on purpose: it is
    # exactly what the earlier multi-argument print() calls emitted.
    return "'{}' ({})\n - by {} \n {} \n\n{}".format(
        entry['title'],
        entry['score'],
        ', '.join(entry['authors']),
        entry['subject'],
        entry['abstract'],
    )


def main(argv=None):
    """Crawl the arxiv listings and print the top-scored papers.

    ``argv`` is the list of command-line arguments to parse; ``None`` makes
    argparse fall back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description='Scrape data from arxiv and create top list of daily papers')
    parser.add_argument('-o', action='store', default='arxiv.json', dest='filename')
    parser.add_argument('-a', action='store', default=None, dest='listings')
    parser.add_argument('-n', action='store', type=int, default=8, dest='ntop')
    args = parser.parse_args(argv)

    # remove json if already there, so this run starts from a fresh file
    with contextlib.suppress(FileNotFoundError):
        os.remove(args.filename)

    # get settings and make the crawl write its items out as json
    # NOTE(review): newer Scrapy releases configure exports through the
    # FEEDS setting -- confirm these two keys are still honoured by the
    # installed version.
    settings = get_project_settings()
    settings.set('FEED_FORMAT', 'json')
    settings.set('FEED_URI', args.filename)

    # create the crawler process
    process = CrawlerProcess(settings)

    # A throwaway spider instance is built only to read which listings it
    # would visit; the crawl itself creates its own spider below.
    arxivlist = ', '.join(ArxivSpider(filename=args.listings).listings)

    # start the 'arxiv' spider. crawl() is given the spider class plus its
    # constructor arguments, as the Scrapy API documents, so that the
    # ``filename`` argument actually reaches the spider that runs.
    process.crawl(ArxivSpider, filename=args.listings)
    process.start()  # the script will block here until the crawling is finished
    print()

    with open(args.filename) as data_file:
        data = json.load(data_file)

    print('\n# Starting ranked list of today\'s top ' +
          str(args.ntop) + ' on ' + arxivlist + '\n')

    # loop over the ntop best scored values, separated by a ruler line
    for position, entry in enumerate(top_entries(data, args.ntop)):
        if position:
            print('====================\n')
        print(format_entry(entry))


if __name__ == '__main__':
    main()