"""
Standalone helper script.
Load nips pubs_ file, and adds to every paper its word counts under key
'pdf_text'. The PDF for each paper is downloaded from NIPS site.
"""
from repool_util import loadPubs, savePubs, stringToWordDictionary
from pdf_read import convertPDF
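
# repool_util and pdf_read are repo-local helpers; as used here, loadPubs and
# savePubs (de)serialize the publications list, convertPDF extracts plain text
# from a pdf (local path or url), and stringToWordDictionary turns a string
# into a {word: count} bag of words.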
pubs_all = loadPubs('pubs_nips')
print 'loaded pubs with %d entries.' % (len(pubs_all),)
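
# each publication is assumed to be a dict with a 'title' and an optional 'pdf'
# url (the local-file shortcut below expects 'NIPS' in the url); a hypothetical
# entry, for illustration only:
#   {'title': 'Some Paper', 'pdf': 'http://example.org/NIPS2010_0001.pdf'}
# after a successful run, p['pdf_text'] holds a {word: count} dictionary.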
# possibly place restrictions on pubs to process here (a sketch follows below)
pubs = pubs_all
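
# a hypothetical restriction, for illustration only ('year' is an assumed key):
# pubs = [p for p in pubs_all if p.get('year', 0) >= 2000]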

for i, p in enumerate(pubs):
    # if the pdf url does not exist, this could in the future use google
    # search to try to look up a link for the pdf first.
    if p.has_key('pdf') and not p.has_key('pdf_text'):

        # try to open the PDF from the downloaded location
        processed = False
        try:
            floc = p['pdf'].index('NIPS')
            fname = p['pdf'][floc:]
            txt = convertPDF('downloads/' + fname)
            processed = True
            print 'found %s in file!' % (p['title'],)
        except:
            pass
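
        # reaching this point without processed=True means the pdf was not
        # found locally; convertPDF is assumed to accept a url as well, since
        # it is given a local path above and p['pdf'] below.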
        if not processed:
            # download the PDF and convert it to text
            try:
                print 'downloading pdf for [%s] and parsing...' % (p.get('title', 'an untitled paper'),)
                txt = convertPDF(p['pdf'])
                processed = True
                print 'processed from url!'
            except:
                print 'error: unable to open or download the pdf from %s' % (p['pdf'],)
                print 'skipping...'

        if processed:
            # convert to bag of words and store
            try:
                p['pdf_text'] = stringToWordDictionary(txt)
            except:
                print 'was unable to convert text to bag of words. Skipped.'

    print '%d/%d = %.2f%% done.' % (i+1, len(pubs), 100*(i+1.0)/len(pubs))

savePubs('pubs_nips', pubs_all)
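
# usage: run from the repo root, with the 'pubs_nips' file present and any
# already-fetched pdfs in downloads/:
#   python nips_add_pdftext.py
#
# a minimal sketch of consuming the result (hypothetical, not part of the repo):
#   pubs = loadPubs('pubs_nips')
#   bow = pubs[0].get('pdf_text', {})
#   print sorted(bow.items(), key=lambda (w, c): -c)[:10]  # ten most frequent words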