-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_downloader.py
executable file
·137 lines (113 loc) · 4.5 KB
/
pdf_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import namedtuple
from itertools import imap
import multiprocessing
import os
import requests
import shutil
import sys
""" Download pairs of pdf files
Input format:
stripped_url<TAB>source_pdf<TAB>target_pdf<TAB>source_page<TAB>target_page
Output:
Directories with a single pair of .pdf file and log file documenting the files'
origins.
"""
def make_request(url, session, timeout=1.0):
    """Issue a streaming GET for *url* and verify the response is a PDF.

    Args:
        url: URL to fetch.
        session: requests.Session the request is issued through.
        timeout: seconds to wait before giving up (default 1.0 preserves
            the previously hard-coded value).

    Returns:
        (True, response) when the server answered 200 with a PDF
        content-type; (False, reason_string) on any failure.
        KeyboardInterrupt/SystemExit are re-raised so the caller can
        abort cleanly.
    """
    try:
        r = session.get(url, stream=True, timeout=timeout)
    except requests.exceptions.ConnectionError:
        return False, "connection refused for %s" % url
    except requests.exceptions.InvalidSchema:
        return False, "invalid schema %s" % url
    except requests.exceptions.TooManyRedirects:
        return False, "too many redirects for %s" % url
    except requests.exceptions.Timeout:
        # Previously fell into the generic handler below; report it
        # explicitly since a slow server is the most common failure.
        return False, "timeout for %s" % url
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        # requests raises many more subclasses (InvalidURL, SSLError, ...);
        # treat anything else as a soft failure rather than crashing.
        return False, "other error: %s" % str(e)
    if r.status_code != 200:
        return False, "HTTP response not OK: %s for %s" % (r.status_code, url)
    content_type = r.headers.get('content-type', '')
    if 'pdf' not in content_type.lower():
        return False, "wrong content type: %s" % content_type
    return True, r
def download_pair(candidate, basedir, session):
    """Download a candidate's source/target PDFs into a fresh directory.

    Args:
        candidate: CandidatePair carrying stripped_url, source_url,
            target_url (the page fields are not used here).
        basedir: root directory; the pair is stored under a 3x3x3-digit
            sub-path derived from the hash of the stripped URL.
        session: requests.Session shared across downloads.

    Returns:
        (True, "Success") or (False, human-readable failure reason).
        On a failed body download the partially written directory is
        removed so no half-pairs remain on disk.
    """
    source_pdf = candidate.source_url
    target_pdf = candidate.target_url
    # 0. Compute the output path from the hashed stripped URL.
    # NOTE(review): str(hash(...)) is only reproducible across runs when
    # hash randomization is off (Python 2, or PYTHONHASHSEED fixed); a
    # content hash (hashlib.md5) would be stable but would change the
    # on-disk layout, so the original scheme is kept — confirm intent.
    h = str(hash(candidate.stripped_url))
    path = os.path.join(basedir, h[1:4], h[4:7], h[7:10])
    if os.path.exists(path):
        # Duplicate download?
        return False, "Target path exists already: %s" % path
    # 1. Check that both files exist and have the correct content type
    # before creating any local state.
    success, source_r = make_request(source_pdf, session)
    if not success:
        return False, source_r
    success, target_r = make_request(target_pdf, session)
    if not success:
        return False, target_r
    # 2. Make the target directory. makedirs is the authoritative
    # duplicate check: it raises OSError if the path appeared since the
    # test above (the original's extra exists() pre-check was itself racy
    # and is dropped).
    try:
        os.makedirs(path)
    except OSError:
        return False, "Target path exists already: %s" % path
    # 3. Record the files' original URLs.
    # Fix: the log was opened with the invalid mode 'wc' and the handle
    # was never closed; open with 'w' inside a context manager instead.
    with open(os.path.join(path, "log.txt"), 'w') as log:
        log.write("%s\t%s\n" % (os.path.join(path, 'source.pdf'), source_pdf))
        log.write("%s\t%s\n" % (os.path.join(path, 'target.pdf'), target_pdf))
    # 4. Stream both PDF bodies to disk.
    try:
        for r, name in [(source_r, 'source.pdf'), (target_r, 'target.pdf')]:
            with open(os.path.join(path, name), 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        # Fix: narrowed from a bare 'except:'; roll back the directory so
        # a later run can retry this pair.
        shutil.rmtree(path, ignore_errors=True)
        return False, "download error"
    return True, "Success"
# One tab-separated input record: the normalized URL used for hashing and
# deduplication, the two PDF URLs, and the web pages they were found on.
CandidatePair = namedtuple('CandidatePair', [
    'stripped_url',
    'source_url',
    'target_url',
    'source_page',
    'target_page',
])
# Shared HTTP session so connections are reused across downloads.
session = requests.Session()

def process_line(line):
    """Parse one TAB-separated candidate line and download its PDF pair.

    Relies on the module-level `session` and on `args.downloaddir` being
    set by the __main__ block before the first call. NOTE(review): with
    multiprocessing this works under fork (children inherit `args`) but
    would break under the spawn start method — confirm target platform.

    Fix: strip the trailing newline before splitting, so the last field
    (target_page) no longer carries a stray newline character.

    Returns:
        (success, reason, candidate) for the caller's error reporting.
    """
    candidate = CandidatePair(*line.rstrip('\n').split('\t'))
    success, reason = download_pair(candidate, args.downloaddir, session)
    return success, reason, candidate
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-lang', help='language codes')
    parser.add_argument('-candidates',
                        help='candidates from url stripper',
                        type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('-downloaddir',
                        help='download base directory', required=True)
    parser.add_argument('-threads', default=1, type=int,
                        help='Number of concurrent downloads')
    args = parser.parse_args(sys.argv[1:])
    # Fix: validate the directory explicitly instead of with 'assert',
    # which is silently skipped under 'python -O'.
    if not os.path.exists(args.downloaddir):
        sys.exit("download directory does not exist: %s" % args.downloaddir)
    errors, total = 0, 0
    pool = None
    if args.threads > 1:
        # Processes (not threads) despite the flag name: each worker
        # inherits the module globals (args, session) via fork.
        pool = multiprocessing.Pool(processes=args.threads)
        it = pool.imap_unordered(process_line, args.candidates)
    else:
        it = imap(process_line, args.candidates)
    for success, reason, candidate in it:
        total += 1
        if not success:
            sys.stderr.write("Error %d/%d '%s' processing %s <-> %s\n" %
                             (errors, total, reason,
                              candidate.source_url, candidate.target_url))
            errors += 1
    if pool is not None:
        # Fix: shut the worker pool down cleanly instead of leaking it.
        pool.close()
        pool.join()
    sys.stderr.write("Wrote %d out of %d candidate pairs\n" %
                     (total - errors, total))