-
Notifications
You must be signed in to change notification settings - Fork 11
/
config.yaml
79 lines (68 loc) · 2.13 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Scrapers without a custom setting fall back to the values in `default`.
# Any value in the `default` section can also be overridden per scraper.
# Baseline settings, grouped into dispatch, downloader and extractor stages.
default:
  dispatch:
    service:
      type: local
    ratelimit:
      type: qps
      value: 1
  downloader:
    save_metadata: true
    save_data:
      service: local
    # NOTE(review): file_template is placed beside save_data (not inside it)
    # because the per-scraper sections set file_template directly under
    # downloader/extractor - confirm against the consuming tool's schema.
    file_template: "source/{scraper_name}.html"
  extractor:
    save_data:
      service: local
    file_template: "extracted/{scraper_name}.json"
# Per-scraper overrides (one top-level section per scraper)
# `minimal` scraper: overrides only the downloader/extractor file templates.
minimal:
  downloader:
    file_template: "source/{scraper_name}/minimal.html"
  extractor:
    file_template: "extracted/{scraper_name}/minimal.json"
# `multiple_extractors` scraper: the extractor template carries an
# {extractor_name} placeholder in addition to {scraper_name}.
multiple_extractors:
  downloader:
    file_template: "source/{scraper_name}/multiple_extractors.html"
  extractor:
    file_template: "extracted/{scraper_name}/multi_test_{extractor_name}.json"
# `json_source` scraper: both the source and extracted templates end in .json.
json_source:
  downloader:
    file_template: "source/{scraper_name}/json_source.json"
  extractor:
    file_template: "extracted/{scraper_name}/json_source_data.json"
# `qa_results` scraper: overrides only the downloader/extractor file templates.
qa_results:
  downloader:
    file_template: "source/{scraper_name}/qa_source.html"
  extractor:
    file_template: "extracted/{scraper_name}/qa_data.json"
# `multiple_sources` scraper: the downloader template carries a {source_name}
# placeholder, while the extractor template uses a single fixed file name.
multiple_sources:
  downloader:
    file_template: "source/{scraper_name}/multiple_sources_{source_name}.json"
  extractor:
    file_template: "extracted/{scraper_name}/multiple_sources.json"
# `page_trigger_download` scraper: both templates carry a {page} placeholder.
page_trigger_download:
  downloader:
    file_template: "source/{scraper_name}/page_{page}.html"
  extractor:
    file_template: "extracted/{scraper_name}/page_{page}.json"
# `page_dispatch` scraper: both templates carry a {page} placeholder.
page_dispatch:
  downloader:
    file_template: "source/{scraper_name}/page_{page}.html"
  extractor:
    file_template: "extracted/{scraper_name}/page_{page}.json"
# `gen_cookie_requests` scraper: overrides only the downloader/extractor file
# templates.
gen_cookie_requests:
  downloader:
    file_template: "source/{scraper_name}/gen_cookie_requests.json"
  extractor:
    file_template: "extracted/{scraper_name}/gen_cookie_requests_data.json"
# `dispatch_cookie_selenium` scraper: sets its own dispatch rate limit and file
# templates that carry a {ref_id} placeholder.
dispatch_cookie_selenium:
  dispatch:
    ratelimit:
      type: period
      # NOTE(review): 0.017 is roughly 1/60, which would match the "~1 min"
      # note if the period unit is hours - confirm against the consumer.
      value: 0.017  # ~1 min
  downloader:
    file_template: "source/{scraper_name}/dispatch_cookie_selenium_{ref_id}.json"
  extractor:
    file_template: "extracted/{scraper_name}/dispatch_cookie_selenium_{ref_id}.json"