forked from Antfield-Creations/NDE-monitoring-file-formats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yaml
119 lines (112 loc) · 3.3 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Pipeline configuration: one stanza per data source (dans, nibg, kb,
# common_crawl) plus model settings.
#
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been flattened. Keys are grouped by the section comments and
# by which keys repeat per data source. Confirm against the config loader that
# `data` and `models` really live under `spec`, and that `csv_output_dir`
# belongs to `common_crawl`.
spec:
  # Leave null (blank) to have the config loader generate a timestamp-based
  # run id
  run_id: null

  data:
    dans:
      # Scrape settings
      root_url: https://archaeology.datastations.nl
      # `{page}` and `{doi}` look like placeholders filled in by the scraper
      # -- TODO confirm the substitution mechanism
      page_subpath: '/dataverse/root?q=&types=dataverses%3Adatasets&sort=dateSort&order=desc&page={page}'
      start_page: 1
      dataset_overview_api_subpath: '/api/v1/datasets/:persistentId?persistentId={doi}'
      dataset_versions_api_subpath: '/api/v1/datasets/:persistentId/versions?persistentId={doi}'
      scrape_log_path: data/dans/scrape_log.ndjson
      filetype_monthly_aggregate_path: data/dans/filetype_monthly.json
      date_json_path: 'data[?(versionNumber=1&versionMinorNumber=0)].productionDate'
      # Skip archaeology datastation ingestion artifacts
      file_skip_list:
        - dataset.xml
        - emd.xml
        - files.xml
        - provenance.xml

      # Aggregation settings
      # Maps alternative extensions onto one canonical extension
      filetype_mapping:
        .tiff: .tif
        .jpeg: .jpg

      # Analysis settings
      minimum_time_periods: 8
      min_year: 1997
      max_year: 2019
      # The number of last periods to measure if there was a decline
      decline_periods: 2
      img_output_dir: images/dans/
      num_test_measurements: 2
      # NOTE(review): "cumulatief" is Dutch for "cumulative"; these entries are
      # runtime values and must stay as-is. Presumably they select cumulative
      # variants of the plots -- confirm against the plotting code.
      mime_plots:
        - .jpg
        - .jpg cumulatief
        - .tif
        - .tif cumulatief
        - .pdf
        - .doc
        - .docx
        - .xml
        - .xlsx
        - .xls
        - .tab
        - .shp
        - .geojson
        - .mdb
        - .accdb
      linear_plots: []

    nibg:
      # Quoted so the leading `~` can never be read as a YAML null indicator;
      # NOTE(review): `~` expansion must be done by the consumer -- confirm
      raw_csv_path: '~/data/NDE/digital_files.csv'
      raw_csv_line_count: 4741017
      minimum_time_periods: 100
      json_output_dir: data/nibg/
      img_output_dir: images/nibg/
      num_test_measurements: 4
      linear_plots:
        - mxf
        - tif
        - tar
        - pdf
        - mpg
        - mp3

    kb:
      raw_csv_path: data/kb/kbData.csv
      raw_csv_line_count: 5048327
      minimum_time_periods: 10
      json_output_dir: data/kb/
      img_output_dir: images/kb/
      num_test_measurements: 4
      linear_plots:
        - application/msaccess
        - application/msexcel
        - application/mspowerpoint
        - application/msword
        - application/octet-stream
        - application/pdf
        - application/postscript
        - application/rtf
        - application/wordperfect
        - application/x-shockwave-flash
        - application/x-tar
        - application/zip
        - audio/mpeg
        - audio/x-wav
        - image/bmp
        - image/gif
        - image/jpeg
        - image/tiff
        - image/x-png
        - text/css
        - text/html
        - text/plain
        - video/mpeg
        - video/quicktime
        - video/x-msvideo

    common_crawl:
      # Metadata url
      collection_url: https://index.commoncrawl.org/collinfo.json
      # Source data url
      stats_url: https://raw.githubusercontent.com/commoncrawl/cc-crawl-statistics/master/plots/mimetypes_detected.csv
      # Which statistic to use to calculate usage over time
      usage_stat: urls
      # The number of crawls to use as test data
      num_test_crawls: 6
      # Selection of mime types to use to generate usage and model plots
      mime_plots:
        - application/xhtml+xml
        - image/gif
        - application/x-mobipocket-ebook
      csv_output_dir: data/common-crawl/

  models: []