Skip to content

Commit

Permalink
Add update option to scrapers.
Browse files Browse the repository at this point in the history
  • Loading branch information
darkdragon-001 committed May 17, 2021
1 parent 60d39f1 commit a87934d
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 17 deletions.
13 changes: 7 additions & 6 deletions banz_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""BAnz-Scraper.
Usage:
banz_scaper.py <outputfile> [<minyear> [<maxyear>]]
banz_scaper.py <outputfile> [update | <minyear> [<maxyear>]]
banz_scaper.py -h | --help
banz_scaper.py --version
Expand All @@ -19,6 +19,7 @@
banz_scaper.py data/banz.json
"""
import sys
from pathlib import Path
import re
import json
Expand Down Expand Up @@ -49,7 +50,7 @@ def post(self, *args, **kwargs) -> Response:
"Referer": "https://www.bundesanzeiger.de/"
})

def scrape(self, low=0, high=10000):
def scrape(self, low=0, high=sys.maxsize):
collection = {}
years = self.get_years()
for year in years:
Expand Down Expand Up @@ -147,15 +148,15 @@ def get_items(self, year, date: Tuple[str, str]):


def main(arguments):
    """Scrape the BAnz and merge the result into the JSON output file.

    Args:
        arguments: Mapping of parsed command-line arguments (presumably
            produced by docopt from the module usage string -- confirm).
            Keys read: ``'<outputfile>'``, ``'<minyear>'``, ``'<maxyear>'``
            and ``'update'``.

    If the output file already exists, its content is loaded first and the
    freshly scraped entries are merged on top of it. With ``update`` set,
    scraping starts at the most recent year already present in the file.
    """
    outputfile = arguments['<outputfile>']
    banz = BAnzScraper()

    # Start from previously scraped data so a run extends the file instead
    # of replacing it.
    data = {}
    if Path(outputfile).exists():
        with open(outputfile) as f:
            data = json.load(f)

    minyear = int(arguments['<minyear>'] or 0)
    maxyear = int(arguments['<maxyear>'] or sys.maxsize)
    if arguments['update'] and data:
        # Resume from the newest year already stored. Assumes every value in
        # ``data`` is a list of dicts carrying a 'year' key -- TODO confirm
        # against the scraper's output. ``default`` keeps the lower bound
        # when no TOC entries exist, where a bare max() would raise
        # ValueError.
        minyear = max(
            (toc_entry['year'] for pub in data.values() for toc_entry in pub),
            default=minyear,
        )

    data.update(banz.scrape(minyear, maxyear))
    with open(outputfile, 'w') as f:
        json.dump(data, f, indent=4)
Expand Down
10 changes: 5 additions & 5 deletions bgbl_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""BGBl-Scraper.
Usage:
bgbl_scaper.py <outputfile> [<minyear> [<maxyear>]]
bgbl_scaper.py <outputfile> [update | <minyear> [<maxyear>]]
bgbl_scaper.py -h | --help
bgbl_scaper.py --version
Expand Down Expand Up @@ -152,15 +152,15 @@ def get_number_toc(self, number_id, number_did):
return toc

def main(arguments):
    """Scrape the BGBl and merge the result into the JSON output file.

    Args:
        arguments: Mapping of parsed command-line arguments (presumably
            produced by docopt from the module usage string -- confirm).
            Keys read: ``'<outputfile>'``, ``'<minyear>'``, ``'<maxyear>'``
            and ``'update'``.

    If the output file already exists, its content is loaded first and the
    freshly scraped entries are merged on top of it. With ``update`` set,
    scraping starts at the most recent year already present in the file.
    """
    outputfile = arguments['<outputfile>']
    bgbl = BGBLScraper()

    # Start from previously scraped data so a run extends the file instead
    # of replacing it.
    data = {}
    if Path(outputfile).exists():
        with open(outputfile) as f:
            data = json.load(f)

    minyear = int(arguments['<minyear>'] or 0)
    maxyear = int(arguments['<maxyear>'] or sys.maxsize)
    if arguments['update'] and data:
        # Resume from the newest year already stored. Assumes every value in
        # ``data`` is a list of dicts carrying a 'year' key -- TODO confirm
        # against the scraper's output. ``default`` keeps the lower bound
        # when no TOC entries exist, where a bare max() would raise
        # ValueError.
        minyear = max(
            (toc_entry['year'] for pub in data.values() for toc_entry in pub),
            default=minyear,
        )

    data.update(bgbl.scrape(minyear, maxyear))
    with open(outputfile, 'w') as f:
        json.dump(data, f, indent=4)
Expand Down
11 changes: 5 additions & 6 deletions vkbl_scraper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""VkBl-Scraper.
Usage:
vkbl_scaper.py <outputfile> [<minyear> [<maxyear>]]
vkbl_scaper.py <outputfile> [update | <minyear> [<maxyear>]]
vkbl_scaper.py -h | --help
vkbl_scaper.py --version
Expand Down Expand Up @@ -126,17 +126,16 @@ def scrape(self, low=1947, high=datetime.datetime.now().year):


def main(arguments):
    """Scrape the VkBl and merge the result into the JSON output file.

    Args:
        arguments: Mapping of parsed command-line arguments (presumably
            produced by docopt from the module usage string -- confirm).
            Keys read: ``'<outputfile>'``, ``'<minyear>'``, ``'<maxyear>'``
            and ``'update'``.

    If the output file already exists, its content is loaded first and the
    freshly scraped entries are merged on top of it. With ``update`` set,
    scraping starts at the most recent year already present in the file.
    """
    outputfile = arguments['<outputfile>']
    vkbl = VkblScraper()

    # Start from previously scraped data so a run extends the file instead
    # of replacing it.
    data = {}
    if Path(outputfile).exists():
        with open(outputfile) as f:
            data = json.load(f)

    minyear = int(arguments['<minyear>'] or 1947)
    maxyear = int(arguments['<maxyear>'] or datetime.datetime.now().year)
    if arguments['update'] and data:
        # Resume from the newest year already stored. Assumes every value in
        # ``data`` is a list of dicts carrying a 'year' key -- TODO confirm
        # against the scraper's output. ``default`` keeps the lower bound
        # when no TOC entries exist, where a bare max() would raise
        # ValueError.
        minyear = max(
            (toc_entry['year'] for pub in data.values() for toc_entry in pub),
            default=minyear,
        )

    # The scrape must run only after the year bounds (including the
    # 'update' adjustment) are final; calling it earlier ignores them.
    data.update(vkbl.scrape(minyear, maxyear))
    with open(outputfile, 'w') as f:
        json.dump(data, f)

Expand Down

0 comments on commit a87934d

Please sign in to comment.