diff --git a/banz_scraper.py b/banz_scraper.py
index 81135d57..457e6373 100755
--- a/banz_scraper.py
+++ b/banz_scraper.py
@@ -3,7 +3,7 @@
 """BAnz-Scraper.
 
 Usage:
-  banz_scaper.py <file> [<minyear> [<maxyear>]]
+  banz_scaper.py <file> [update | <minyear> [<maxyear>]]
   banz_scaper.py -h | --help
   banz_scaper.py --version
@@ -19,6 +19,7 @@
   banz_scaper.py data/banz.json
 """
 
+import sys
 from pathlib import Path
 import re
 import json
@@ -49,7 +50,7 @@ def post(self, *args, **kwargs) -> Response:
             "Referer": "https://www.bundesanzeiger.de/"
         })
 
-    def scrape(self, low=0, high=10000):
+    def scrape(self, low=0, high=sys.maxsize):
         collection = {}
         years = self.get_years()
         for year in years:
@@ -147,15 +148,15 @@ def get_items(self, year, date: Tuple[str, str]):
 
 
 def main(arguments):
-    minyear = arguments['<minyear>'] or 0
-    maxyear = arguments['<maxyear>'] or 10000
-    minyear = int(minyear)
-    maxyear = int(maxyear)
     banz = BAnzScraper()
     data = {}
     if Path(arguments['<file>']).exists():
         with open(arguments['<file>']) as f:
             data = json.load(f)
+    minyear = int(arguments['<minyear>'] or 0)
+    maxyear = int(arguments['<maxyear>'] or sys.maxsize)
+    if arguments['update'] and len(data) > 0:
+        minyear = max([toc_entry['year'] for pub in data.values() for toc_entry in pub])
     data.update(banz.scrape(minyear, maxyear))
     with open(arguments['<file>'], 'w') as f:
         json.dump(data, f, indent=4)
diff --git a/bgbl_scraper.py b/bgbl_scraper.py
index 5b5bc20e..2a91238a 100755
--- a/bgbl_scraper.py
+++ b/bgbl_scraper.py
@@ -3,7 +3,7 @@
 """BGBl-Scraper.
 
 Usage:
-  bgbl_scaper.py <file> [<minyear> [<maxyear>]]
+  bgbl_scaper.py <file> [update | <minyear> [<maxyear>]]
   bgbl_scaper.py -h | --help
   bgbl_scaper.py --version
@@ -152,15 +152,15 @@ def get_number_toc(self, number_id, number_did):
         return toc
 
 def main(arguments):
-    minyear = arguments['<minyear>'] or 0
-    maxyear = arguments['<maxyear>'] or 10000
-    minyear = int(minyear)
-    maxyear = int(maxyear)
     bgbl = BGBLScraper()
     data = {}
     if Path(arguments['<file>']).exists():
         with open(arguments['<file>']) as f:
             data = json.load(f)
+    minyear = int(arguments['<minyear>'] or 0)
+    maxyear = int(arguments['<maxyear>'] or sys.maxsize)
+    if arguments['update'] and len(data) > 0:
+        minyear = max([toc_entry['year'] for pub in data.values() for toc_entry in pub])
     data.update(bgbl.scrape(minyear, maxyear))
     with open(arguments['<file>'], 'w') as f:
         json.dump(data, f, indent=4)
diff --git a/vkbl_scraper.py b/vkbl_scraper.py
index ef4cd286..60acefaa 100644
--- a/vkbl_scraper.py
+++ b/vkbl_scraper.py
@@ -1,7 +1,7 @@
 """VkBl-Scraper.
 
 Usage:
-  vkbl_scaper.py <file> [<minyear> [<maxyear>]]
+  vkbl_scaper.py <file> [update | <minyear> [<maxyear>]]
   vkbl_scaper.py -h | --help
   vkbl_scaper.py --version
@@ -126,16 +126,15 @@ def scrape(self, low=1947, high=datetime.datetime.now().year):
 
 
 def main(arguments):
-    current_year = datetime.datetime.now().year
-    minyear = arguments['<minyear>'] or 1947
-    maxyear = arguments['<maxyear>'] or current_year
-    minyear = int(minyear)
-    maxyear = int(maxyear)
     vkbl = VkblScraper()
     data = {}
     if Path(arguments['<file>']).exists():
         with open(arguments['<file>']) as f:
             data = json.load(f)
+    minyear = int(arguments['<minyear>'] or 1947)
+    maxyear = int(arguments['<maxyear>'] or datetime.datetime.now().year)
+    if arguments['update'] and len(data) > 0:
+        minyear = max([toc_entry['year'] for pub in data.values() for toc_entry in pub])
     data.update(vkbl.scrape(minyear, maxyear))
     with open(arguments['<file>'], 'w') as f:
         json.dump(data, f)