Skip to content

Commit

Permalink
stricter error catching when loading last_runs
Browse files Browse the repository at this point in the history
Create an exception for when the last_runs file doesn't exist.

Signed-off-by: Szymon Łopaciuk <[email protected]>
  • Loading branch information
szymonlopaciuk committed Dec 14, 2017
1 parent 858f6c1 commit 604ebef
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 1,496 deletions.
29 changes: 18 additions & 11 deletions hepcrawl/spiders/oaipmh_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"""Generic spider for OAI-PMH servers."""

import logging
from errno import EEXIST
from errno import EEXIST as FILE_EXISTS, ENOENT as NO_SUCH_FILE_OR_DIR
from datetime import datetime
from dateutil import parser as dateparser
import hashlib
Expand All @@ -28,6 +28,12 @@
LOGGER = logging.getLogger(__name__)


class NoLastRunToLoad(Exception):
    """Error raised when the last_runs file could not be loaded.

    Args:
        file_path: path of the last_runs file that failed to load.

    Attributes:
        message: human-readable description that includes ``file_path``.
    """

    def __init__(self, file_path):
        # Initialise the base class explicitly so ``args`` stays
        # ``(file_path,)``, which keeps pickling/copying of the exception
        # working with this one-argument constructor.
        super(NoLastRunToLoad, self).__init__(file_path)
        self.message = "Failed to load file at {}".format(file_path)

    def __str__(self):
        # Without this, str(exc) (and therefore logs and tracebacks) would
        # show only the bare path rather than the descriptive message.
        return self.message


class OAIPMHSpider(StatefulSpider):
"""
Implements a spider for the OAI-PMH protocol by using the Python sickle library.
Expand Down Expand Up @@ -132,8 +138,10 @@ def _load_last_run(self):
last_run = json.load(f)
LOGGER.info('Last run file loaded: {}'.format(repr(last_run)))
return last_run
except IOError:
return None
except IOError as exc:
if exc.errno == NO_SUCH_FILE_OR_DIR:
raise NoLastRunToLoad(file_path)
raise

def _save_run(self, started_at):
"""Store last run information
Expand All @@ -159,18 +167,17 @@ def _save_run(self, started_at):
try:
makedirs(path.dirname(file_path))
except OSError as exc:
if exc.errno == EEXIST:
pass
else:
if exc.errno != FILE_EXISTS:
raise
with open(file_path, 'w') as f:
json.dump(last_run_info, f, indent=4)

@property
def _resume_from(self):
last_run = self._load_last_run()
if not last_run:
try:
last_run = self._load_last_run()
resume_at = last_run['until_date'] or last_run['last_run_finished_at']
date_parsed = dateparser.parse(resume_at)
return date_parsed.strftime('%Y-%m-%d')
except NoLastRunToLoad:
return None
resume_at = last_run['until_date'] or last_run['last_run_finished_at']
date_parsed = dateparser.parse(resume_at)
return date_parsed.strftime('%Y-%m-%d')
Loading

0 comments on commit 604ebef

Please sign in to comment.