diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 9a1df2e0..e3b90cdf 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -18,6 +18,7 @@ services: - APP_CRAWLER_HOST_URL=http://scrapyd:6800 - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results - APP_FILES_STORE=/tmp/file_urls + - APP_LAST_RUNS_PATH=/code/.scrapy/last_runs - APP_CRAWL_ONCE_PATH=/code/.scrapy - COVERAGE_PROCESS_START=/code/.coveragerc - BASE_USER_UID=${BASE_USER_UID:-1000} @@ -58,8 +59,11 @@ services: functional_cds: <<: *service_base command: py.test -vv tests/functional/cds - links: - - scrapyd + depends_on: + scrapyd: + condition: service_healthy + cds-http-server.local: + condition: service_healthy functional_pos: <<: *service_base @@ -129,6 +133,21 @@ services: - "CMD-SHELL" - "curl https://localhost:443/" + cds-http-server.local: + image: nginx:stable-alpine + volumes: + - ${PWD}/tests/functional/cds/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf + - ${PWD}/tests/functional/cds/fixtures/http_server/records:/etc/nginx/html/ + ports: + - 80:80 + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD-SHELL" + - "curl http://localhost:80/" + rabbitmq: image: rabbitmq healthcheck: diff --git a/hepcrawl/downloaders.py b/hepcrawl/downloaders.py new file mode 100644 index 00000000..49d9fba7 --- /dev/null +++ b/hepcrawl/downloaders.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Additional downloaders.""" + + +from scrapy.http import Response + + +class DummyDownloadHandler(object): + def __init__(self, *args, **kwargs): + pass + + def download_request(self, request, spider): + url = request.url + return Response(url, request=request) diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 0d9581c6..6c38c1e2 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -19,6 +19,8 @@ from __future__ import absolute_import, division, print_function +from scrapy.settings import default_settings + import os @@ -40,6 +42,12 @@ 'http://localhost/schemas/records/' ) +# Location of last run information +LAST_RUNS_PATH = os.environ.get( + 'APP_LAST_RUNS_PATH', + '/var/lib/scrapy/last_runs/' +) + # Configure maximum concurrent requests performed by Scrapy (default: 16) # CONCURRENT_REQUESTS=32 @@ -71,6 +79,13 @@ 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } +# Configure custom downloaders +# See https://doc.scrapy.org/en/0.20/topics/settings.html#download-handlers +DOWNLOAD_HANDLERS = { + 'oaipmh+http': 'hepcrawl.downloaders.DummyDownloadHandler', + 'oaipmh+https': 'hepcrawl.downloaders.DummyDownloadHandler', +} + # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { diff --git a/hepcrawl/spiders/__init__.py b/hepcrawl/spiders/__init__.py index e4336459..2d6d6746 100644 --- a/hepcrawl/spiders/__init__.py +++ b/hepcrawl/spiders/__init__.py @@ -8,11 +8,3 @@ # more details. 
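A note on the `downloaders.py` and `DOWNLOAD_HANDLERS` hunks above: Scrapy picks its download handler by URL scheme, so registering `DummyDownloadHandler` for the `oaipmh+http(s)` schemes lets a spider schedule a request that never touches the network; the real harvesting happens later, via Sickle, inside the parse callback. A minimal sketch of that flow (the URL is illustrative; the handler API is exactly the one added above):

```python
# Sketch of the request flow enabled by the custom DOWNLOAD_HANDLERS entry.
from scrapy.http import Request

from hepcrawl.downloaders import DummyDownloadHandler

# A spider yields a request whose scheme matches a DOWNLOAD_HANDLERS key:
request = Request('oaipmh+http://cds.cern.ch/oai2d')

# Scrapy routes it to DummyDownloadHandler, which skips the network
# round-trip and hands back a bare Response wrapping the URL:
handler = DummyDownloadHandler()
response = handler.download_request(request, spider=None)
assert response.url == 'oaipmh+http://cds.cern.ch/oai2d'

# The parse callback ignores this placeholder Response and performs the real
# OAI-PMH requests itself (see OAIPMHSpider.parse further down).
```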
from __future__ import absolute_import, division, print_function - -from scrapy import Spider - - -class StatefulSpider(Spider): - def __init__(self, *args, **kwargs): - self.state = {} - return super(Spider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/alpha_spider.py b/hepcrawl/spiders/alpha_spider.py index ce334056..8a9e285e 100644 --- a/hepcrawl/spiders/alpha_spider.py +++ b/hepcrawl/spiders/alpha_spider.py @@ -18,7 +18,7 @@ from scrapy import Request from scrapy.spiders import CrawlSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/aps_spider.py b/hepcrawl/spiders/aps_spider.py index 69e19010..0adf94b1 100644 --- a/hepcrawl/spiders/aps_spider.py +++ b/hepcrawl/spiders/aps_spider.py @@ -18,7 +18,7 @@ from scrapy import Request -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py index 64d076dc..a72b1e3b 100644 --- a/hepcrawl/spiders/arxiv_spider.py +++ b/hepcrawl/spiders/arxiv_spider.py @@ -16,7 +16,7 @@ from scrapy import Request, Selector from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..mappings import CONFERENCE_WORDS, THESIS_WORDS diff --git a/hepcrawl/spiders/base_spider.py b/hepcrawl/spiders/base_spider.py index 79748fde..0f596c68 100644 --- a/hepcrawl/spiders/base_spider.py +++ b/hepcrawl/spiders/base_spider.py @@ -16,7 +16,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/brown_spider.py b/hepcrawl/spiders/brown_spider.py index fe1c340d..dba3d27d 100644 --- a/hepcrawl/spiders/brown_spider.py +++ b/hepcrawl/spiders/brown_spider.py @@ -19,7 +19,7 @@ from scrapy import Request from scrapy.spiders import CrawlSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index 9353d6d3..60d8d5be 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -9,67 +9,55 @@ """Spider for the CERN Document Server OAI-PMH interface""" -from scrapy.spider import XMLFeedSpider -from scrapy import Request -from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire -from harvestingkit.bibrecord import ( - create_record as create_bibrec, - record_xml_output, -) -from dojson.contrib.marc21.utils import create_record -from inspire_dojson.hep import hep +import logging +from flask.app import Flask +from inspire_dojson import marcxml2record +from os.path import join as path_join -from . import StatefulSpider +from .common import OAIPMHSpider from ..utils import ParsedItem -class CDSSpider(StatefulSpider, XMLFeedSpider): +LOGGER = logging.getLogger(__name__) + + +class CDSSpider(OAIPMHSpider): """Spider for crawling the CERN Document Server OAI-PMH XML files. 
    Example: Using OAI-PMH XML files::

-        $ scrapy crawl \\
-            cds \\
-            -a "source_file=file://$PWD/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml"
+        $ scrapy crawl CDS \\
+            -a "oai_set=forINSPIRE" -a "from_date=2017-10-10"

-    It uses `HarvestingKit <https://pypi.python.org/pypi/HarvestingKit>`_ to
-    translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. It then
-    employs `inspire-dojson <https://pypi.python.org/pypi/inspire-dojson>`_ to
-    transform the legacy INSPIRE MARCXML into the new INSPIRE Schema.
+    It uses `inspire-dojson <https://pypi.python.org/pypi/inspire-dojson>`_ to
+    translate from CDS's MARCXML into the new INSPIRE Schema.
     """

     name = 'CDS'
-    iterator = 'xml'
-    itertag = 'OAI-PMH:record'
-    namespaces = [
-        ('OAI-PMH', 'http://www.openarchives.org/OAI/2.0/'),
-        ('marc', 'http://www.loc.gov/MARC21/slim'),
-    ]

-    def __init__(self, source_file=None, **kwargs):
-        super(CDSSpider, self).__init__(**kwargs)
-        self.source_file = source_file
-
-    def start_requests(self):
-        yield Request(self.source_file)
+    def __init__(self,
+                 oai_endpoint='http://cds.cern.ch/oai2d',
+                 from_date=None,
+                 oai_set="forINSPIRE",
+                 **kwargs):
+        super(CDSSpider, self).__init__(
+            url=oai_endpoint,
+            metadata_prefix='marcxml',
+            oai_set=oai_set,
+            from_date=from_date,
+            **kwargs
+        )

-    def parse_node(self, response, node):
-        node.remove_namespaces()
-        cds_bibrec, ok, errs = create_bibrec(
-            node.xpath('.//record').extract()[0]
-        )
-        if not ok:
-            raise RuntimeError("Cannot parse record %s: %s", node, errs)
-        self.logger.info("Here's the record: %s" % cds_bibrec)
-        inspire_bibrec = CDS2Inspire(cds_bibrec).get_record()
-        marcxml_record = record_xml_output(inspire_bibrec)
-        record = create_record(marcxml_record)
-        json_record = hep.do(record)
-        base_uri = self.settings['SCHEMA_BASE_URI']
-        json_record['$schema'] = base_uri + 'hep.json'
-        parsed_item = ParsedItem(
-            record=json_record,
-            record_format='hep',
-        )
-        return parsed_item
+    def parse_record(self, selector):
+        selector.remove_namespaces()
+        record = selector.xpath('.//record').extract_first()
+        app = Flask('hepcrawl')
+        app.config.update(
+            self.settings.getdict('MARC_TO_HEP_SETTINGS', {})
+        )
+        with app.app_context():
+            json_record = marcxml2record(record)
+            base_uri = self.settings['SCHEMA_BASE_URI']
+            json_record['$schema'] = path_join(base_uri, 'hep.json')
+            return ParsedItem(record=json_record, record_format='hep')
diff --git a/hepcrawl/spiders/common/__init__.py b/hepcrawl/spiders/common/__init__.py
new file mode 100644
index 00000000..5453444a
--- /dev/null
+++ b/hepcrawl/spiders/common/__init__.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of hepcrawl.
+# Copyright (C) 2015, 2016, 2017, 2018 CERN.
+#
+# hepcrawl is a free software; you can redistribute it and/or modify it
+# under the terms of the Revised BSD License; see LICENSE file for
+# more details.
+
+from __future__ import absolute_import, division, print_function
+
+from .oaipmh_spider import OAIPMHSpider
+from .stateful_spider import StatefulSpider
diff --git a/hepcrawl/spiders/common/oaipmh_spider.py b/hepcrawl/spiders/common/oaipmh_spider.py
new file mode 100644
index 00000000..3ea147a4
--- /dev/null
+++ b/hepcrawl/spiders/common/oaipmh_spider.py
@@ -0,0 +1,186 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of hepcrawl.
+# Copyright (C) 2017 CERN.
+#
+# hepcrawl is a free software; you can redistribute it and/or modify it
+# under the terms of the Revised BSD License; see LICENSE file for
+# more details.
+
+"""Generic spider for OAI-PMH servers."""
+
+import abc
+import hashlib
+import json
+import logging
+from datetime import datetime
+from errno import EEXIST as FILE_EXISTS, ENOENT as NO_SUCH_FILE_OR_DIR
+from os import path, makedirs
+
+from dateutil import parser as dateparser
+from sickle import Sickle
+from sickle.oaiexceptions import NoRecordsMatch
+
+from scrapy.http import Request, XmlResponse
+from scrapy.selector import Selector
+
+from .stateful_spider import StatefulSpider
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class NoLastRunToLoad(Exception):
+    """Error raised when the last run file could not be loaded."""
+    def __init__(self, file_path):
+        self.message = "Failed to load file at {}".format(file_path)
+        super(NoLastRunToLoad, self).__init__(self.message)
+
+
+class OAIPMHSpider(StatefulSpider):
+    """
+    Implements a spider for the OAI-PMH protocol using the Sickle library.
+
+    In case of a successful harvest (OAI-PMH crawling) the spider will
+    remember the initial starting date and will use it as the `from_date`
+    argument on the next harvest.
+    """
+    __metaclass__ = abc.ABCMeta
+    name = 'OAI-PMH'
+
+    def __init__(
+        self,
+        url,
+        metadata_prefix='oai_dc',
+        oai_set=None,
+        alias=None,
+        from_date=None,
+        until_date=None,
+        *args, **kwargs
+    ):
+        super(OAIPMHSpider, self).__init__(*args, **kwargs)
+        self.url = url
+        self.metadata_prefix = metadata_prefix
+        self.set = oai_set
+        self.from_date = from_date
+        self.until_date = until_date
+
+    def start_requests(self):
+        self.from_date = self.from_date or self._resume_from
+        started_at = datetime.utcnow()
+
+        LOGGER.info("Starting harvesting of {url} with set={set} and "
+                    "metadataPrefix={metadata_prefix}, from={from_date}, "
+                    "until={until_date}".format(
+                        url=self.url,
+                        set=self.set,
+                        metadata_prefix=self.metadata_prefix,
+                        from_date=self.from_date,
+                        until_date=self.until_date,
+                    ))
+
+        request = Request('oaipmh+{}'.format(self.url), self.parse)
+        yield request
+
+        now = datetime.utcnow()
+        self._save_run(started_at)
+
+        LOGGER.info("Harvesting completed. Next harvesting will resume "
+                    "from {}".format(self.until_date or now.strftime('%Y-%m-%d')))
+
+    @abc.abstractmethod
+    def parse_record(self, record):
+        """
+        This method needs to be implemented by each subclass to provide its
+        specific parsing.
+
+        Args:
+            record (scrapy.selector.Selector): selector on the parsed record
+        """
+        raise NotImplementedError()
+
+    def parse(self, response):
+        sickle = Sickle(self.url)
+        try:
+            records = sickle.ListRecords(**{
+                'metadataPrefix': self.metadata_prefix,
+                'set': self.set,
+                'from': self.from_date,
+                'until': self.until_date,
+            })
+        except NoRecordsMatch as err:
+            LOGGER.warning(err)
+            return  # end this generator cleanly instead of raising StopIteration
+        for record in records:
+            response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
+            selector = Selector(response, type='xml')
+            yield self.parse_record(selector)
+
+    def _make_alias(self):
+        return 'metadataPrefix={metadata_prefix}&set={set}'.format(
+            metadata_prefix=self.metadata_prefix,
+            set=self.set
+        )
+
+    def _last_run_file_path(self):
+        """Render a path to a file where last run information is stored.
+
+        Returns:
+            string: path to the last run file
+        """
+        last_runs_path = self.settings['LAST_RUNS_PATH']
+        file_name = hashlib.sha1(self._make_alias()).hexdigest() + '.json'
+        return path.join(last_runs_path, self.name, file_name)
+
+    def _load_last_run(self):
+        """Return stored last run information.
+
+        Returns:
+            dict: last run information
+
+        Raises:
+            NoLastRunToLoad: if no last run file exists at the expected path
+        """
+        file_path = self._last_run_file_path()
+        try:
+            with open(file_path) as f:
+                last_run = json.load(f)
+            LOGGER.info('Last run file loaded: {}'.format(repr(last_run)))
+            return last_run
+        except IOError as exc:
+            if exc.errno == NO_SUCH_FILE_OR_DIR:
+                raise NoLastRunToLoad(file_path)
+            raise
+
+    def _save_run(self, started_at):
+        """Store last run information.
+
+        Args:
+            started_at (datetime.datetime)
+
+        Raises:
+            IOError: if writing the file is unsuccessful
+        """
+        last_run_info = {
+            'spider': self.name,
+            'url': self.url,
+            'metadata_prefix': self.metadata_prefix,
+            'set': self.set,
+            'from_date': self.from_date,
+            'until_date': self.until_date,
+            'last_run_started_at': started_at.isoformat(),
+            'last_run_finished_at': datetime.utcnow().isoformat(),
+        }
+        file_path = self._last_run_file_path()
+        LOGGER.info("Saving last run file to {}".format(file_path))
+        try:
+            makedirs(path.dirname(file_path))
+        except OSError as exc:
+            if exc.errno != FILE_EXISTS:
+                raise
+        with open(file_path, 'w') as f:
+            json.dump(last_run_info, f, indent=4)
+
+    @property
+    def _resume_from(self):
+        try:
+            last_run = self._load_last_run()
+            resume_at = last_run['until_date'] or last_run['last_run_finished_at']
+            date_parsed = dateparser.parse(resume_at)
+            return date_parsed.strftime('%Y-%m-%d')
+        except NoLastRunToLoad:
+            return None
diff --git a/hepcrawl/spiders/common/stateful_spider.py b/hepcrawl/spiders/common/stateful_spider.py
new file mode 100644
index 00000000..3de5c613
--- /dev/null
+++ b/hepcrawl/spiders/common/stateful_spider.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of hepcrawl.
+# Copyright (C) 2015, 2016, 2017, 2018 CERN.
+#
+# hepcrawl is a free software; you can redistribute it and/or modify it
+# under the terms of the Revised BSD License; see LICENSE file for
+# more details.
+
+from __future__ import absolute_import, division, print_function
+
+from scrapy import Spider
+
+
+class StatefulSpider(Spider):
+    def __init__(self, *args, **kwargs):
+        self.state = {}
+        super(StatefulSpider, self).__init__(*args, **kwargs)
diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py
index 05b35282..8da15c41 100644
--- a/hepcrawl/spiders/desy_spider.py
+++ b/hepcrawl/spiders/desy_spider.py
@@ -18,7 +18,7 @@
 from scrapy import Request
 from six.moves import urllib
 
-from . import StatefulSpider
+from .common import StatefulSpider
 from ..utils import (
     ftp_list_files,
     ftp_connection_info,
diff --git a/hepcrawl/spiders/dnb_spider.py b/hepcrawl/spiders/dnb_spider.py
index 5f243b94..b4a4540c 100644
--- a/hepcrawl/spiders/dnb_spider.py
+++ b/hepcrawl/spiders/dnb_spider.py
@@ -14,7 +14,7 @@
 from scrapy import Request
 from scrapy.spiders import XMLFeedSpider
 
-from . import StatefulSpider
+from .common import StatefulSpider
 from ..items import HEPRecord
 from ..loaders import HEPLoader
 from ..utils import (
diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py
index c051c8ee..5c1fc8db 100644
--- a/hepcrawl/spiders/edp_spider.py
+++ b/hepcrawl/spiders/edp_spider.py
@@ -19,7 +19,7 @@
 from scrapy import Request
 from scrapy.spiders import XMLFeedSpider
 
-from .
import StatefulSpider +from .common import StatefulSpider from ..extractors.jats import Jats from ..items import HEPRecord from ..loaders import HEPLoader diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py index e2d4e919..f4d97b12 100644 --- a/hepcrawl/spiders/elsevier_spider.py +++ b/hepcrawl/spiders/elsevier_spider.py @@ -23,7 +23,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py index 5f81f5b4..7c14ab41 100644 --- a/hepcrawl/spiders/hindawi_spider.py +++ b/hepcrawl/spiders/hindawi_spider.py @@ -14,7 +14,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py index 2e093ab1..04240307 100644 --- a/hepcrawl/spiders/infn_spider.py +++ b/hepcrawl/spiders/infn_spider.py @@ -19,7 +19,7 @@ from scrapy.http import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py index fbca3ae5..5b1f2826 100644 --- a/hepcrawl/spiders/iop_spider.py +++ b/hepcrawl/spiders/iop_spider.py @@ -18,7 +18,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..extractors.nlm import NLM from ..items import HEPRecord from ..loaders import HEPLoader diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py index 8dfd5d51..27f79b80 100644 --- a/hepcrawl/spiders/magic_spider.py +++ b/hepcrawl/spiders/magic_spider.py @@ -16,7 +16,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py index 21804873..e24fcfb0 100644 --- a/hepcrawl/spiders/mit_spider.py +++ b/hepcrawl/spiders/mit_spider.py @@ -21,7 +21,7 @@ from scrapy.http import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py index aa54bd98..3e8b990b 100644 --- a/hepcrawl/spiders/phenix_spider.py +++ b/hepcrawl/spiders/phenix_spider.py @@ -16,7 +16,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ParsedItem diff --git a/hepcrawl/spiders/phil_spider.py b/hepcrawl/spiders/phil_spider.py index 06f52da2..b1b76284 100644 --- a/hepcrawl/spiders/phil_spider.py +++ b/hepcrawl/spiders/phil_spider.py @@ -17,7 +17,7 @@ from scrapy import Request from scrapy.spiders import CrawlSpider -from . 
import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 024eff6b..a85e3e5c 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -18,7 +18,7 @@ from scrapy import Request, Selector -from . import StatefulSpider +from .common import StatefulSpider from ..dateutils import create_valid_date from ..items import HEPRecord from ..loaders import HEPLoader diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py index db18eb1e..a165bbed 100644 --- a/hepcrawl/spiders/t2k_spider.py +++ b/hepcrawl/spiders/t2k_spider.py @@ -16,7 +16,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 280b6875..5a5776ec 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -18,7 +18,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..parsers import JatsParser from ..utils import ( ftp_list_files, diff --git a/setup.py b/setup.py index 728d85f1..ba297d41 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ 'python-dateutil>=2.4.2', 'python-scrapyd-api>=2.0.1', 'harvestingkit>=0.6.12', + 'Sickle~=0.6,>=0.6.2', ] tests_require = [ diff --git a/tests/functional/cds/fixtures/cds_expected.json b/tests/functional/cds/fixtures/cds_expected.json new file mode 100644 index 00000000..51d219e1 --- /dev/null +++ b/tests/functional/cds/fixtures/cds_expected.json @@ -0,0 +1,218 @@ +[ + { + "core": true, + "documents": [ + { + "url": "http://cds.cern.ch/record/1200752/files/MQW7_018.pdf", + "source": "CDS", + "description": "Published version from PoS", + "key": "MQW7_018.pdf" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "inspire_categories": [ + { + "source": "cds", + "term": "Astrophysics" + } + ], + "titles": [ + { + "source": "CDS", + "title": "High and very high energy gamma-ray emission from binaries" + } + ], + "_private_notes": [ + { + "source": "CDS", + "value": "CDS-1200752" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Grenoble Observ." 
+ } + ], + "full_name": "Dubus, G" + } + ], + "publication_info": [ + { + "journal_volume": "MQW7", + "page_start": "018", + "journal_title": "PoS", + "artid": "018", + "year": 2008 + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true, + "imprints": [ + { + "date": "2009" + } + ], + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:03.875113" + } + }, + { + "core": true, + "documents": [ + { + "url": "http://cds.cern.ch/record/1200753/files/MQW7_019.pdf", + "source": "CDS", + "description": "Published version from PoS", + "key": "MQW7_019.pdf" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "collaborations": [ + { + "value": "Fermi LAT" + } + ], + "inspire_categories": [ + { + "source": "cds", + "term": "Astrophysics" + } + ], + "titles": [ + { + "source": "CDS", + "title": "GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars" + }, + { + "source": "CDS", + "title": "Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars" + } + ], + "_private_notes": [ + { + "source": "CDS", + "value": "CDS-1200753" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "SLAC" + } + ], + "full_name": "Dubois, R" + } + ], + "publication_info": [ + { + "journal_volume": "MQW7", + "page_start": "019", + "journal_title": "PoS", + "artid": "019", + "year": 2008 + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true, + "imprints": [ + { + "date": "2008" + } + ], + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:03.951904" + } + }, + { + "core": true, + "documents": [ + { + "url": "http://cds.cern.ch/record/1200754/files/MQW7_020.pdf", + "source": "CDS", + "description": "Published version from PoS", + "key": "MQW7_020.pdf" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "inspire_categories": [ + { + "source": "cds", + "term": "Astrophysics" + } + ], + "titles": [ + { + "source": "CDS", + "title": "Hadronic models of high-energy radiation from microquasars: recent developments" + } + ], + "_private_notes": [ + { + "source": "CDS", + "value": "CDS-1200754" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Villa Elisa, Inst. Argentino Radioastron." + }, + { + "value": "La Plata U." 
+ } + ], + "full_name": "Romero, G E" + } + ], + "publication_info": [ + { + "journal_volume": "MQW7", + "page_start": "020", + "journal_title": "PoS", + "artid": "020", + "year": 2008 + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true, + "imprints": [ + { + "date": "2008" + } + ], + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:03.984541" + } + } +] diff --git a/tests/functional/cds/fixtures/cds_smoke_records_expected.json b/tests/functional/cds/fixtures/cds_smoke_records_expected.json deleted file mode 100644 index f6f6a7f8..00000000 --- a/tests/functional/cds/fixtures/cds_smoke_records_expected.json +++ /dev/null @@ -1,153 +0,0 @@ -[ - { - "$schema": "http://localhost/schemas/records/hep.json", - "_collections": [ - "Literature" - ], - "accelerator_experiments": [ - { - "legacy_name": "CERN-SPS---" - } - ], - "acquisition_source": { - "datetime": "2017-10-04T14:07:59.746165", - "method": "hepcrawl", - "source": "CDS", - "submission_number": "None" - }, - "core": true, - "curated": true, - "corporate_author": [ - "European Organization for Nuclear Research" - ], - "documents": [ - { - "url": "http://cds.cern.ch/record/21099/files/CM-P00077286-e.pdf", - "key": "document" - }, - { - "url": "http://cds.cern.ch/record/21099/files/CM-P00078235-f.pdf", - "key": "1_document" - } - ], - "document_type": [ - "article" - ], - "external_system_identifiers": [ - { - "schema": "Inspire", - "value": "1614043" - }, - { - "schema": "ADMADM", - "value": "0003711" - }, - { - "schema": "CDS", - "value": "21099" - } - ], - "inspire_categories": [ - { - "term": "Accelerators" - } - ], - "languages": [ - "fr" - ], - "preprint_date": "1967-05-30", - "report_numbers": [ - { - "value": "CERN/0702" - }, - { - "value": "CM-P00077286-e" - }, - { - "value": "CM-P00078235-f" - } - ], - "titles": [ - { - "title": "Addendum to the Report on the Design Study of a 300 GeV Proton Synchrotron (CERN/563) (AR/Int. SG/64-15)" - }, - { - "title": "Suppl\u00e9ment au Rapport sur le projet du synchrotron \u00e0 prontons de 300 GeV (CERN/563) (Ar/Int. 
SG/64-15)" - } - ] - }, - { - "$schema": "http://localhost/schemas/records/hep.json", - "_collections": [ - "Literature" - ], - "accelerator_experiments": [ - { - "legacy_name": "CERN-LEP---" - } - ], - "acquisition_source": { - "datetime": "2017-10-04T14:07:59.783028", - "method": "hepcrawl", - "source": "CDS", - "submission_number": "None" - }, - "core": true, - "curated": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/60936/files/CM-P00098683-f.pdf", - "key": "document" - }, - { - "url": "http://cds.cern.ch/record/60936/files/CERN-SPC-426.pdf", - "key": "1_document" - } - ], - "document_type": [ - "article" - ], - "external_system_identifiers": [ - { - "schema": "ADMADM", - "value": "0009846" - }, - { - "schema": "Inspire", - "value": "1614044" - }, - { - "schema": "CDS", - "value": "60936" - } - ], - "inspire_categories": [ - { - "term": "Accelerators" - } - ], - "languages": [ - "fr" - ], - "preprint_date": "1978-10-06", - "report_numbers": [ - { - "value": "CERN/SPC/0426" - }, - { - "value": "CM-P00095369-e" - }, - { - "value": "CM-P00098683-f" - } - ], - "titles": [ - { - "title": "LEP Studies 1979 to 1981" - }, - { - "title": "Les Etudes sur le LEP de 1979 -1981" - } - ] - } -] diff --git a/tests/functional/cds/fixtures/http_server/conf/proxy.conf b/tests/functional/cds/fixtures/http_server/conf/proxy.conf new file mode 100644 index 00000000..68d70722 --- /dev/null +++ b/tests/functional/cds/fixtures/http_server/conf/proxy.conf @@ -0,0 +1,12 @@ +server { + listen 80; + server_name localhost; + charset_types text/xml; + charset UTF-8; + + location /oai2d { + if ($args ~ from=2017-11-15&verb=ListRecords&set=forINSPIRE&metadataPrefix=marcxml) { + rewrite ^.*$ /cds.xml permanent; + } + } +} diff --git a/tests/functional/cds/fixtures/http_server/records/cds.xml b/tests/functional/cds/fixtures/http_server/records/cds.xml new file mode 100644 index 00000000..c23aee04 --- /dev/null +++ b/tests/functional/cds/fixtures/http_server/records/cds.xml @@ -0,0 +1,285 @@ + + + +2017-12-07T15:05:26Zhttp://cds.cern.ch/oai2d +
+<record><header><identifier>oai:cds.cern.ch:1200752</identifier><datestamp>2017-11-16T08:09:30Z</datestamp><setSpec>cerncds:FULLTEXT</setSpec><setSpec>forINSPIRE</setSpec></header><metadata>
+ 00000coc 2200000uu 4500 + 1200752 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200752 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509577 + + + eng + + + Dubus, G + Grenoble Observ. + + + High and very high energy gamma-ray emission from binaries + + + 2009 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + 018 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200752/files/MQW7_018.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 018 + izmir20080901 + + + PUBLIC + + + 002842486CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
+<record><header><identifier>oai:cds.cern.ch:1200753</identifier><datestamp>2017-11-16T08:09:30Z</datestamp><setSpec>cerncds:FULLTEXT</setSpec><setSpec>forINSPIRE</setSpec></header><metadata>
+ 00000coc 2200000uu 4500 + 1200753 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200753 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509578 + + + eng + + + Dubois, R + SLAC + + + GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars + + + Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars + Other title + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + No authors + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + for the Fermi LAT Collaboration + + + 019 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200753/files/MQW7_019.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 019 + izmir20080901 + + + PUBLIC + + + 002842487CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
+<record><header><identifier>oai:cds.cern.ch:1200754</identifier><datestamp>2017-11-16T08:09:30Z</datestamp><setSpec>cerncds:FULLTEXT</setSpec><setSpec>forINSPIRE</setSpec></header><metadata>
+ 00000coc 2200000uu 4500 + 1200754 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200754 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509579 + + + eng + + + Romero, G E + Villa Elisa, Inst. Argentino Radioastron. + La Plata U. + + + Hadronic models of high-energy radiation from microquasars: recent developments + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + 020 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200754/files/MQW7_020.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 020 + izmir20080901 + + + PUBLIC + + + 002842488CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
+</ListRecords>
+</OAI-PMH>
diff --git a/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml b/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml
deleted file mode 100644
index b3c521f6..00000000
--- a/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml
+++ /dev/null
@@ -1,246 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
-  <responseDate>2017-10-04T07:25:58Z</responseDate>
-  <request verb="ListRecords">http://cds.cern.ch/oai2d</request>
-  <ListRecords>
-    <record>
-      <header>
-        <identifier>oai:cds.cern.ch:21099</identifier>
-        <datestamp>2017-07-27T21:14:27Z</datestamp>
-        <setSpec>cerncds:FULLTEXT</setSpec>
-        <setSpec>forINSPIRE</setSpec>
- - - 00000coc 2200000uu 4500 - 21099 - SzGeCERN - 20170803223648.0 - - oai:cds.cern.ch:21099 - cerncds:FULLTEXT - forINSPIRE - - - ADMADM - 0003711 - - - Inspire - 1614043 - - - eng - - - fre - - - COUNCIL-0702 - - - CERN/0702 - - - CM-P00077286-e - - - CM-P00078235-f - - - European Organization for Nuclear Research - - - Addendum to the Report on the Design Study of a 300 GeV Proton Synchrotron (CERN/563) (AR/Int. SG/64-15) - - - Supplément au Rapport sur le projet du synchrotron à prontons de 300 GeV (CERN/563) (Ar/Int. SG/64-15) - Titre français - - - 1967 - - - 1967-05-30 - - - SISARC-2009 - - - CLAS1 - - - Inspire - - - SzGeCERN - Accelerators and Storage Rings - - - Design Report - - - CERN - - - CERN SPS - - - http://cds.cern.ch/record/21099/files/CM-P00077286-e.pdf - English - - - http://cds.cern.ch/record/21099/files/CM-P00078235-f.pdf - French - - - n - 200319 - - - 60 - - - 20031203 - 0855 - MAN01 - 19990126 - - - PUBLIC - - - 000003711MAN - - - COUNCIL - - - -
- -
-        <identifier>oai:cds.cern.ch:60936</identifier>
-        <datestamp>2017-07-27T21:14:28Z</datestamp>
-        <setSpec>cerncds:FULLTEXT</setSpec>
-        <setSpec>forINSPIRE</setSpec>
- - - 00000coc 2200000uu 4500 - 60936 - SzGeCERN - 20170803223648.0 - - oai:cds.cern.ch:60936 - cerncds:FULLTEXT - forINSPIRE - - - ADMADM - 0009846 - - - Inspire - 1614044 - - - eng - - - fre - - - CERN/SPC/0426 - - - CM-P00095369-e - - - CM-P00098683-f - - - 19781023 - 104th Meeting of Scientific Policy Committee - CERN, Geneva, Switzerland - 23 - 24 Oct 1978 - 1978 - cern19781023 - 104 - CH - 19781024 - - - LEP Studies 1979 to 1981 - - - Les Etudes sur le LEP de 1979 -1981 - Titre français - - - 1978 - - - 1978-10-06 - - - 78/140/5 - - - SISARC-2009 - - - CLAS1 - - - Inspire - - - SzGeCERN - Accelerators and Storage Rings - - - Design Report - - - CERN - - - CERN LEP - - - http://cds.cern.ch/record/60936/files/CM-P00098683-f.pdf - French - - - http://cds.cern.ch/record/60936/files/CERN-SPC-426.pdf - English - - - n - 200319 - - - 62 - - - 20031203 - 0901 - MAN01 - 19990126 - - - PUBLIC - - - 000009846MAN - - - SPC - - - -
-
-
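The XML deleted above was the old static fixture; its replacement is served over HTTP by the nginx container declared in docker-compose.test.yml. A quick way to convince yourself that the proxy rewrite fires is to request the exact query string proxy.conf matches; note that the argument order in that regex is significant. This check is illustrative only (hostname per the compose file; `requests` is not part of the test suite):

```python
# Illustrative sanity check of the fixture server from inside the compose
# network; the hostname and the use of ``requests`` are assumptions.
import requests

# proxy.conf matches this exact argument order, so build the query by hand:
url = ('http://cds-http-server.local/oai2d'
       '?from=2017-11-15&verb=ListRecords&set=forINSPIRE&metadataPrefix=marcxml')

response = requests.get(url)  # follows the permanent redirect to /cds.xml
assert response.status_code == 200
assert 'oai:cds.cern.ch:1200752' in response.text
```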
diff --git a/tests/functional/cds/test_cds.py b/tests/functional/cds/test_cds.py index 93c60ce3..eb25cd27 100644 --- a/tests/functional/cds/test_cds.py +++ b/tests/functional/cds/test_cds.py @@ -7,20 +7,29 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. -"""Functional tests for ArXiv spider""" +"""Functional tests for CDS spider""" from __future__ import absolute_import, division, print_function +import os import pytest -from hepcrawl.testlib.tasks import app as celery_app from hepcrawl.testlib.celery_monitor import CeleryMonitor -from hepcrawl.testlib.utils import get_crawler_instance, deep_sort from hepcrawl.testlib.fixtures import ( - get_test_suite_path, expected_json_results_from_file, clean_dir, ) +from hepcrawl.testlib.tasks import app as celery_app +from hepcrawl.testlib.utils import get_crawler_instance, deep_sort + + +@pytest.fixture(scope='function', autouse=True) +def cleanup(): + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + yield + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) def override_generated_fields(record): @@ -32,44 +41,39 @@ def override_generated_fields(record): return record -@pytest.fixture(scope="function") -def set_up_local_environment(): - package_location = get_test_suite_path( - 'cds', - 'fixtures', - 'oai_harvested', - 'cds_smoke_records.xml', - test_suite='functional', - ) - - yield { +def get_configuration(): + return { 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', 'CRAWLER_ARGUMENTS': { - 'source_file': 'file://' + package_location, + 'from_date': '2017-11-15', + 'oai_set': 'forINSPIRE', + 'oai_endpoint': 'http://cds-http-server.local/oai2d', } } - clean_dir() - @pytest.mark.parametrize( - 'expected_results', + 'expected_results, config', [ - expected_json_results_from_file( - 'cds', - 'fixtures', - 'cds_smoke_records_expected.json', + ( + expected_json_results_from_file( + 'cds', + 'fixtures', + 'cds_expected.json', + ), + get_configuration(), ), ], ids=[ 'smoke', ] ) -def test_cds(set_up_local_environment, expected_results): - crawler = get_crawler_instance( - set_up_local_environment.get('CRAWLER_HOST_URL') - ) +def test_cds( + expected_results, + config, +): + crawler = get_crawler_instance(config['CRAWLER_HOST_URL']) results = CeleryMonitor.do_crawl( app=celery_app, @@ -77,23 +81,10 @@ def test_cds(set_up_local_environment, expected_results): monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), + project=config['CRAWLER_PROJECT'], spider='CDS', settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) - - results = deep_sort( - sorted( - results, - key=lambda result: result['titles'][0]['title'], - ) - ) - expected_results = deep_sort( - sorted( - expected_results, - key=lambda result: result['titles'][0]['title'], - ) + **config['CRAWLER_ARGUMENTS'] ) gotten_results = [override_generated_fields(result) for result in results] @@ -101,70 +92,7 @@ def test_cds(set_up_local_environment, expected_results): override_generated_fields(expected) for expected in expected_results ] - assert gotten_results == expected_results - - -@pytest.mark.parametrize( - 'expected_results', - [ - expected_json_results_from_file( - 'cds', - 'fixtures', - 'cds_smoke_records_expected.json', - ), - ], - ids=[ - 'crawl_twice', - ] -) -def test_cds_crawl_twice(set_up_local_environment, expected_results): - crawler = get_crawler_instance( - set_up_local_environment.get('CRAWLER_HOST_URL') 
- ) - - results = CeleryMonitor.do_crawl( - app=celery_app, - monitor_timeout=5, - monitor_iter_limit=20, - events_limit=1, - crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), - spider='CDS', - settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) - - results = deep_sort( - sorted( - results, - key=lambda result: result['titles'][0]['title'], - ) - ) - expected_results = deep_sort( - sorted( - expected_results, - key=lambda result: result['titles'][0]['title'], - ) - ) - - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [ - override_generated_fields(expected) for expected in expected_results - ] + gotten_results = deep_sort(gotten_results) + expected_results = deep_sort(expected_results) assert gotten_results == expected_results - - results = CeleryMonitor.do_crawl( - app=celery_app, - monitor_timeout=5, - monitor_iter_limit=20, - crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), - spider='CDS', - settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) - - gotten_results = [override_generated_fields(result) for result in results] - - assert gotten_results == [] diff --git a/tests/unit/test_oaipmh.py b/tests/unit/test_oaipmh.py new file mode 100644 index 00000000..f1715cce --- /dev/null +++ b/tests/unit/test_oaipmh.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +from datetime import datetime +from mock import patch +import pytest + +from hepcrawl.spiders.common.oaipmh_spider import OAIPMHSpider, NoLastRunToLoad +from hepcrawl.testlib.fixtures import clean_dir +from scrapy.utils.project import get_project_settings + + +LAST_RUN_TEST_FILE_SHA1 = '4fabe0a2d2f3cb58e656f307b6290b3edd46acd6' + + +def override_dynamic_fields(run): + if 'last_run_finished_at' in run: + run['last_run_finished_at'] = '2017-12-08T23:55:54.794969' + return run + + +@pytest.fixture(scope='function') +def cleanup(): + yield + clean_dir('/tmp/last_runs/') + + +@pytest.fixture +def settings(): + settings_patch = { + 'LAST_RUNS_PATH': '/tmp/last_runs/' + } + settings = get_project_settings() + with patch.dict(settings, settings_patch): + yield settings + + +@pytest.fixture +def spider(settings): + class TestOAIPMHSpider(OAIPMHSpider): + def parse_record(self, record): + return None + + spider = TestOAIPMHSpider('http://0.0.0.0/oai2', settings=settings) + spider.from_date = '2017-12-08' + spider.set = 'physics:hep-th' + spider.metadata_prefix = 'marcxml' + yield spider + + +def test_last_run_file_path(spider): + expected = '/tmp/last_runs/OAI-PMH/{}.json'.format(LAST_RUN_TEST_FILE_SHA1) + result = spider._last_run_file_path() + assert expected == result + + +def test_load_last_run(spider, cleanup): + now = datetime.utcnow() + spider._save_run(started_at=now) + + expected = override_dynamic_fields({ + 'spider': 'OAI-PMH', + 'url': 'http://0.0.0.0/oai2', + 'metadata_prefix': 'marcxml', + 'set': 'physics:hep-th', + 'from_date': '2017-12-08', + 'until_date': None, + 'last_run_started_at': now.isoformat(), + 'last_run_finished_at': '2017-12-08T13:55:00.000000', + }) + + result = override_dynamic_fields(spider._load_last_run()) + + assert expected == result + + +def test_load_last_run_nonexistent(spider): + with pytest.raises(NoLastRunToLoad): + 
spider._load_last_run()
+
+
+def test_resume_from_nonexistent_no_error(spider):
+    resume_from = spider._resume_from
+    assert resume_from is None
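One closing note on the unit tests: `LAST_RUN_TEST_FILE_SHA1` is simply the digest of the alias built by `_make_alias()`, so the expected last-run path can be reproduced by hand. A small sketch under Python 2 semantics, where `hashlib.sha1` accepts a `str`:

```python
# Reproduce _last_run_file_path() for the spider configured in the unit tests.
import hashlib
from os import path

alias = 'metadataPrefix={metadata_prefix}&set={set}'.format(
    metadata_prefix='marcxml',
    set='physics:hep-th',
)
file_name = hashlib.sha1(alias).hexdigest() + '.json'  # alias.encode('utf-8') on Python 3
print(path.join('/tmp/last_runs/', 'OAI-PMH', file_name))
# Per LAST_RUN_TEST_FILE_SHA1 this prints:
# /tmp/last_runs/OAI-PMH/4fabe0a2d2f3cb58e656f307b6290b3edd46acd6.json
```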