From 4abe6b32dd2d1c71177e2888e3f4105663e3c1dc Mon Sep 17 00:00:00 2001 From: Samuele Kaplun Date: Tue, 10 Oct 2017 15:46:54 +0200 Subject: [PATCH] create a OAI-PMH spider to use in CDS spider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also remove old tests. Fixes #197. Co-authored-by: Samuele Kaplun Signed-off-by: Szymon Łopaciuk --- hepcrawl/downloaders.py | 22 ++ hepcrawl/pipelines.py | 7 +- hepcrawl/scrapy.cfg | 2 +- hepcrawl/settings.py | 8 + hepcrawl/spiders/arxiv_spider.py | 2 +- hepcrawl/spiders/cds_spider.py | 68 +++-- hepcrawl/spiders/oaipmh_spider.py | 100 +++++++ setup.py | 1 + .../oai_harvested/cds_smoke_records.xml | 246 ------------------ tests/functional/cds/test_cds.py | 170 ++---------- 10 files changed, 187 insertions(+), 439 deletions(-) create mode 100644 hepcrawl/downloaders.py create mode 100644 hepcrawl/spiders/oaipmh_spider.py delete mode 100644 tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml diff --git a/hepcrawl/downloaders.py b/hepcrawl/downloaders.py new file mode 100644 index 00000000..49d9fba7 --- /dev/null +++ b/hepcrawl/downloaders.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Additional downloaders.""" + + +from scrapy.http import Response + + +class DummyDownloadHandler(object): + def __init__(self, *args, **kwargs): + pass + + def download_request(self, request, spider): + url = request.url + return Response(url, request=request) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index b30ff6c7..100cda5d 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -116,8 +116,11 @@ def process_item(self, item, spider): hep_record = self._post_enhance_item(item, spider) - validate(hep_record, 'hep') - spider.logger.debug('Validated item by Inspire Schemas.') + try: + validate(hep_record, 'hep') + spider.logger.debug('Validated item by Inspire Schemas.') + except Exception as err: + spider.logger.error('ERROR in validating {}: {}'.format(hep_record, err)) self.results_data.append(hep_record) diff --git a/hepcrawl/scrapy.cfg b/hepcrawl/scrapy.cfg index adffa153..1ec7711e 100644 --- a/hepcrawl/scrapy.cfg +++ b/hepcrawl/scrapy.cfg @@ -14,7 +14,7 @@ default = hepcrawl.settings [deploy] -url = http://scrapyd:6800/ +url = http://localhost:6800/ project = hepcrawl #username = scrapy #password = secret diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 0d9581c6..c5c11c4a 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -19,6 +19,8 @@ from __future__ import absolute_import, division, print_function +from scrapy.settings import default_settings + import os @@ -71,6 +73,12 @@ 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } +DOWNLOAD_HANDLERS_BASE = dict(default_settings.DOWNLOAD_HANDLERS_BASE) +DOWNLOAD_HANDLERS_BASE.update({ + 'oaipmh+http': 'hepcrawl.downloaders.DummyDownloadHandler', + 'oaipmh+https': 'hepcrawl.downloaders.DummyDownloadHandler', +}) + # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py index 64d076dc..82e086fb 100644 --- a/hepcrawl/spiders/arxiv_spider.py +++ b/hepcrawl/spiders/arxiv_spider.py @@ -46,7 +46,7 @@ class ArxivSpider(StatefulSpider, XMLFeedSpider): """ name = 'arXiv' - iterator = 'xml' + iterator = 'iternodes' itertag = 'OAI-PMH:record' namespaces = [ ("OAI-PMH", "http://www.openarchives.org/OAI/2.0/") diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index 9353d6d3..25ff0071 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -9,8 +9,11 @@ """Spider for the CERN Document Server OAI-PMH interface""" -from scrapy.spider import XMLFeedSpider +import logging from scrapy import Request +from scrapy.http import XmlResponse +from scrapy.selector import Selector +from flask.app import Flask from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire from harvestingkit.bibrecord import ( create_record as create_bibrec, @@ -19,19 +22,19 @@ from dojson.contrib.marc21.utils import create_record from inspire_dojson.hep import hep -from . import StatefulSpider +from .oaipmh_spider import OAIPMHSpider from ..utils import ParsedItem +logger = logging.getLogger(__name__) -class CDSSpider(StatefulSpider, XMLFeedSpider): +class CDSSpider(OAIPMHSpider): """Spider for crawling the CERN Document Server OAI-PMH XML files. Example: Using OAI-PMH XML files:: - $ scrapy crawl \\ - cds \\ - -a "source_file=file://$PWD/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml" + $ scrapy crawl CDS \\ + -a "set=forINSPIRE" -a "from_date=2017-10-10" It uses `HarvestingKit `_ to translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. It then @@ -40,36 +43,29 @@ class CDSSpider(StatefulSpider, XMLFeedSpider): """ name = 'CDS' - iterator = 'xml' - itertag = 'OAI-PMH:record' - namespaces = [ - ('OAI-PMH', 'http://www.openarchives.org/OAI/2.0/'), - ('marc', 'http://www.loc.gov/MARC21/slim'), - ] - def __init__(self, source_file=None, **kwargs): - super(CDSSpider, self).__init__(**kwargs) - self.source_file = source_file + def __init__(self, from_date=None, set="forINSPIRE", *args, **kwargs): + super(CDSSpider, self).__init__(url='http://cds.cern.ch/oai2d', metadata_prefix='marcxml', set=set, from_date=from_date, **kwargs) - def start_requests(self): - yield Request(self.source_file) - - def parse_node(self, response, node): - node.remove_namespaces() - cds_bibrec, ok, errs = create_bibrec( - node.xpath('.//record').extract()[0] - ) - if not ok: - raise RuntimeError("Cannot parse record %s: %s", node, errs) - self.logger.info("Here's the record: %s" % cds_bibrec) - inspire_bibrec = CDS2Inspire(cds_bibrec).get_record() - marcxml_record = record_xml_output(inspire_bibrec) - record = create_record(marcxml_record) - json_record = hep.do(record) - base_uri = self.settings['SCHEMA_BASE_URI'] - json_record['$schema'] = base_uri + 'hep.json' - parsed_item = ParsedItem( - record=json_record, - record_format='hep', + def parse_record(self, record): + response = XmlResponse(self.url, encoding='utf-8', body=record.raw) + selector = Selector(response, type='xml') + selector.remove_namespaces() + try: + cds_bibrec, ok, errs = create_bibrec(selector.xpath('.//record').extract()[0]) + if not ok: + raise RuntimeError("Cannot parse record %s: %s", record, errs) + self.logger.info("Here's the record: %s" % cds_bibrec) + inspire_bibrec = CDS2Inspire(cds_bibrec).get_record() + marcxml_record = record_xml_output(inspire_bibrec) + record = create_record(marcxml_record) + app = Flask('hepcrawl') + app.config.update( + self.settings.getdict('MARC_TO_HEP_SETTINGS', {}) ) - return parsed_item + with app.app_context(): + json_record = hep.do(record) + return ParsedItem(record=json_record, record_format='hep') + except Exception: + logger.exception("Error when parsing record") + return None diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py new file mode 100644 index 00000000..375799e9 --- /dev/null +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2015, 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Generic spider for OAI-PMH servers.""" + +import logging +import sickle +from datetime import datetime + +from sickle import Sickle +from sickle.models import Record +from sickle.oaiexceptions import NoRecordsMatch + +from scrapy.http import Request +from scrapy.spiders import Spider + +logger = logging.getLogger(__name__) + +class OAIPMHSpider(Spider): + """ + Implements a spider for the OAI-PMH protocol by using the Python sickle library. + + In case of successful harvest (OAI-PMH crawling) the spider will remember the initial starting + date and will use it as `from_date` argument on the next harvest. + """ + name = 'OAI-PMH' + state = {} + + def __init__(self, url, metadata_prefix='marcxml', set=None, alias=None, from_date=None, until_date=None, granularity='YYYY-MM-DD', record_class=Record, *args, **kwargs): + super(OAIPMHSpider, self).__init__(*args, **kwargs) + self.url = url + self.metadata_prefix = metadata_prefix + self.set = set + self.granularity = granularity + self.alias = alias or self._make_alias() + self.from_date = from_date + logger.info("Current state:{}".format(self.state)) + self.until_date = until_date + self.record_class = record_class + + def start_requests(self): + self.from_date = self.from_date or self.state.get(self.alias) + logger.info("Current state 2:{}".format(self.state)) + logger.info("Starting harvesting of {url} with set={set} and metadataPrefix={metadata_prefix}, from={from_date}, until={until_date}".format( + url=self.url, + set=self.set, + metadata_prefix=self.metadata_prefix, + from_date=self.from_date, + until_date=self.until_date + )) + now = datetime.utcnow() + request = Request('oaipmh+{}'.format(self.url), self.parse) + yield request + self.state[self.alias] = self._format_date(now) + logger.info("Harvesting completed. Next harvesting will resume from {}".format(self.state[self.alias])) + + def parse_record(self, record): + """ + This method need to be reimplemented in order to provide special parsing. + """ + return record.xml + + def parse(self, response): + sickle = Sickle(self.url, class_mapping={ + 'ListRecords': self.record_class, + 'GetRecord': self.record_class, + }) + try: + records = sickle.ListRecords(**{ + 'metadataPrefix': self.metadata_prefix, + 'set': self.set, + 'from': self.from_date, + 'until': self.until_date, + }) + except NoRecordsMatch as err: + logger.warning(err) + raise StopIteration() + for record in records: + yield self.parse_record(record) + + def _format_date(self, datetime_object): + if self.granularity == 'YYYY-MM-DD': + return datetime_object.strftime('%Y-%m-%d') + elif self.granularity == 'YYYY-MM-DDThh:mm:ssZ': + return datetime_object.strftime('%Y-%m-%dT%H:%M:%SZ') + else: + raise RuntimeError("Invalid granularity: %s" % self.granularity) + + def _make_alias(self): + return '{url}-{metadata_prefix}-{set}'.format( + url=self.url, + metadata_prefix=self.metadata_prefix, + set=self.set + ) diff --git a/setup.py b/setup.py index 728d85f1..ba297d41 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ 'python-dateutil>=2.4.2', 'python-scrapyd-api>=2.0.1', 'harvestingkit>=0.6.12', + 'Sickle~=0.6,>=0.6.2', ] tests_require = [ diff --git a/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml b/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml deleted file mode 100644 index b3c521f6..00000000 --- a/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml +++ /dev/null @@ -1,246 +0,0 @@ - - - - 2017-10-04T07:25:58Z - http://cds.cern.ch/oai2d - - -
- oai:cds.cern.ch:21099 - 2017-07-27T21:14:27Z - cerncds:FULLTEXT - forINSPIRE -
- - - 00000coc 2200000uu 4500 - 21099 - SzGeCERN - 20170803223648.0 - - oai:cds.cern.ch:21099 - cerncds:FULLTEXT - forINSPIRE - - - ADMADM - 0003711 - - - Inspire - 1614043 - - - eng - - - fre - - - COUNCIL-0702 - - - CERN/0702 - - - CM-P00077286-e - - - CM-P00078235-f - - - European Organization for Nuclear Research - - - Addendum to the Report on the Design Study of a 300 GeV Proton Synchrotron (CERN/563) (AR/Int. SG/64-15) - - - Supplément au Rapport sur le projet du synchrotron à prontons de 300 GeV (CERN/563) (Ar/Int. SG/64-15) - Titre français - - - 1967 - - - 1967-05-30 - - - SISARC-2009 - - - CLAS1 - - - Inspire - - - SzGeCERN - Accelerators and Storage Rings - - - Design Report - - - CERN - - - CERN SPS - - - http://cds.cern.ch/record/21099/files/CM-P00077286-e.pdf - English - - - http://cds.cern.ch/record/21099/files/CM-P00078235-f.pdf - French - - - n - 200319 - - - 60 - - - 20031203 - 0855 - MAN01 - 19990126 - - - PUBLIC - - - 000003711MAN - - - COUNCIL - - - -
- -
- oai:cds.cern.ch:60936 - 2017-07-27T21:14:28Z - cerncds:FULLTEXT - forINSPIRE -
- - - 00000coc 2200000uu 4500 - 60936 - SzGeCERN - 20170803223648.0 - - oai:cds.cern.ch:60936 - cerncds:FULLTEXT - forINSPIRE - - - ADMADM - 0009846 - - - Inspire - 1614044 - - - eng - - - fre - - - CERN/SPC/0426 - - - CM-P00095369-e - - - CM-P00098683-f - - - 19781023 - 104th Meeting of Scientific Policy Committee - CERN, Geneva, Switzerland - 23 - 24 Oct 1978 - 1978 - cern19781023 - 104 - CH - 19781024 - - - LEP Studies 1979 to 1981 - - - Les Etudes sur le LEP de 1979 -1981 - Titre français - - - 1978 - - - 1978-10-06 - - - 78/140/5 - - - SISARC-2009 - - - CLAS1 - - - Inspire - - - SzGeCERN - Accelerators and Storage Rings - - - Design Report - - - CERN - - - CERN LEP - - - http://cds.cern.ch/record/60936/files/CM-P00098683-f.pdf - French - - - http://cds.cern.ch/record/60936/files/CERN-SPC-426.pdf - English - - - n - 200319 - - - 62 - - - 20031203 - 0901 - MAN01 - 19990126 - - - PUBLIC - - - 000009846MAN - - - SPC - - - -
-
-
diff --git a/tests/functional/cds/test_cds.py b/tests/functional/cds/test_cds.py index 93c60ce3..3b825a31 100644 --- a/tests/functional/cds/test_cds.py +++ b/tests/functional/cds/test_cds.py @@ -7,164 +7,28 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. -"""Functional tests for ArXiv spider""" - -from __future__ import absolute_import, division, print_function +"""Functional tests for CDS spider""" import pytest +import requests_mock -from hepcrawl.testlib.tasks import app as celery_app -from hepcrawl.testlib.celery_monitor import CeleryMonitor -from hepcrawl.testlib.utils import get_crawler_instance, deep_sort -from hepcrawl.testlib.fixtures import ( - get_test_suite_path, - expected_json_results_from_file, - clean_dir, -) - - -def override_generated_fields(record): - record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' - record['acquisition_source']['submission_number'] = ( - u'5652c7f6190f11e79e8000224dabeaad' - ) - - return record - - -@pytest.fixture(scope="function") -def set_up_local_environment(): - package_location = get_test_suite_path( - 'cds', - 'fixtures', - 'oai_harvested', - 'cds_smoke_records.xml', - test_suite='functional', - ) - - yield { - 'CRAWLER_HOST_URL': 'http://scrapyd:6800', - 'CRAWLER_PROJECT': 'hepcrawl', - 'CRAWLER_ARGUMENTS': { - 'source_file': 'file://' + package_location, - } - } - - clean_dir() - - -@pytest.mark.parametrize( - 'expected_results', - [ - expected_json_results_from_file( - 'cds', - 'fixtures', - 'cds_smoke_records_expected.json', - ), - ], - ids=[ - 'smoke', - ] -) -def test_cds(set_up_local_environment, expected_results): - crawler = get_crawler_instance( - set_up_local_environment.get('CRAWLER_HOST_URL') - ) - - results = CeleryMonitor.do_crawl( - app=celery_app, - monitor_timeout=5, - monitor_iter_limit=100, - events_limit=1, - crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), - spider='CDS', - settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) - - results = deep_sort( - sorted( - results, - key=lambda result: result['titles'][0]['title'], - ) - ) - expected_results = deep_sort( - sorted( - expected_results, - key=lambda result: result['titles'][0]['title'], - ) - ) - - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [ - override_generated_fields(expected) for expected in expected_results - ] - - assert gotten_results == expected_results - - -@pytest.mark.parametrize( - 'expected_results', - [ - expected_json_results_from_file( - 'cds', - 'fixtures', - 'cds_smoke_records_expected.json', - ), - ], - ids=[ - 'crawl_twice', - ] -) -def test_cds_crawl_twice(set_up_local_environment, expected_results): - crawler = get_crawler_instance( - set_up_local_environment.get('CRAWLER_HOST_URL') - ) - - results = CeleryMonitor.do_crawl( - app=celery_app, - monitor_timeout=5, - monitor_iter_limit=20, - events_limit=1, - crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), - spider='CDS', - settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) - - results = deep_sort( - sorted( - results, - key=lambda result: result['titles'][0]['title'], - ) - ) - expected_results = deep_sort( - sorted( - expected_results, - key=lambda result: result['titles'][0]['title'], - ) - ) +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [ - override_generated_fields(expected) for expected in expected_results - ] +from hepcrawl.testlib.fixtures import get_test_suite_path - assert gotten_results == expected_results - results = CeleryMonitor.do_crawl( - app=celery_app, - monitor_timeout=5, - monitor_iter_limit=20, - crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), - spider='CDS', - settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) +@pytest.fixture +def cds_oai_server(): + with requests_mock.Mocker() as m: + m.get('http://cds.cern.ch/oai2d?from=2017-10-10&verb=ListRecords&set=forINSPIRE&metadataPrefix=marcxml', + text=open(get_test_suite_path('cds', 'fixtures', 'cds1.xml', test_suite='functional')).read()) + m.get('http://cds.cern.ch/oai2d?from=2017-10-10&verb=ListRecords&&resumptionToken=___kuYtYs', + text=open(get_test_suite_path('cds', 'fixtures', 'cds2.xml', test_suite='functional')).read()) + yield m - gotten_results = [override_generated_fields(result) for result in results] - assert gotten_results == [] +def test_cds(cds_oai_server): + process = CrawlerProcess(get_project_settings()) + process.crawl('CDS', from_date='2017-10-10') + process.start()