From fad1b504c32ffb161d53a3180260fa47c15cee1a Mon Sep 17 00:00:00 2001 From: Samuele Kaplun Date: Tue, 10 Oct 2017 15:46:54 +0200 Subject: [PATCH 01/21] create a OAI-PMH spider to use in CDS spider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also remove old tests. Fixes #197. Co-authored-by: Samuele Kaplun Signed-off-by: Szymon Łopaciuk --- hepcrawl/downloaders.py | 22 ++ hepcrawl/pipelines.py | 7 +- hepcrawl/scrapy.cfg | 2 +- hepcrawl/settings.py | 8 + hepcrawl/spiders/arxiv_spider.py | 2 +- hepcrawl/spiders/cds_spider.py | 68 +++-- hepcrawl/spiders/oaipmh_spider.py | 100 +++++++ setup.py | 1 + .../oai_harvested/cds_smoke_records.xml | 246 ------------------ tests/functional/cds/test_cds.py | 170 ++---------- 10 files changed, 187 insertions(+), 439 deletions(-) create mode 100644 hepcrawl/downloaders.py create mode 100644 hepcrawl/spiders/oaipmh_spider.py delete mode 100644 tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml diff --git a/hepcrawl/downloaders.py b/hepcrawl/downloaders.py new file mode 100644 index 00000000..49d9fba7 --- /dev/null +++ b/hepcrawl/downloaders.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. 
+ +"""Additional downloaders.""" + + +from scrapy.http import Response + + +class DummyDownloadHandler(object): + def __init__(self, *args, **kwargs): + pass + + def download_request(self, request, spider): + url = request.url + return Response(url, request=request) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index b30ff6c7..100cda5d 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -116,8 +116,11 @@ def process_item(self, item, spider): hep_record = self._post_enhance_item(item, spider) - validate(hep_record, 'hep') - spider.logger.debug('Validated item by Inspire Schemas.') + try: + validate(hep_record, 'hep') + spider.logger.debug('Validated item by Inspire Schemas.') + except Exception as err: + spider.logger.error('ERROR in validating {}: {}'.format(hep_record, err)) self.results_data.append(hep_record) diff --git a/hepcrawl/scrapy.cfg b/hepcrawl/scrapy.cfg index adffa153..1ec7711e 100644 --- a/hepcrawl/scrapy.cfg +++ b/hepcrawl/scrapy.cfg @@ -14,7 +14,7 @@ default = hepcrawl.settings [deploy] -url = http://scrapyd:6800/ +url = http://localhost:6800/ project = hepcrawl #username = scrapy #password = secret diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 0d9581c6..c5c11c4a 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -19,6 +19,8 @@ from __future__ import absolute_import, division, print_function +from scrapy.settings import default_settings + import os @@ -71,6 +73,12 @@ 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } +DOWNLOAD_HANDLERS_BASE = dict(default_settings.DOWNLOAD_HANDLERS_BASE) +DOWNLOAD_HANDLERS_BASE.update({ + 'oaipmh+http': 'hepcrawl.downloaders.DummyDownloadHandler', + 'oaipmh+https': 'hepcrawl.downloaders.DummyDownloadHandler', +}) + # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py index 
64d076dc..82e086fb 100644 --- a/hepcrawl/spiders/arxiv_spider.py +++ b/hepcrawl/spiders/arxiv_spider.py @@ -46,7 +46,7 @@ class ArxivSpider(StatefulSpider, XMLFeedSpider): """ name = 'arXiv' - iterator = 'xml' + iterator = 'iternodes' itertag = 'OAI-PMH:record' namespaces = [ ("OAI-PMH", "http://www.openarchives.org/OAI/2.0/") diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index 9353d6d3..25ff0071 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -9,8 +9,11 @@ """Spider for the CERN Document Server OAI-PMH interface""" -from scrapy.spider import XMLFeedSpider +import logging from scrapy import Request +from scrapy.http import XmlResponse +from scrapy.selector import Selector +from flask.app import Flask from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire from harvestingkit.bibrecord import ( create_record as create_bibrec, @@ -19,19 +22,19 @@ from dojson.contrib.marc21.utils import create_record from inspire_dojson.hep import hep -from . import StatefulSpider +from .oaipmh_spider import OAIPMHSpider from ..utils import ParsedItem +logger = logging.getLogger(__name__) -class CDSSpider(StatefulSpider, XMLFeedSpider): +class CDSSpider(OAIPMHSpider): """Spider for crawling the CERN Document Server OAI-PMH XML files. Example: Using OAI-PMH XML files:: - $ scrapy crawl \\ - cds \\ - -a "source_file=file://$PWD/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml" + $ scrapy crawl CDS \\ + -a "set=forINSPIRE" -a "from_date=2017-10-10" It uses `HarvestingKit `_ to translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. 
It then @@ -40,36 +43,29 @@ class CDSSpider(StatefulSpider, XMLFeedSpider): """ name = 'CDS' - iterator = 'xml' - itertag = 'OAI-PMH:record' - namespaces = [ - ('OAI-PMH', 'http://www.openarchives.org/OAI/2.0/'), - ('marc', 'http://www.loc.gov/MARC21/slim'), - ] - def __init__(self, source_file=None, **kwargs): - super(CDSSpider, self).__init__(**kwargs) - self.source_file = source_file + def __init__(self, from_date=None, set="forINSPIRE", *args, **kwargs): + super(CDSSpider, self).__init__(url='http://cds.cern.ch/oai2d', metadata_prefix='marcxml', set=set, from_date=from_date, **kwargs) - def start_requests(self): - yield Request(self.source_file) - - def parse_node(self, response, node): - node.remove_namespaces() - cds_bibrec, ok, errs = create_bibrec( - node.xpath('.//record').extract()[0] - ) - if not ok: - raise RuntimeError("Cannot parse record %s: %s", node, errs) - self.logger.info("Here's the record: %s" % cds_bibrec) - inspire_bibrec = CDS2Inspire(cds_bibrec).get_record() - marcxml_record = record_xml_output(inspire_bibrec) - record = create_record(marcxml_record) - json_record = hep.do(record) - base_uri = self.settings['SCHEMA_BASE_URI'] - json_record['$schema'] = base_uri + 'hep.json' - parsed_item = ParsedItem( - record=json_record, - record_format='hep', + def parse_record(self, record): + response = XmlResponse(self.url, encoding='utf-8', body=record.raw) + selector = Selector(response, type='xml') + selector.remove_namespaces() + try: + cds_bibrec, ok, errs = create_bibrec(selector.xpath('.//record').extract()[0]) + if not ok: + raise RuntimeError("Cannot parse record %s: %s", record, errs) + self.logger.info("Here's the record: %s" % cds_bibrec) + inspire_bibrec = CDS2Inspire(cds_bibrec).get_record() + marcxml_record = record_xml_output(inspire_bibrec) + record = create_record(marcxml_record) + app = Flask('hepcrawl') + app.config.update( + self.settings.getdict('MARC_TO_HEP_SETTINGS', {}) ) - return parsed_item + with app.app_context(): + 
json_record = hep.do(record) + return ParsedItem(record=json_record, record_format='hep') + except Exception: + logger.exception("Error when parsing record") + return None diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py new file mode 100644 index 00000000..375799e9 --- /dev/null +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2015, 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Generic spider for OAI-PMH servers.""" + +import logging +import sickle +from datetime import datetime + +from sickle import Sickle +from sickle.models import Record +from sickle.oaiexceptions import NoRecordsMatch + +from scrapy.http import Request +from scrapy.spiders import Spider + +logger = logging.getLogger(__name__) + +class OAIPMHSpider(Spider): + """ + Implements a spider for the OAI-PMH protocol by using the Python sickle library. + + In case of successful harvest (OAI-PMH crawling) the spider will remember the initial starting + date and will use it as `from_date` argument on the next harvest. 
+ """ + name = 'OAI-PMH' + state = {} + + def __init__(self, url, metadata_prefix='marcxml', set=None, alias=None, from_date=None, until_date=None, granularity='YYYY-MM-DD', record_class=Record, *args, **kwargs): + super(OAIPMHSpider, self).__init__(*args, **kwargs) + self.url = url + self.metadata_prefix = metadata_prefix + self.set = set + self.granularity = granularity + self.alias = alias or self._make_alias() + self.from_date = from_date + logger.info("Current state:{}".format(self.state)) + self.until_date = until_date + self.record_class = record_class + + def start_requests(self): + self.from_date = self.from_date or self.state.get(self.alias) + logger.info("Current state 2:{}".format(self.state)) + logger.info("Starting harvesting of {url} with set={set} and metadataPrefix={metadata_prefix}, from={from_date}, until={until_date}".format( + url=self.url, + set=self.set, + metadata_prefix=self.metadata_prefix, + from_date=self.from_date, + until_date=self.until_date + )) + now = datetime.utcnow() + request = Request('oaipmh+{}'.format(self.url), self.parse) + yield request + self.state[self.alias] = self._format_date(now) + logger.info("Harvesting completed. Next harvesting will resume from {}".format(self.state[self.alias])) + + def parse_record(self, record): + """ + This method need to be reimplemented in order to provide special parsing. 
+ """ + return record.xml + + def parse(self, response): + sickle = Sickle(self.url, class_mapping={ + 'ListRecords': self.record_class, + 'GetRecord': self.record_class, + }) + try: + records = sickle.ListRecords(**{ + 'metadataPrefix': self.metadata_prefix, + 'set': self.set, + 'from': self.from_date, + 'until': self.until_date, + }) + except NoRecordsMatch as err: + logger.warning(err) + raise StopIteration() + for record in records: + yield self.parse_record(record) + + def _format_date(self, datetime_object): + if self.granularity == 'YYYY-MM-DD': + return datetime_object.strftime('%Y-%m-%d') + elif self.granularity == 'YYYY-MM-DDThh:mm:ssZ': + return datetime_object.strftime('%Y-%m-%dT%H:%M:%SZ') + else: + raise RuntimeError("Invalid granularity: %s" % self.granularity) + + def _make_alias(self): + return '{url}-{metadata_prefix}-{set}'.format( + url=self.url, + metadata_prefix=self.metadata_prefix, + set=self.set + ) diff --git a/setup.py b/setup.py index 728d85f1..ba297d41 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ 'python-dateutil>=2.4.2', 'python-scrapyd-api>=2.0.1', 'harvestingkit>=0.6.12', + 'Sickle~=0.6,>=0.6.2', ] tests_require = [ diff --git a/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml b/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml deleted file mode 100644 index b3c521f6..00000000 --- a/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml +++ /dev/null @@ -1,246 +0,0 @@ - - - - 2017-10-04T07:25:58Z - http://cds.cern.ch/oai2d - - -
- oai:cds.cern.ch:21099 - 2017-07-27T21:14:27Z - cerncds:FULLTEXT - forINSPIRE -
- - - 00000coc 2200000uu 4500 - 21099 - SzGeCERN - 20170803223648.0 - - oai:cds.cern.ch:21099 - cerncds:FULLTEXT - forINSPIRE - - - ADMADM - 0003711 - - - Inspire - 1614043 - - - eng - - - fre - - - COUNCIL-0702 - - - CERN/0702 - - - CM-P00077286-e - - - CM-P00078235-f - - - European Organization for Nuclear Research - - - Addendum to the Report on the Design Study of a 300 GeV Proton Synchrotron (CERN/563) (AR/Int. SG/64-15) - - - Supplément au Rapport sur le projet du synchrotron à prontons de 300 GeV (CERN/563) (Ar/Int. SG/64-15) - Titre français - - - 1967 - - - 1967-05-30 - - - SISARC-2009 - - - CLAS1 - - - Inspire - - - SzGeCERN - Accelerators and Storage Rings - - - Design Report - - - CERN - - - CERN SPS - - - http://cds.cern.ch/record/21099/files/CM-P00077286-e.pdf - English - - - http://cds.cern.ch/record/21099/files/CM-P00078235-f.pdf - French - - - n - 200319 - - - 60 - - - 20031203 - 0855 - MAN01 - 19990126 - - - PUBLIC - - - 000003711MAN - - - COUNCIL - - - -
- -
- oai:cds.cern.ch:60936 - 2017-07-27T21:14:28Z - cerncds:FULLTEXT - forINSPIRE -
- - - 00000coc 2200000uu 4500 - 60936 - SzGeCERN - 20170803223648.0 - - oai:cds.cern.ch:60936 - cerncds:FULLTEXT - forINSPIRE - - - ADMADM - 0009846 - - - Inspire - 1614044 - - - eng - - - fre - - - CERN/SPC/0426 - - - CM-P00095369-e - - - CM-P00098683-f - - - 19781023 - 104th Meeting of Scientific Policy Committee - CERN, Geneva, Switzerland - 23 - 24 Oct 1978 - 1978 - cern19781023 - 104 - CH - 19781024 - - - LEP Studies 1979 to 1981 - - - Les Etudes sur le LEP de 1979 -1981 - Titre français - - - 1978 - - - 1978-10-06 - - - 78/140/5 - - - SISARC-2009 - - - CLAS1 - - - Inspire - - - SzGeCERN - Accelerators and Storage Rings - - - Design Report - - - CERN - - - CERN LEP - - - http://cds.cern.ch/record/60936/files/CM-P00098683-f.pdf - French - - - http://cds.cern.ch/record/60936/files/CERN-SPC-426.pdf - English - - - n - 200319 - - - 62 - - - 20031203 - 0901 - MAN01 - 19990126 - - - PUBLIC - - - 000009846MAN - - - SPC - - - -
-
-
diff --git a/tests/functional/cds/test_cds.py b/tests/functional/cds/test_cds.py index 93c60ce3..3b825a31 100644 --- a/tests/functional/cds/test_cds.py +++ b/tests/functional/cds/test_cds.py @@ -7,164 +7,28 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. -"""Functional tests for ArXiv spider""" - -from __future__ import absolute_import, division, print_function +"""Functional tests for CDS spider""" import pytest +import requests_mock -from hepcrawl.testlib.tasks import app as celery_app -from hepcrawl.testlib.celery_monitor import CeleryMonitor -from hepcrawl.testlib.utils import get_crawler_instance, deep_sort -from hepcrawl.testlib.fixtures import ( - get_test_suite_path, - expected_json_results_from_file, - clean_dir, -) - - -def override_generated_fields(record): - record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' - record['acquisition_source']['submission_number'] = ( - u'5652c7f6190f11e79e8000224dabeaad' - ) - - return record - - -@pytest.fixture(scope="function") -def set_up_local_environment(): - package_location = get_test_suite_path( - 'cds', - 'fixtures', - 'oai_harvested', - 'cds_smoke_records.xml', - test_suite='functional', - ) - - yield { - 'CRAWLER_HOST_URL': 'http://scrapyd:6800', - 'CRAWLER_PROJECT': 'hepcrawl', - 'CRAWLER_ARGUMENTS': { - 'source_file': 'file://' + package_location, - } - } - - clean_dir() - - -@pytest.mark.parametrize( - 'expected_results', - [ - expected_json_results_from_file( - 'cds', - 'fixtures', - 'cds_smoke_records_expected.json', - ), - ], - ids=[ - 'smoke', - ] -) -def test_cds(set_up_local_environment, expected_results): - crawler = get_crawler_instance( - set_up_local_environment.get('CRAWLER_HOST_URL') - ) - - results = CeleryMonitor.do_crawl( - app=celery_app, - monitor_timeout=5, - monitor_iter_limit=100, - events_limit=1, - crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), - spider='CDS', - settings={}, - 
**set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) - - results = deep_sort( - sorted( - results, - key=lambda result: result['titles'][0]['title'], - ) - ) - expected_results = deep_sort( - sorted( - expected_results, - key=lambda result: result['titles'][0]['title'], - ) - ) - - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [ - override_generated_fields(expected) for expected in expected_results - ] - - assert gotten_results == expected_results - - -@pytest.mark.parametrize( - 'expected_results', - [ - expected_json_results_from_file( - 'cds', - 'fixtures', - 'cds_smoke_records_expected.json', - ), - ], - ids=[ - 'crawl_twice', - ] -) -def test_cds_crawl_twice(set_up_local_environment, expected_results): - crawler = get_crawler_instance( - set_up_local_environment.get('CRAWLER_HOST_URL') - ) - - results = CeleryMonitor.do_crawl( - app=celery_app, - monitor_timeout=5, - monitor_iter_limit=20, - events_limit=1, - crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), - spider='CDS', - settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) - - results = deep_sort( - sorted( - results, - key=lambda result: result['titles'][0]['title'], - ) - ) - expected_results = deep_sort( - sorted( - expected_results, - key=lambda result: result['titles'][0]['title'], - ) - ) +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [ - override_generated_fields(expected) for expected in expected_results - ] +from hepcrawl.testlib.fixtures import get_test_suite_path - assert gotten_results == expected_results - results = CeleryMonitor.do_crawl( - app=celery_app, - monitor_timeout=5, - monitor_iter_limit=20, - crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), - spider='CDS', - settings={}, - 
**set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) +@pytest.fixture +def cds_oai_server(): + with requests_mock.Mocker() as m: + m.get('http://cds.cern.ch/oai2d?from=2017-10-10&verb=ListRecords&set=forINSPIRE&metadataPrefix=marcxml', + text=open(get_test_suite_path('cds', 'fixtures', 'cds1.xml', test_suite='functional')).read()) + m.get('http://cds.cern.ch/oai2d?from=2017-10-10&verb=ListRecords&&resumptionToken=___kuYtYs', + text=open(get_test_suite_path('cds', 'fixtures', 'cds2.xml', test_suite='functional')).read()) + yield m - gotten_results = [override_generated_fields(result) for result in results] - assert gotten_results == [] +def test_cds(cds_oai_server): + process = CrawlerProcess(get_project_settings()) + process.crawl('CDS', from_date='2017-10-10') + process.start() From 33c3ae5c845a2350f8ff5436ed43abe1a908a54d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 7 Dec 2017 17:15:35 +0100 Subject: [PATCH 02/21] refactor, test contents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/scrapy.cfg | 2 +- hepcrawl/settings.py | 7 +- hepcrawl/spiders/arxiv_spider.py | 2 +- hepcrawl/spiders/cds_spider.py | 13 +- hepcrawl/spiders/oaipmh_spider.py | 36 +- tests/functional/cds/fixtures/cds.xml | 1480 +++++++++++++++++ .../functional/cds/fixtures/cds_expected.json | 1369 +++++++++++++++ .../fixtures/cds_smoke_records_expected.json | 153 -- tests/functional/cds/test_cds.py | 73 +- 9 files changed, 2954 insertions(+), 181 deletions(-) create mode 100644 tests/functional/cds/fixtures/cds.xml create mode 100644 tests/functional/cds/fixtures/cds_expected.json delete mode 100644 tests/functional/cds/fixtures/cds_smoke_records_expected.json diff --git a/hepcrawl/scrapy.cfg b/hepcrawl/scrapy.cfg index 1ec7711e..adffa153 100644 --- a/hepcrawl/scrapy.cfg +++ b/hepcrawl/scrapy.cfg @@ -14,7 +14,7 @@ default = hepcrawl.settings [deploy] -url = http://localhost:6800/ 
+url = http://scrapyd:6800/ project = hepcrawl #username = scrapy #password = secret diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index c5c11c4a..025e7186 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -73,11 +73,12 @@ 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } -DOWNLOAD_HANDLERS_BASE = dict(default_settings.DOWNLOAD_HANDLERS_BASE) -DOWNLOAD_HANDLERS_BASE.update({ +# Configure custom downloaders +# See https://doc.scrapy.org/en/0.20/topics/settings.html#download-handlers +DOWNLOAD_HANDLERS = { 'oaipmh+http': 'hepcrawl.downloaders.DummyDownloadHandler', 'oaipmh+https': 'hepcrawl.downloaders.DummyDownloadHandler', -}) +} # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py index 82e086fb..64d076dc 100644 --- a/hepcrawl/spiders/arxiv_spider.py +++ b/hepcrawl/spiders/arxiv_spider.py @@ -46,7 +46,7 @@ class ArxivSpider(StatefulSpider, XMLFeedSpider): """ name = 'arXiv' - iterator = 'iternodes' + iterator = 'xml' itertag = 'OAI-PMH:record' namespaces = [ ("OAI-PMH", "http://www.openarchives.org/OAI/2.0/") diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index 25ff0071..c11d3d77 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -34,7 +34,7 @@ class CDSSpider(OAIPMHSpider): Using OAI-PMH XML files:: $ scrapy crawl CDS \\ - -a "set=forINSPIRE" -a "from_date=2017-10-10" + -a "oai_set=forINSPIRE" -a "from_date=2017-10-10" It uses `HarvestingKit `_ to translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. 
It then @@ -44,8 +44,13 @@ class CDSSpider(OAIPMHSpider): name = 'CDS' - def __init__(self, from_date=None, set="forINSPIRE", *args, **kwargs): - super(CDSSpider, self).__init__(url='http://cds.cern.ch/oai2d', metadata_prefix='marcxml', set=set, from_date=from_date, **kwargs) + def __init__(self, from_date=None, oai_set="forINSPIRE", *args, **kwargs): + super(CDSSpider, self).__init__( + url='http://cds.cern.ch/oai2d', + metadata_prefix='marcxml', + oai_set=oai_set, + from_date=from_date, + **kwargs) def parse_record(self, record): response = XmlResponse(self.url, encoding='utf-8', body=record.raw) @@ -65,6 +70,8 @@ def parse_record(self, record): ) with app.app_context(): json_record = hep.do(record) + base_uri = self.settings['SCHEMA_BASE_URI'] + json_record['$schema'] = base_uri + 'hep.json' return ParsedItem(record=json_record, record_format='hep') except Exception: logger.exception("Error when parsing record") diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index 375799e9..3bd429e1 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -10,7 +10,7 @@ """Generic spider for OAI-PMH servers.""" import logging -import sickle +from enum import Enum from datetime import datetime from sickle import Sickle @@ -22,6 +22,19 @@ logger = logging.getLogger(__name__) + +class _Granularity(Enum): + DATE = 'YYYY-MM-DD' + SECOND = 'YYYY-MM-DDThh:mm:ssZ' + + def format(self, datetime_object): + if self == self.DATE: + return datetime_object.strftime('%Y-%m-%d') + if self == self.SECOND: + return datetime_object.strftime('%Y-%m-%dT%H:%M:%SZ') + raise ValueError("Invalid granularity: %s" % self.granularity) + + class OAIPMHSpider(Spider): """ Implements a spider for the OAI-PMH protocol by using the Python sickle library. 
@@ -31,12 +44,15 @@ class OAIPMHSpider(Spider): """ name = 'OAI-PMH' state = {} + granularity = _Granularity.DATE - def __init__(self, url, metadata_prefix='marcxml', set=None, alias=None, from_date=None, until_date=None, granularity='YYYY-MM-DD', record_class=Record, *args, **kwargs): + def __init__(self, url, metadata_prefix='marcxml', oai_set=None, alias=None, + from_date=None, until_date=None, granularity='', + record_class=Record, *args, **kwargs): super(OAIPMHSpider, self).__init__(*args, **kwargs) self.url = url self.metadata_prefix = metadata_prefix - self.set = set + self.set = oai_set self.granularity = granularity self.alias = alias or self._make_alias() self.from_date = from_date @@ -47,7 +63,9 @@ def __init__(self, url, metadata_prefix='marcxml', set=None, alias=None, from_da def start_requests(self): self.from_date = self.from_date or self.state.get(self.alias) logger.info("Current state 2:{}".format(self.state)) - logger.info("Starting harvesting of {url} with set={set} and metadataPrefix={metadata_prefix}, from={from_date}, until={until_date}".format( + logger.info("Starting harvesting of {url} with set={set} and " + "metadataPrefix={metadata_prefix}, from={from_date}, " + "until={until_date}".format( url=self.url, set=self.set, metadata_prefix=self.metadata_prefix, @@ -57,7 +75,7 @@ def start_requests(self): now = datetime.utcnow() request = Request('oaipmh+{}'.format(self.url), self.parse) yield request - self.state[self.alias] = self._format_date(now) + self.state[self.alias] = self.granularity.format(now) logger.info("Harvesting completed. 
Next harvesting will resume from {}".format(self.state[self.alias])) def parse_record(self, record): @@ -84,14 +102,6 @@ def parse(self, response): for record in records: yield self.parse_record(record) - def _format_date(self, datetime_object): - if self.granularity == 'YYYY-MM-DD': - return datetime_object.strftime('%Y-%m-%d') - elif self.granularity == 'YYYY-MM-DDThh:mm:ssZ': - return datetime_object.strftime('%Y-%m-%dT%H:%M:%SZ') - else: - raise RuntimeError("Invalid granularity: %s" % self.granularity) - def _make_alias(self): return '{url}-{metadata_prefix}-{set}'.format( url=self.url, diff --git a/tests/functional/cds/fixtures/cds.xml b/tests/functional/cds/fixtures/cds.xml new file mode 100644 index 00000000..9bec8576 --- /dev/null +++ b/tests/functional/cds/fixtures/cds.xml @@ -0,0 +1,1480 @@ + + + +2017-12-07T15:05:26Zhttp://cds.cern.ch/oai2d +
oai:cds.cern.ch:12007522017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1200752 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200752 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509577 + + + eng + + + Dubus, G + Grenoble Observ. + + + High and very high energy gamma-ray emission from binaries + + + 2009 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + 018 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200752/files/MQW7_018.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 018 + izmir20080901 + + + PUBLIC + + + 002842486CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12007532017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1200753 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200753 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509578 + + + eng + + + Dubois, R + SLAC + + + GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars + + + Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars + Other title + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + No authors + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + for the Fermi LAT Collaboration + + + 019 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200753/files/MQW7_019.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 019 + izmir20080901 + + + PUBLIC + + + 002842487CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12007542017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1200754 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200754 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509579 + + + eng + + + Romero, G E + Villa Elisa, Inst. Argentino Radioastron. + La Plata U. + + + Hadronic models of high-energy radiation from microquasars: recent developments + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + 020 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200754/files/MQW7_020.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 020 + izmir20080901 + + + PUBLIC + + + 002842488CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12032802017-11-16T08:09:52Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203280 + SzGeCERN + 20171116090952.0 + + oai:cds.cern.ch:1203280 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509595 + + + eng + + + Guess, C J + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + Studying matrix elements for the neutrinoless double beta decay of 150Nd via the 150Sm(t,3He)150Pm* and 150Nd(3He,t)150Pm* reactions + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Austin, S M + Michigan State U., NSCL + Michigan State U., JINA + + + Bazin, D + Michigan State U., NSCL + + + Brown, B A + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + Caesar, C + Michigan State U., NSCL + Mainz U. + + + Deaven, J M + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + Herlitzius, C + Michigan State U., NSCL + Mainz U. + + + Hitt, G W + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + Meharchand, R T + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + Perdikakis, G + Michigan State U., NSCL + Michigan State U., JINA + + + Shimbara, Y + Niigata U., Grad. Sch. Sci. Tech. + + + Tur, C + Michigan State U., NSCL + Michigan State U., JINA + + + Zegers, R G T + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + 104 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203280/files/NIC20X_104.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 104 + mackinacisland20080727 + + + PUBLIC + + + 002844587CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12032812017-11-16T08:09:55Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203281 + SzGeCERN + 20171116090955.0 + + oai:cds.cern.ch:1203281 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509596 + + + eng + + + Jachowicz, N + Ghent U. + + + Untangling supernova-neutrino oscillations with beta-beam data + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + McLaughlin, G C + North Carolina State U. + + + Volpe, C + Orsay, IPN + + + 107 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203281/files/NIC20X_107.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 107 + mackinacisland20080727 + + + PUBLIC + + + 002844588CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033612017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203361 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203361 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509597 + + + eng + + + Kawagoe, S + Tokyo U. + + + Neutrino oscillations in non-spherical supernova explosions + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Takiwaki, T + Tokyo U. + + + Kotake, K + Natl. Astron. Observ. of Japan + + + 109 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203361/files/NIC20X_109.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 109 + mackinacisland20080727 + + + PUBLIC + + + 002844668CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033622017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203362 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203362 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509598 + + + eng + + + Nakazato, K + Waseda U. + + + Neutrino Emission from Stellar Collapse including Hadron-Quark Mixed Phase + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Sumiyoshi, K + Numazu Coll. Tech. + + + Yamada, s + Waseda U. + + + 116 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203362/files/NIC20X_116.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 116 + mackinacisland20080727 + + + PUBLIC + + + 002844669CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033632017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203363 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203363 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509599 + + + eng + + + Sumiyoshi, K + Numazu Coll. Tech. + + + Short neutrino burst from failed supernovae as a probe of dense matter with hyperon mixture + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Ishizuka, C + Hokkaido U. + + + Ohnishi, A + Kyoto U., Yukawa Inst., Kyoto + + + Yamada, S + Waseda U. + + + Suzuki, H + Tokyo U. of Sci. + + + 122 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203363/files/NICX_122.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 122 + mackinacisland20080727 + + + PUBLIC + + + 002844670CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033642017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203364 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203364 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509600 + + + eng + + + Suzuki, T + Tokyo U. + + + Neutrino Nucleus Reactions and Nucleosynthesis in Stars + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Yoshida, T + Natl. Astron. Observ. of Japan + + + Chiba, S + JAEA, Ibaraki + + + Honma, M + Aizu U. + + + Higashiyama, K + Chiba Inst. Tech. + + + Umeda, H + Tokyo U. + + + Nomoto, K + Tokyo U. + + + Kajino, T + Tokyo U. + Natl. Astron. Observ. of Japan + + + Otsuka, T + Tokyo U. + + + 123 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203364/files/NICX_123.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 123 + mackinacisland20080727 + + + PUBLIC + + + 002844671CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033652017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203365 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203365 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509601 + + + eng + + + Whitehouse, S + Basel U. + + + Neutrino transport in 3D simulations of core-collapse supernovae + + + A new approach to neutrino transport in 3D simulations of core-collapse supernovae + Other title + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Liebendörfer, M + Basel U. + + + 243 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203365/files/NICX_243.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 243 + mackinacisland20080727 + + + PUBLIC + + + 002844672CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033662017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203366 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203366 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509602 + + + eng + + + Arcones, A + Damstadt, Tech. Hochsch. + Darmstadt, GSI + + + Neutrino-driven winds and nucleosynthesis + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Martínez-Pinedo, G + Darmstadt, GSI + + + Schwenk, A + TRIUMF + + + O’Connor, E + TRIUMF + Caltech + + + Langanke, K + Damstadt, Tech. Hochsch. + Darmstadt, GSI + + + Horowitz, C J + Indiana U. + + + Janka, H T + Garching, Max Planck Inst. + + + 128 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203366/files/NIC20X_128.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 128 + mackinacisland20080727 + + + PUBLIC + + + 002844673CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033672017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203367 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203367 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509603 + + + eng + + + Roberts, L + UC, Santa Cruz, Astron. Astrophys. + + + Nucleosynthesis in the Neutrino Driven Wind of Protoneutron Stars + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Woosley, S + UC, Santa Cruz, Astron. Astrophys. + + + Heger, A + Minnesota U. + + + Hoffman, R + LLNL, Livermore + + + 146 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203367/files/NICX_146.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 146 + mackinacisland20080727 + + + PUBLIC + + + 002844674CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033692017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203369 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203369 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509604 + + + eng + + + Kojima, K + Tokyo U. + Natl. Astron. Observ. of Japan + + + Neutrino effect in cosmology with the primordial magnetic field + + + Neutrino effects in cosmology with A primordial magnetic field + Other title + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Ichiki, K + Nagoya U. + + + Kajino, T + Tokyo U. + Natl. Astron. Observ. of Japan + + + Mathews, G J + Notre Dame U. + Natl. Astron. Observ. of Japan + + + 226 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203369/files/NICX_226.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 226 + mackinacisland20080727 + + + PUBLIC + + + 002844676CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033702017-11-16T08:09:47Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203370 + SzGeCERN + 20171116090947.0 + + oai:cds.cern.ch:1203370 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509605 + + + eng + + + Yamazaki, D G + Natl. Astron. Observ. of Japan + + + A Strong Constraint on the Neutrino Mass from the Formation of Large Scale Structure in the Presence of the Primordial Magnetic Field + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Ichiki, K + Tokyo U. + + + Kajino, T + Natl. Astron. Observ. of Japan + + + Mathews, G J + Notre Dame U. + + + 239 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203370/files/NICX_239.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 239 + mackinacisland20080727 + + + PUBLIC + + + 002844677CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
+
+ diff --git a/tests/functional/cds/fixtures/cds_expected.json b/tests/functional/cds/fixtures/cds_expected.json new file mode 100644 index 00000000..cfb94ced --- /dev/null +++ b/tests/functional/cds/fixtures/cds_expected.json @@ -0,0 +1,1369 @@ +[ + { + "refereed": true, + "core": true, + "preprint_date": "2009", + "documents": [ + { + "url": "http://cds.cern.ch/record/1200752/files/MQW7_018.pdf", + "key": "document" + } + ], + "citeable": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:16.980315", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "High and very high energy gamma-ray emission from binaries" + } + ], + "publication_info": [ + { + "journal_volume": "MQW7", + "page_start": "018", + "year": 2008, + "artid": "018", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Grenoble Observ." + } + ], + "full_name": "Dubus, G." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1200752", + "schema": "CDS" + }, + { + "value": "1509577", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "curated": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1200753/files/MQW7_019.pdf", + "key": "document" + } + ], + "citeable": true, + "_collections": [ + "Literature" + ], + "collaborations": [ + { + "value": "Fermi LAT" + } + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.101983", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars" + }, + { + "title": "Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars" + } + ], + "publication_info": [ + { + "journal_volume": "MQW7", + "page_start": "019", + "year": 2008, + "artid": "019", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "SLAC" + } + ], + "full_name": "Dubois, R." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1200753", + "schema": "CDS" + }, + { + "value": "1509578", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "curated": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1200754/files/MQW7_020.pdf", + "key": "document" + } + ], + "citeable": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.153496", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Hadronic models of high-energy radiation from microquasars: recent developments" + } + ], + "publication_info": [ + { + "journal_volume": "MQW7", + "page_start": "020", + "year": 2008, + "artid": "020", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Villa Elisa, Inst. Argentino Radioastron." + }, + { + "value": "La Plata U." + } + ], + "full_name": "Romero, G. E." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1200754", + "schema": "CDS" + }, + { + "value": "1509579", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "curated": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203280/files/NIC20X_104.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.209845", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Studying matrix elements for the neutrinoless double beta decay of 150Nd via the 150Sm(t,3He)150Pm* and 150Nd(3He,t)150Pm* reactions" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "104", + "year": 2008, + "artid": "104", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Michigan U." + }, + { + "value": "Michigan State U., JINA" + } + ], + "full_name": "Guess, C. J." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Michigan State U., JINA" + } + ], + "full_name": "Austin, S. M." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + } + ], + "full_name": "Bazin, D." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Michigan U." + }, + { + "value": "Michigan State U., JINA" + } + ], + "full_name": "Brown, B. A." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Mainz U." + } + ], + "full_name": "Caesar, C." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Michigan U." 
+ }, + { + "value": "Michigan State U., JINA" + } + ], + "full_name": "Deaven, J. M." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Mainz U." + } + ], + "full_name": "Herlitzius, C." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Michigan U." + }, + { + "value": "Michigan State U., JINA" + } + ], + "full_name": "Hitt, G. W." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Michigan U." + }, + { + "value": "Michigan State U., JINA" + } + ], + "full_name": "Meharchand, R. T." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Michigan State U., JINA" + } + ], + "full_name": "Perdikakis, G." + }, + { + "affiliations": [ + { + "value": "Niigata U., Grad. Sch. Sci. Tech." + } + ], + "full_name": "Shimbara, Y." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Michigan State U., JINA" + } + ], + "full_name": "Tur, C." + }, + { + "affiliations": [ + { + "value": "Michigan State U., NSCL" + }, + { + "value": "Michigan U." + }, + { + "value": "Michigan State U., JINA" + } + ], + "full_name": "Zegers, R. G. T." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203280", + "schema": "CDS" + }, + { + "value": "1509595", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203281/files/NIC20X_107.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.271385", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Untangling supernova-neutrino oscillations with beta-beam data" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "107", + "year": 2008, + "artid": "107", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Ghent U." + } + ], + "full_name": "Jachowicz, N." + }, + { + "affiliations": [ + { + "value": "North Carolina State U." + } + ], + "full_name": "McLaughlin, G. C." + }, + { + "affiliations": [ + { + "value": "Orsay, IPN" + } + ], + "full_name": "Volpe, C." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203281", + "schema": "CDS" + }, + { + "value": "1509596", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203361/files/NIC20X_109.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.324204", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Neutrino oscillations in non-spherical supernova explosions" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "109", + "year": 2008, + "artid": "109", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Tokyo U." + } + ], + "full_name": "Kawagoe, S." + }, + { + "affiliations": [ + { + "value": "Tokyo U." + } + ], + "full_name": "Takiwaki, T." + }, + { + "affiliations": [ + { + "value": "Natl. Astron. Observ. of Japan" + } + ], + "full_name": "Kotake, K." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203361", + "schema": "CDS" + }, + { + "value": "1509597", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203362/files/NIC20X_116.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.378354", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Neutrino Emission from Stellar Collapse including Hadron-Quark Mixed Phase" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "116", + "year": 2008, + "artid": "116", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Waseda U." + } + ], + "full_name": "Nakazato, K." + }, + { + "affiliations": [ + { + "value": "Numazu Coll. Tech." + } + ], + "full_name": "Sumiyoshi, K." + }, + { + "affiliations": [ + { + "value": "Waseda U." + } + ], + "full_name": "Yamada, s." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203362", + "schema": "CDS" + }, + { + "value": "1509598", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203363/files/NICX_122.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.431432", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Short neutrino burst from failed supernovae as a probe of dense matter with hyperon mixture" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "122", + "year": 2008, + "artid": "122", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Numazu Coll. Tech." + } + ], + "full_name": "Sumiyoshi, K." + }, + { + "affiliations": [ + { + "value": "Hokkaido U." + } + ], + "full_name": "Ishizuka, C." + }, + { + "affiliations": [ + { + "value": "Kyoto U., Yukawa Inst., Kyoto" + } + ], + "full_name": "Ohnishi, A." + }, + { + "affiliations": [ + { + "value": "Waseda U." + } + ], + "full_name": "Yamada, S." + }, + { + "affiliations": [ + { + "value": "Tokyo U. of Sci." + } + ], + "full_name": "Suzuki, H." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203363", + "schema": "CDS" + }, + { + "value": "1509599", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203364/files/NICX_123.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.486814", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Neutrino Nucleus Reactions and Nucleosynthesis in Stars" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "123", + "year": 2008, + "artid": "123", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Tokyo U." + } + ], + "full_name": "Suzuki, T." + }, + { + "affiliations": [ + { + "value": "Natl. Astron. Observ. of Japan" + } + ], + "full_name": "Yoshida, T." + }, + { + "affiliations": [ + { + "value": "JAEA, Ibaraki" + } + ], + "full_name": "Chiba, S." + }, + { + "affiliations": [ + { + "value": "Aizu U." + } + ], + "full_name": "Honma, M." + }, + { + "affiliations": [ + { + "value": "Chiba Inst. Tech." + } + ], + "full_name": "Higashiyama, K." + }, + { + "affiliations": [ + { + "value": "Tokyo U." + } + ], + "full_name": "Umeda, H." + }, + { + "affiliations": [ + { + "value": "Tokyo U." + } + ], + "full_name": "Nomoto, K." + }, + { + "affiliations": [ + { + "value": "Tokyo U." + }, + { + "value": "Natl. Astron. Observ. of Japan" + } + ], + "full_name": "Kajino, T." + }, + { + "affiliations": [ + { + "value": "Tokyo U." + } + ], + "full_name": "Otsuka, T." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203364", + "schema": "CDS" + }, + { + "value": "1509600", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203365/files/NICX_243.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.541467", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Neutrino transport in 3D simulations of core-collapse supernovae" + }, + { + "title": "A new approach to neutrino transport in 3D simulations of core-collapse supernovae" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "243", + "year": 2008, + "artid": "243", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Basel U." + } + ], + "full_name": "Whitehouse, S." + }, + { + "affiliations": [ + { + "value": "Basel U." + } + ], + "full_name": "Liebendörfer, M." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203365", + "schema": "CDS" + }, + { + "value": "1509601", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203366/files/NIC20X_128.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.595372", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Neutrino-driven winds and nucleosynthesis" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "128", + "year": 2008, + "artid": "128", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Damstadt, Tech. Hochsch." + }, + { + "value": "Darmstadt, GSI" + } + ], + "full_name": "Arcones, A." + }, + { + "affiliations": [ + { + "value": "Darmstadt, GSI" + } + ], + "full_name": "Martínez-Pinedo, G." + }, + { + "affiliations": [ + { + "value": "TRIUMF" + } + ], + "full_name": "Schwenk, A." + }, + { + "affiliations": [ + { + "value": "TRIUMF" + }, + { + "value": "Caltech" + } + ], + "full_name": "O’Connor, E." + }, + { + "affiliations": [ + { + "value": "Damstadt, Tech. Hochsch." + }, + { + "value": "Darmstadt, GSI" + } + ], + "full_name": "Langanke, K." + }, + { + "affiliations": [ + { + "value": "Indiana U." + } + ], + "full_name": "Horowitz, C. J." + }, + { + "affiliations": [ + { + "value": "Garching, Max Planck Inst." + } + ], + "full_name": "Janka, H. T." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203366", + "schema": "CDS" + }, + { + "value": "1509602", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203367/files/NICX_146.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.650283", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Nucleosynthesis in the Neutrino Driven Wind of Protoneutron Stars" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "146", + "year": 2008, + "artid": "146", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "UC, Santa Cruz, Astron. Astrophys." + } + ], + "full_name": "Roberts, L." + }, + { + "affiliations": [ + { + "value": "UC, Santa Cruz, Astron. Astrophys." + } + ], + "full_name": "Woosley, S." + }, + { + "affiliations": [ + { + "value": "Minnesota U." + } + ], + "full_name": "Heger, A." + }, + { + "affiliations": [ + { + "value": "LLNL, Livermore" + } + ], + "full_name": "Hoffman, R." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203367", + "schema": "CDS" + }, + { + "value": "1509603", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203369/files/NICX_226.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.704338", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "Neutrino effect in cosmology with the primordial magnetic field" + }, + { + "title": "Neutrino effects in cosmology with A primordial magnetic field" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "226", + "year": 2008, + "artid": "226", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Tokyo U." + }, + { + "value": "Natl. Astron. Observ. of Japan" + } + ], + "full_name": "Kojima, K." + }, + { + "affiliations": [ + { + "value": "Nagoya U." + } + ], + "full_name": "Ichiki, K." + }, + { + "affiliations": [ + { + "value": "Tokyo U." + }, + { + "value": "Natl. Astron. Observ. of Japan" + } + ], + "full_name": "Kajino, T." + }, + { + "affiliations": [ + { + "value": "Notre Dame U." + }, + { + "value": "Natl. Astron. Observ. of Japan" + } + ], + "full_name": "Mathews, G. J." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203369", + "schema": "CDS" + }, + { + "value": "1509604", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + }, + { + "refereed": true, + "core": true, + "preprint_date": "2008", + "documents": [ + { + "url": "http://cds.cern.ch/record/1203370/files/NICX_239.pdf", + "key": "document" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "acquisition_source": { + "source": "CDS", + "datetime": "2017-12-07T15:54:17.758624", + "method": "hepcrawl", + "submission_number": "None" + }, + "inspire_categories": [ + { + "term": "Astrophysics" + } + ], + "titles": [ + { + "title": "A Strong Constraint on the Neutrino Mass from the Formation of Large Scale Structure in the Presence of the Primordial Magnetic Field" + } + ], + "publication_info": [ + { + "journal_volume": "NIC X", + "page_start": "239", + "year": 2008, + "artid": "239", + "journal_title": "PoS" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Natl. Astron. Observ. of Japan" + } + ], + "full_name": "Yamazaki, D. G." + }, + { + "affiliations": [ + { + "value": "Tokyo U." + } + ], + "full_name": "Ichiki, K." + }, + { + "affiliations": [ + { + "value": "Natl. Astron. Observ. of Japan" + } + ], + "full_name": "Kajino, T." + }, + { + "affiliations": [ + { + "value": "Notre Dame U." + } + ], + "full_name": "Mathews, G. J." 
+ } + ], + "external_system_identifiers": [ + { + "value": "1203370", + "schema": "CDS" + }, + { + "value": "1509605", + "schema": "Inspire" + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true + } +] diff --git a/tests/functional/cds/fixtures/cds_smoke_records_expected.json b/tests/functional/cds/fixtures/cds_smoke_records_expected.json deleted file mode 100644 index f6f6a7f8..00000000 --- a/tests/functional/cds/fixtures/cds_smoke_records_expected.json +++ /dev/null @@ -1,153 +0,0 @@ -[ - { - "$schema": "http://localhost/schemas/records/hep.json", - "_collections": [ - "Literature" - ], - "accelerator_experiments": [ - { - "legacy_name": "CERN-SPS---" - } - ], - "acquisition_source": { - "datetime": "2017-10-04T14:07:59.746165", - "method": "hepcrawl", - "source": "CDS", - "submission_number": "None" - }, - "core": true, - "curated": true, - "corporate_author": [ - "European Organization for Nuclear Research" - ], - "documents": [ - { - "url": "http://cds.cern.ch/record/21099/files/CM-P00077286-e.pdf", - "key": "document" - }, - { - "url": "http://cds.cern.ch/record/21099/files/CM-P00078235-f.pdf", - "key": "1_document" - } - ], - "document_type": [ - "article" - ], - "external_system_identifiers": [ - { - "schema": "Inspire", - "value": "1614043" - }, - { - "schema": "ADMADM", - "value": "0003711" - }, - { - "schema": "CDS", - "value": "21099" - } - ], - "inspire_categories": [ - { - "term": "Accelerators" - } - ], - "languages": [ - "fr" - ], - "preprint_date": "1967-05-30", - "report_numbers": [ - { - "value": "CERN/0702" - }, - { - "value": "CM-P00077286-e" - }, - { - "value": "CM-P00078235-f" - } - ], - "titles": [ - { - "title": "Addendum to the Report on the Design Study of a 300 GeV Proton Synchrotron (CERN/563) (AR/Int. SG/64-15)" - }, - { - "title": "Suppl\u00e9ment au Rapport sur le projet du synchrotron \u00e0 prontons de 300 GeV (CERN/563) (Ar/Int. 
SG/64-15)" - } - ] - }, - { - "$schema": "http://localhost/schemas/records/hep.json", - "_collections": [ - "Literature" - ], - "accelerator_experiments": [ - { - "legacy_name": "CERN-LEP---" - } - ], - "acquisition_source": { - "datetime": "2017-10-04T14:07:59.783028", - "method": "hepcrawl", - "source": "CDS", - "submission_number": "None" - }, - "core": true, - "curated": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/60936/files/CM-P00098683-f.pdf", - "key": "document" - }, - { - "url": "http://cds.cern.ch/record/60936/files/CERN-SPC-426.pdf", - "key": "1_document" - } - ], - "document_type": [ - "article" - ], - "external_system_identifiers": [ - { - "schema": "ADMADM", - "value": "0009846" - }, - { - "schema": "Inspire", - "value": "1614044" - }, - { - "schema": "CDS", - "value": "60936" - } - ], - "inspire_categories": [ - { - "term": "Accelerators" - } - ], - "languages": [ - "fr" - ], - "preprint_date": "1978-10-06", - "report_numbers": [ - { - "value": "CERN/SPC/0426" - }, - { - "value": "CM-P00095369-e" - }, - { - "value": "CM-P00098683-f" - } - ], - "titles": [ - { - "title": "LEP Studies 1979 to 1981" - }, - { - "title": "Les Etudes sur le LEP de 1979 -1981" - } - ] - } -] diff --git a/tests/functional/cds/test_cds.py b/tests/functional/cds/test_cds.py index 3b825a31..02ec2cd4 100644 --- a/tests/functional/cds/test_cds.py +++ b/tests/functional/cds/test_cds.py @@ -12,23 +12,82 @@ import pytest import requests_mock +import copy +import json from scrapy.crawler import CrawlerProcess from scrapy.utils.project import get_project_settings +from tempfile import NamedTemporaryFile +from twisted.internet import reactor -from hepcrawl.testlib.fixtures import get_test_suite_path +from hepcrawl.testlib.fixtures import ( + get_test_suite_path, + expected_json_results_from_file, +) @pytest.fixture def cds_oai_server(): with requests_mock.Mocker() as m: - 
m.get('http://cds.cern.ch/oai2d?from=2017-10-10&verb=ListRecords&set=forINSPIRE&metadataPrefix=marcxml', - text=open(get_test_suite_path('cds', 'fixtures', 'cds1.xml', test_suite='functional')).read()) - m.get('http://cds.cern.ch/oai2d?from=2017-10-10&verb=ListRecords&&resumptionToken=___kuYtYs', - text=open(get_test_suite_path('cds', 'fixtures', 'cds2.xml', test_suite='functional')).read()) + m.get('http://cds.cern.ch/oai2d?from=2017-11-15&verb=ListRecords&set=forINSPIRE&metadataPrefix=marcxml', + text=open(get_test_suite_path('cds', 'fixtures', 'cds.xml', test_suite='functional')).read()) yield m +def override_dynamic_fields_on_records(records): + clean_records = [] + for record in records: + clean_record = override_dynamic_fields_on_record(record) + clean_records.append(clean_record) + + return clean_records + + +def override_dynamic_fields_on_record(record): + def _override(field_key, original_dict, backup_dict, new_value): + backup_dict[field_key] = original_dict[field_key] + original_dict[field_key] = new_value + + clean_record = copy.deepcopy(record) + overriden_fields = {} + dummy_random_date = u'2017-04-03T10:26:40.365216' + + overriden_fields['acquisition_source'] = {} + _override( + field_key='datetime', + original_dict=clean_record['acquisition_source'], + backup_dict=overriden_fields['acquisition_source'], + new_value=dummy_random_date, + ) + _override( + field_key='submission_number', + original_dict=clean_record['acquisition_source'], + backup_dict=overriden_fields['acquisition_source'], + new_value=u'5652c7f6190f11e79e8000224dabeaad', + ) + + return clean_record + + def test_cds(cds_oai_server): - process = CrawlerProcess(get_project_settings()) - process.crawl('CDS', from_date='2017-10-10') + f = NamedTemporaryFile('rw') + + settings = get_project_settings() + settings.set('FEED_FORMAT', 'json') + settings.set('FEED_URI', f.name) + + process = CrawlerProcess(settings) + process.crawl('CDS', from_date='2017-11-15', oai_set='forINSPIRE') 
process.start() + + result = json.load(f) + + expected = expected_json_results_from_file( + 'cds', 'fixtures', 'cds_expected.json' + ) + + expected = override_dynamic_fields_on_records(expected) + result = override_dynamic_fields_on_records(result) + + assert result == expected + + f.close() From db2953f3ac515ea10700cff0a87afce3d982f7e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Fri, 8 Dec 2017 13:19:16 +0100 Subject: [PATCH 03/21] parse_record takes the selector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/cds_spider.py | 6 +----- hepcrawl/spiders/oaipmh_spider.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index c11d3d77..0d6e60d0 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -11,8 +11,6 @@ import logging from scrapy import Request -from scrapy.http import XmlResponse -from scrapy.selector import Selector from flask.app import Flask from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire from harvestingkit.bibrecord import ( @@ -52,9 +50,7 @@ def __init__(self, from_date=None, oai_set="forINSPIRE", *args, **kwargs): from_date=from_date, **kwargs) - def parse_record(self, record): - response = XmlResponse(self.url, encoding='utf-8', body=record.raw) - selector = Selector(response, type='xml') + def parse_record(self, selector): selector.remove_namespaces() try: cds_bibrec, ok, errs = create_bibrec(selector.xpath('.//record').extract()[0]) diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index 3bd429e1..295b74d5 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -17,7 +17,8 @@ from sickle.models import Record from sickle.oaiexceptions import NoRecordsMatch -from scrapy.http import Request +from scrapy.http import Request, 
XmlResponse +from scrapy.selector import Selector from scrapy.spiders import Spider logger = logging.getLogger(__name__) @@ -81,8 +82,11 @@ def start_requests(self): def parse_record(self, record): """ This method need to be reimplemented in order to provide special parsing. + + Args: + record (scrapy.selector.Selector): selector on the parsed record """ - return record.xml + raise NotImplementedError() def parse(self, response): sickle = Sickle(self.url, class_mapping={ @@ -100,7 +104,9 @@ def parse(self, response): logger.warning(err) raise StopIteration() for record in records: - yield self.parse_record(record) + response = XmlResponse(self.url, encoding='utf-8', body=record.raw) + selector = Selector(response, type='xml') + yield self.parse_record(selector) def _make_alias(self): return '{url}-{metadata_prefix}-{set}'.format( From 4890aa1496d5d60ec85e5fcd6251dff6a90482c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Fri, 8 Dec 2017 18:06:15 +0100 Subject: [PATCH 04/21] spiders: OAI-PMH: continue where left off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- docker-compose.test.yml | 1 + hepcrawl/settings.py | 6 ++ hepcrawl/spiders/__init__.py | 2 +- hepcrawl/spiders/cds_spider.py | 2 +- hepcrawl/spiders/oaipmh_spider.py | 100 ++++++++++++++++++++++---- tests/functional/cds/test_cds.py | 3 +- tests/unit/test_oaipmh.py | 113 ++++++++++++++++++++++++++++++ 7 files changed, 209 insertions(+), 18 deletions(-) create mode 100644 tests/unit/test_oaipmh.py diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 9a1df2e0..61fdf2a9 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -18,6 +18,7 @@ services: - APP_CRAWLER_HOST_URL=http://scrapyd:6800 - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results - APP_FILES_STORE=/tmp/file_urls + - APP_LAST_RUNS_PATH=/code/.scrapy/last_runs - 
APP_CRAWL_ONCE_PATH=/code/.scrapy - COVERAGE_PROCESS_START=/code/.coveragerc - BASE_USER_UID=${BASE_USER_UID:-1000} diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 025e7186..6c38c1e2 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -42,6 +42,12 @@ 'http://localhost/schemas/records/' ) +# Location of last run information +LAST_RUNS_PATH = os.environ.get( + 'APP_LAST_RUNS_PATH', + '/var/lib/scrapy/last_runs/' +) + # Configure maximum concurrent requests performed by Scrapy (default: 16) # CONCURRENT_REQUESTS=32 diff --git a/hepcrawl/spiders/__init__.py b/hepcrawl/spiders/__init__.py index e4336459..b931594e 100644 --- a/hepcrawl/spiders/__init__.py +++ b/hepcrawl/spiders/__init__.py @@ -15,4 +15,4 @@ class StatefulSpider(Spider): def __init__(self, *args, **kwargs): self.state = {} - return super(Spider, self).__init__(*args, **kwargs) + super(StatefulSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index 0d6e60d0..e9aabb80 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -55,7 +55,7 @@ def parse_record(self, selector): try: cds_bibrec, ok, errs = create_bibrec(selector.xpath('.//record').extract()[0]) if not ok: - raise RuntimeError("Cannot parse record %s: %s", record, errs) + raise RuntimeError("Cannot parse record %s: %s", selector, errs) self.logger.info("Here's the record: %s" % cds_bibrec) inspire_bibrec = CDS2Inspire(cds_bibrec).get_record() marcxml_record = record_xml_output(inspire_bibrec) diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index 295b74d5..b66528a2 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -11,7 +11,12 @@ import logging from enum import Enum +from errno import EEXIST from datetime import datetime +from dateutil import parser as dateparser +import hashlib +import json +from os import path, makedirs from sickle import Sickle from 
sickle.models import Record @@ -19,7 +24,7 @@ from scrapy.http import Request, XmlResponse from scrapy.selector import Selector -from scrapy.spiders import Spider +from . import StatefulSpider logger = logging.getLogger(__name__) @@ -33,22 +38,21 @@ def format(self, datetime_object): return datetime_object.strftime('%Y-%m-%d') if self == self.SECOND: return datetime_object.strftime('%Y-%m-%dT%H:%M:%SZ') - raise ValueError("Invalid granularity: %s" % self.granularity) -class OAIPMHSpider(Spider): +class OAIPMHSpider(StatefulSpider): """ Implements a spider for the OAI-PMH protocol by using the Python sickle library. - In case of successful harvest (OAI-PMH crawling) the spider will remember the initial starting - date and will use it as `from_date` argument on the next harvest. + In case of successful harvest (OAI-PMH crawling) the spider will remember + the initial starting date and will use it as `from_date` argument on the + next harvest. """ name = 'OAI-PMH' - state = {} granularity = _Granularity.DATE def __init__(self, url, metadata_prefix='marcxml', oai_set=None, alias=None, - from_date=None, until_date=None, granularity='', + from_date=None, until_date=None, granularity=_Granularity.DATE, record_class=Record, *args, **kwargs): super(OAIPMHSpider, self).__init__(*args, **kwargs) self.url = url @@ -57,13 +61,13 @@ def __init__(self, url, metadata_prefix='marcxml', oai_set=None, alias=None, self.granularity = granularity self.alias = alias or self._make_alias() self.from_date = from_date - logger.info("Current state:{}".format(self.state)) self.until_date = until_date self.record_class = record_class def start_requests(self): - self.from_date = self.from_date or self.state.get(self.alias) - logger.info("Current state 2:{}".format(self.state)) + self.from_date = self.from_date or self._resume_from + started_at = datetime.utcnow() + logger.info("Starting harvesting of {url} with set={set} and " "metadataPrefix={metadata_prefix}, from={from_date}, " 
"until={until_date}".format( @@ -73,11 +77,15 @@ def start_requests(self): from_date=self.from_date, until_date=self.until_date )) - now = datetime.utcnow() + request = Request('oaipmh+{}'.format(self.url), self.parse) yield request - self.state[self.alias] = self.granularity.format(now) - logger.info("Harvesting completed. Next harvesting will resume from {}".format(self.state[self.alias])) + + now = datetime.utcnow() + self._save_run(started_at) + + logger.info("Harvesting completed. Next harvesting will resume from {}" + .format(self.until_date or self.granularity.format(now))) def parse_record(self, record): """ @@ -109,8 +117,72 @@ def parse(self, response): yield self.parse_record(selector) def _make_alias(self): - return '{url}-{metadata_prefix}-{set}'.format( + return '{url}?metadataPrefix={metadata_prefix}&set={set}'.format( url=self.url, metadata_prefix=self.metadata_prefix, set=self.set ) + + def _last_run_file_path(self): + """Render a path to a file where last run information is stored. 
+ + Returns: + string: path to last runs path + """ + lasts_run_path = self.settings['LAST_RUNS_PATH'] + file_name = hashlib.sha1(self._make_alias()).hexdigest() + '.json' + return path.join(lasts_run_path, self.name, file_name) + + def _load_last_run(self): + """Return stored last run information + + Returns: + Optional[dict]: last run information or None if don't exist + """ + file_path = self._last_run_file_path() + try: + with open(file_path) as f: + last_run = json.load(f) + logger.info('Last run file loaded: {}'.format(repr(last_run))) + return last_run + except IOError: + return None + + def _save_run(self, started_at): + """Store last run information + + Args: + started_at (datetime.datetime) + + Raises: + IOError: if writing the file is unsuccessful + """ + last_run_info = { + 'spider': self.name, + 'url': self.url, + 'metadata_prefix': self.metadata_prefix, + 'set': self.set, + 'granularity': self.granularity.value, + 'from_date': self.from_date, + 'until_date': self.until_date, + 'last_run_started_at': started_at.isoformat(), + 'last_run_finished_at': datetime.utcnow().isoformat(), + } + file_path = self._last_run_file_path() + logger.info("Last run file saved to {}".format(file_path)) + try: + makedirs(path.dirname(file_path)) + except OSError as exc: + if exc.errno == EEXIST: + pass + else: + raise + with open(file_path, 'w') as f: + json.dump(last_run_info, f, indent=4) + + @property + def _resume_from(self): + last_run = self._load_last_run() + resume_at = last_run['until_date'] or last_run['last_run_finished_at'] + date_parsed = dateparser.parse(resume_at) + return self.granularity.format(date_parsed) diff --git a/tests/functional/cds/test_cds.py b/tests/functional/cds/test_cds.py index 02ec2cd4..38f41c2e 100644 --- a/tests/functional/cds/test_cds.py +++ b/tests/functional/cds/test_cds.py @@ -17,7 +17,6 @@ from scrapy.crawler import CrawlerProcess from scrapy.utils.project import get_project_settings from tempfile import NamedTemporaryFile -from 
twisted.internet import reactor from hepcrawl.testlib.fixtures import ( get_test_suite_path, @@ -69,7 +68,7 @@ def _override(field_key, original_dict, backup_dict, new_value): def test_cds(cds_oai_server): - f = NamedTemporaryFile('rw') + f = NamedTemporaryFile('r+') settings = get_project_settings() settings.set('FEED_FORMAT', 'json') diff --git a/tests/unit/test_oaipmh.py b/tests/unit/test_oaipmh.py new file mode 100644 index 00000000..faa173ec --- /dev/null +++ b/tests/unit/test_oaipmh.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +from datetime import datetime +import json +from mock import patch +from os import remove, rmdir +import pytest + +from hepcrawl.spiders.oaipmh_spider import OAIPMHSpider, _Granularity +from scrapy.utils.project import get_project_settings + + +def override_dynamic_fields(run): + if 'last_run_finished_at' in run: + run['last_run_finished_at'] = '2017-12-08T23:55:54.794969' + return run + + +@pytest.fixture(scope='function') +def cleanup(): + yield + remove('/tmp/last_runs/OAI-PMH/2cea86bbc1d329b4273a29dc603fb8c0bb91439c.json') + rmdir('/tmp/last_runs/OAI-PMH') + rmdir('/tmp/last_runs') + + +@pytest.fixture +def settings(): + settings_patch = { + 'LAST_RUNS_PATH': '/tmp/last_runs/' + } + settings = get_project_settings() + with patch.dict(settings, settings_patch): + yield settings + + +@pytest.fixture +def spider(settings): + spider = OAIPMHSpider('http://export.arxiv.org/oai2', settings=settings) + spider.from_date = '2017-12-08' + spider.set = 'physics:hep-th' + spider.metadata_prefix = 'marcxml' + yield spider + + +def test_last_run_file_path(spider): + expected = '/tmp/last_runs/OAI-PMH/2cea86bbc1d329b4273a29dc603fb8c0bb91439c.json' + result = spider._last_run_file_path() + assert expected == result + + 
+def test_store_and_load_last_run(spider, cleanup): + now = datetime.utcnow() + spider._save_run(started_at=now) + + file_path = spider._last_run_file_path() + result = override_dynamic_fields(json.load(open(file_path))) + + expected = override_dynamic_fields({ + 'spider': 'OAI-PMH', + 'url': 'http://export.arxiv.org/oai2', + 'metadata_prefix': 'marcxml', + 'set': 'physics:hep-th', + 'granularity': 'YYYY-MM-DD', + 'from_date': '2017-12-08', + 'until_date': None, + 'last_run_started_at': now.isoformat(), + 'last_run_finished_at': '2017-12-08T13:55:00.000000', + }) + + assert expected == result + + result = override_dynamic_fields(spider._load_last_run()) + + assert expected == result + + +def test_load_inexisting(spider): + last_run = spider._load_last_run() + assert last_run == None + + +@pytest.mark.parametrize('until_date,last_run,expected,granularity', [ + ('2017-12-08T13:54:00.0', '2017-12-08T13:54:00.0', '2017-12-08', _Granularity.DATE), + ('2017-12-08T13:54:00.0', '2017-12-08T13:54:00.0', '2017-12-08T13:54:00Z', _Granularity.SECOND), + ('2017-12-08', '2017-12-08', '2017-12-08', _Granularity.DATE), + ('2017-12-08', '2017-12-08', '2017-12-08T00:00:00Z', _Granularity.SECOND), + (None, '2017-12-10T13:54:00.0', '2017-12-10', _Granularity.DATE), + (None, '2017-12-10', '2017-12-10T00:00:00Z', _Granularity.SECOND), +]) +def test_resume_from(spider, until_date, last_run, expected, granularity, cleanup): + spider.until_date = until_date + spider.granularity = granularity + spider._save_run(started_at=datetime.utcnow()) + + with open(spider._last_run_file_path(), 'r') as f: + run_record = json.load(f) + + run_record['last_run_finished_at'] = last_run + + with open(spider._last_run_file_path(), 'w+') as f: + json.dump(run_record, f) + + result = spider._resume_from + + assert expected == result From 80efc446d4eb08bde814d78250d2bec87e7df0b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Tue, 12 Dec 2017 14:09:41 +0100 Subject: [PATCH 05/21] use 
celerymonitor in CDS tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- docker-compose.test.yml | 22 +- hepcrawl/spiders/cds_spider.py | 11 +- hepcrawl/spiders/oaipmh_spider.py | 10 +- .../cds/fixtures/http_server/conf/proxy.conf | 12 + .../cds/fixtures/http_server/records/cds.xml | 1480 +++++++++++++++++ tests/functional/cds/test_cds.py | 143 +- 6 files changed, 1606 insertions(+), 72 deletions(-) create mode 100644 tests/functional/cds/fixtures/http_server/conf/proxy.conf create mode 100644 tests/functional/cds/fixtures/http_server/records/cds.xml diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 61fdf2a9..e3b90cdf 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -59,8 +59,11 @@ services: functional_cds: <<: *service_base command: py.test -vv tests/functional/cds - links: - - scrapyd + depends_on: + scrapyd: + condition: service_healthy + cds-http-server.local: + condition: service_healthy functional_pos: <<: *service_base @@ -130,6 +133,21 @@ services: - "CMD-SHELL" - "curl https://localhost:443/" + cds-http-server.local: + image: nginx:stable-alpine + volumes: + - ${PWD}/tests/functional/cds/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf + - ${PWD}/tests/functional/cds/fixtures/http_server/records:/etc/nginx/html/ + ports: + - 80:80 + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD-SHELL" + - "curl http://localhost:80/" + rabbitmq: image: rabbitmq healthcheck: diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index e9aabb80..21d1ab15 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -42,13 +42,18 @@ class CDSSpider(OAIPMHSpider): name = 'CDS' - def __init__(self, from_date=None, oai_set="forINSPIRE", *args, **kwargs): + def __init__(self, + oai_endpoint='http://cds.cern.ch/oai2d', + from_date=None, + oai_set="forINSPIRE", + *args, 
**kwargs): super(CDSSpider, self).__init__( - url='http://cds.cern.ch/oai2d', + url=oai_endpoint, metadata_prefix='marcxml', oai_set=oai_set, from_date=from_date, - **kwargs) + **kwargs + ) def parse_record(self, selector): selector.remove_namespaces() diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index b66528a2..30121365 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -51,8 +51,14 @@ class OAIPMHSpider(StatefulSpider): name = 'OAI-PMH' granularity = _Granularity.DATE - def __init__(self, url, metadata_prefix='marcxml', oai_set=None, alias=None, - from_date=None, until_date=None, granularity=_Granularity.DATE, + def __init__(self, + url, + metadata_prefix='oai_dc', + oai_set=None, + alias=None, + from_date=None, + until_date=None, + granularity=_Granularity.DATE, record_class=Record, *args, **kwargs): super(OAIPMHSpider, self).__init__(*args, **kwargs) self.url = url diff --git a/tests/functional/cds/fixtures/http_server/conf/proxy.conf b/tests/functional/cds/fixtures/http_server/conf/proxy.conf new file mode 100644 index 00000000..68d70722 --- /dev/null +++ b/tests/functional/cds/fixtures/http_server/conf/proxy.conf @@ -0,0 +1,12 @@ +server { + listen 80; + server_name localhost; + charset_types text/xml; + charset UTF-8; + + location /oai2d { + if ($args ~ from=2017-11-15&verb=ListRecords&set=forINSPIRE&metadataPrefix=marcxml) { + rewrite ^.*$ /cds.xml permanent; + } + } +} diff --git a/tests/functional/cds/fixtures/http_server/records/cds.xml b/tests/functional/cds/fixtures/http_server/records/cds.xml new file mode 100644 index 00000000..9bec8576 --- /dev/null +++ b/tests/functional/cds/fixtures/http_server/records/cds.xml @@ -0,0 +1,1480 @@ + + + +2017-12-07T15:05:26Zhttp://cds.cern.ch/oai2d +
oai:cds.cern.ch:12007522017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1200752 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200752 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509577 + + + eng + + + Dubus, G + Grenoble Observ. + + + High and very high energy gamma-ray emission from binaries + + + 2009 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + 018 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200752/files/MQW7_018.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 018 + izmir20080901 + + + PUBLIC + + + 002842486CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12007532017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1200753 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200753 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509578 + + + eng + + + Dubois, R + SLAC + + + GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars + + + Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars + Other title + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + No authors + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + for the Fermi LAT Collaboration + + + 019 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200753/files/MQW7_019.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 019 + izmir20080901 + + + PUBLIC + + + 002842487CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12007542017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1200754 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200754 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509579 + + + eng + + + Romero, G E + Villa Elisa, Inst. Argentino Radioastron. + La Plata U. + + + Hadronic models of high-energy radiation from microquasars: recent developments + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + 020 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200754/files/MQW7_020.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 020 + izmir20080901 + + + PUBLIC + + + 002842488CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12032802017-11-16T08:09:52Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203280 + SzGeCERN + 20171116090952.0 + + oai:cds.cern.ch:1203280 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509595 + + + eng + + + Guess, C J + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + Studying matrix elements for the neutrinoless double beta decay of 150Nd via the 150Sm(t,3He)150Pm* and 150Nd(3He,t)150Pm* reactions + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Austin, S M + Michigan State U., NSCL + Michigan State U., JINA + + + Bazin, D + Michigan State U., NSCL + + + Brown, B A + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + Caesar, C + Michigan State U., NSCL + Mainz U. + + + Deaven, J M + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + Herlitzius, C + Michigan State U., NSCL + Mainz U. + + + Hitt, G W + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + Meharchand, R T + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + Perdikakis, G + Michigan State U., NSCL + Michigan State U., JINA + + + Shimbara, Y + Niigata U., Grad. Sch. Sci. Tech. + + + Tur, C + Michigan State U., NSCL + Michigan State U., JINA + + + Zegers, R G T + Michigan State U., NSCL + Michigan U. + Michigan State U., JINA + + + 104 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203280/files/NIC20X_104.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 104 + mackinacisland20080727 + + + PUBLIC + + + 002844587CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12032812017-11-16T08:09:55Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203281 + SzGeCERN + 20171116090955.0 + + oai:cds.cern.ch:1203281 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509596 + + + eng + + + Jachowicz, N + Ghent U. + + + Untangling supernova-neutrino oscillations with beta-beam data + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + McLaughlin, G C + North Carolina State U. + + + Volpe, C + Orsay, IPN + + + 107 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203281/files/NIC20X_107.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 107 + mackinacisland20080727 + + + PUBLIC + + + 002844588CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033612017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203361 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203361 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509597 + + + eng + + + Kawagoe, S + Tokyo U. + + + Neutrino oscillations in non-spherical supernova explosions + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Takiwaki, T + Tokyo U. + + + Kotake, K + Natl. Astron. Observ. of Japan + + + 109 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203361/files/NIC20X_109.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 109 + mackinacisland20080727 + + + PUBLIC + + + 002844668CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033622017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203362 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203362 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509598 + + + eng + + + Nakazato, K + Waseda U. + + + Neutrino Emission from Stellar Collapse including Hadron-Quark Mixed Phase + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Sumiyoshi, K + Numazu Coll. Tech. + + + Yamada, s + Waseda U. + + + 116 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203362/files/NIC20X_116.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 116 + mackinacisland20080727 + + + PUBLIC + + + 002844669CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033632017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203363 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203363 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509599 + + + eng + + + Sumiyoshi, K + Numazu Coll. Tech. + + + Short neutrino burst from failed supernovae as a probe of dense matter with hyperon mixture + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Ishizuka, C + Hokkaido U. + + + Ohnishi, A + Kyoto U., Yukawa Inst., Kyoto + + + Yamada, S + Waseda U. + + + Suzuki, H + Tokyo U. of Sci. + + + 122 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203363/files/NICX_122.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 122 + mackinacisland20080727 + + + PUBLIC + + + 002844670CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033642017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203364 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203364 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509600 + + + eng + + + Suzuki, T + Tokyo U. + + + Neutrino Nucleus Reactions and Nucleosynthesis in Stars + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Yoshida, T + Natl. Astron. Observ. of Japan + + + Chiba, S + JAEA, Ibaraki + + + Honma, M + Aizu U. + + + Higashiyama, K + Chiba Inst. Tech. + + + Umeda, H + Tokyo U. + + + Nomoto, K + Tokyo U. + + + Kajino, T + Tokyo U. + Natl. Astron. Observ. of Japan + + + Otsuka, T + Tokyo U. + + + 123 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203364/files/NICX_123.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 123 + mackinacisland20080727 + + + PUBLIC + + + 002844671CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033652017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203365 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203365 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509601 + + + eng + + + Whitehouse, S + Basel U. + + + Neutrino transport in 3D simulations of core-collapse supernovae + + + A new approach to neutrino transport in 3D simulations of core-collapse supernovae + Other title + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Liebendörfer, M + Basel U. + + + 243 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203365/files/NICX_243.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 243 + mackinacisland20080727 + + + PUBLIC + + + 002844672CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033662017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203366 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203366 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509602 + + + eng + + + Arcones, A + Damstadt, Tech. Hochsch. + Darmstadt, GSI + + + Neutrino-driven winds and nucleosynthesis + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Martínez-Pinedo, G + Darmstadt, GSI + + + Schwenk, A + TRIUMF + + + O’Connor, E + TRIUMF + Caltech + + + Langanke, K + Damstadt, Tech. Hochsch. + Darmstadt, GSI + + + Horowitz, C J + Indiana U. + + + Janka, H T + Garching, Max Planck Inst. + + + 128 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203366/files/NIC20X_128.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 128 + mackinacisland20080727 + + + PUBLIC + + + 002844673CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033672017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203367 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203367 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509603 + + + eng + + + Roberts, L + UC, Santa Cruz, Astron. Astrophys. + + + Nucleosynthesis in the Neutrino Driven Wind of Protoneutron Stars + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Woosley, S + UC, Santa Cruz, Astron. Astrophys. + + + Heger, A + Minnesota U. + + + Hoffman, R + LLNL, Livermore + + + 146 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203367/files/NICX_146.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 146 + mackinacisland20080727 + + + PUBLIC + + + 002844674CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033692017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203369 + SzGeCERN + 20171116090958.0 + + oai:cds.cern.ch:1203369 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509604 + + + eng + + + Kojima, K + Tokyo U. + Natl. Astron. Observ. of Japan + + + Neutrino effect in cosmology with the primordial magnetic field + + + Neutrino effects in cosmology with A primordial magnetic field + Other title + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Ichiki, K + Nagoya U. + + + Kajino, T + Tokyo U. + Natl. Astron. Observ. of Japan + + + Mathews, G J + Notre Dame U. + Natl. Astron. Observ. of Japan + + + 226 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203369/files/NICX_226.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 226 + mackinacisland20080727 + + + PUBLIC + + + 002844676CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12033702017-11-16T08:09:47Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1203370 + SzGeCERN + 20171116090947.0 + + oai:cds.cern.ch:1203370 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509605 + + + eng + + + Yamazaki, D G + Natl. Astron. Observ. of Japan + + + A Strong Constraint on the Neutrino Mass from the Formation of Large Scale Structure in the Presence of the Primordial Magnetic Field + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS NIC X-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + Ichiki, K + Tokyo U. + + + Kajino, T + Natl. Astron. Observ. of Japan + + + Mathews, G J + Notre Dame U. + + + 239 + PoS + NIC X + 2008 + + + http://cds.cern.ch/record/1203370/files/NICX_239.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090827 + + + 1024674 + 239 + mackinacisland20080727 + + + PUBLIC + + + 002844677CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
+
+ diff --git a/tests/functional/cds/test_cds.py b/tests/functional/cds/test_cds.py index 38f41c2e..a864d534 100644 --- a/tests/functional/cds/test_cds.py +++ b/tests/functional/cds/test_cds.py @@ -9,84 +9,97 @@ """Functional tests for CDS spider""" -import pytest -import requests_mock +from __future__ import absolute_import, division, print_function -import copy -import json -from scrapy.crawler import CrawlerProcess -from scrapy.utils.project import get_project_settings -from tempfile import NamedTemporaryFile +import os +import pytest +from hepcrawl.testlib.celery_monitor import CeleryMonitor from hepcrawl.testlib.fixtures import ( get_test_suite_path, expected_json_results_from_file, + clean_dir, ) +from hepcrawl.testlib.tasks import app as celery_app +from hepcrawl.testlib.utils import get_crawler_instance -@pytest.fixture -def cds_oai_server(): - with requests_mock.Mocker() as m: - m.get('http://cds.cern.ch/oai2d?from=2017-11-15&verb=ListRecords&set=forINSPIRE&metadataPrefix=marcxml', - text=open(get_test_suite_path('cds', 'fixtures', 'cds.xml', test_suite='functional')).read()) - yield m - - -def override_dynamic_fields_on_records(records): - clean_records = [] - for record in records: - clean_record = override_dynamic_fields_on_record(record) - clean_records.append(clean_record) - - return clean_records +@pytest.fixture(scope='function', autouse=True) +def cleanup(): + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + yield + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) -def override_dynamic_fields_on_record(record): - def _override(field_key, original_dict, backup_dict, new_value): - backup_dict[field_key] = original_dict[field_key] - original_dict[field_key] = new_value - - clean_record = copy.deepcopy(record) - overriden_fields = {} - dummy_random_date = u'2017-04-03T10:26:40.365216' - - overriden_fields['acquisition_source'] = {} - _override( - field_key='datetime', - 
original_dict=clean_record['acquisition_source'], - backup_dict=overriden_fields['acquisition_source'], - new_value=dummy_random_date, +def override_generated_fields(record): + record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' + record['acquisition_source']['submission_number'] = ( + u'5652c7f6190f11e79e8000224dabeaad' ) - _override( - field_key='submission_number', - original_dict=clean_record['acquisition_source'], - backup_dict=overriden_fields['acquisition_source'], - new_value=u'5652c7f6190f11e79e8000224dabeaad', - ) - - return clean_record - -def test_cds(cds_oai_server): - f = NamedTemporaryFile('r+') - - settings = get_project_settings() - settings.set('FEED_FORMAT', 'json') - settings.set('FEED_URI', f.name) - - process = CrawlerProcess(settings) - process.crawl('CDS', from_date='2017-11-15', oai_set='forINSPIRE') - process.start() - - result = json.load(f) - - expected = expected_json_results_from_file( - 'cds', 'fixtures', 'cds_expected.json' + return record + + +def get_configuration(): + return { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'from_date': '2017-11-15', + 'oai_set': 'forINSPIRE', + 'oai_endpoint': 'http://cds-http-server.local/oai2d', + } + } + + +@pytest.mark.parametrize( + 'expected_results, config', + [ + ( + expected_json_results_from_file( + 'cds', + 'fixtures', + 'cds_expected.json', + ), + get_configuration(), + ), + ], + ids=[ + 'smoke', + ] +) +def test_cds( + expected_results, + config, +): + crawler = get_crawler_instance(config['CRAWLER_HOST_URL']) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + events_limit=2, + crawler_instance=crawler, + project=config['CRAWLER_PROJECT'], + spider='CDS', + settings={}, + **config['CRAWLER_ARGUMENTS'] ) - expected = override_dynamic_fields_on_records(expected) - result = override_dynamic_fields_on_records(result) + gotten_results = 
[override_generated_fields(result) for result in results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] - assert result == expected + gotten_results = sorted( + gotten_results, + key=lambda x: x['document_type'] + ) + expected_results = sorted( + expected_results, + key=lambda x: x['document_type'] + ) - f.close() + assert gotten_results == expected_results From adb19067d81b74ab9d0b265baedb3440a1d115a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Tue, 12 Dec 2017 16:27:14 +0100 Subject: [PATCH 06/21] CDS spider: drop HarvestingKit (#199) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Harvests CDS through dojson directly: closes #199. Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/cds_spider.py | 42 +- .../functional/cds/fixtures/cds_expected.json | 1073 +++++++++++------ 2 files changed, 683 insertions(+), 432 deletions(-) diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index 21d1ab15..ed88cf9a 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -10,13 +10,7 @@ """Spider for the CERN Document Server OAI-PMH interface""" import logging -from scrapy import Request from flask.app import Flask -from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire -from harvestingkit.bibrecord import ( - create_record as create_bibrec, - record_xml_output, -) from dojson.contrib.marc21.utils import create_record from inspire_dojson.hep import hep @@ -34,10 +28,8 @@ class CDSSpider(OAIPMHSpider): $ scrapy crawl CDS \\ -a "oai_set=forINSPIRE" -a "from_date=2017-10-10" - It uses `HarvestingKit `_ to - translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. It then - employs `inspire-dojson `_ to - transform the legacy INSPIRE MARCXML into the new INSPIRE Schema. + It uses `inspire-dojson `_ to + translate from CDS's MARCXML into the new INSPIRE Schema. 
""" name = 'CDS' @@ -57,23 +49,13 @@ def __init__(self, def parse_record(self, selector): selector.remove_namespaces() - try: - cds_bibrec, ok, errs = create_bibrec(selector.xpath('.//record').extract()[0]) - if not ok: - raise RuntimeError("Cannot parse record %s: %s", selector, errs) - self.logger.info("Here's the record: %s" % cds_bibrec) - inspire_bibrec = CDS2Inspire(cds_bibrec).get_record() - marcxml_record = record_xml_output(inspire_bibrec) - record = create_record(marcxml_record) - app = Flask('hepcrawl') - app.config.update( - self.settings.getdict('MARC_TO_HEP_SETTINGS', {}) - ) - with app.app_context(): - json_record = hep.do(record) - base_uri = self.settings['SCHEMA_BASE_URI'] - json_record['$schema'] = base_uri + 'hep.json' - return ParsedItem(record=json_record, record_format='hep') - except Exception: - logger.exception("Error when parsing record") - return None + record = create_record(selector.xpath('.//record').extract()[0]) + app = Flask('hepcrawl') + app.config.update( + self.settings.getdict('MARC_TO_HEP_SETTINGS', {}) + ) + with app.app_context(): + json_record = hep.do(record) + base_uri = self.settings['SCHEMA_BASE_URI'] + json_record['$schema'] = base_uri + 'hep.json' + return ParsedItem(record=json_record, record_format='hep') diff --git a/tests/functional/cds/fixtures/cds_expected.json b/tests/functional/cds/fixtures/cds_expected.json index cfb94ced..d9c4360b 100644 --- a/tests/functional/cds/fixtures/cds_expected.json +++ b/tests/functional/cds/fixtures/cds_expected.json @@ -1,27 +1,41 @@ [ { - "refereed": true, - "core": true, - "preprint_date": "2009", - "documents": [ + "urls": [ { - "url": "http://cds.cern.ch/record/1200752/files/MQW7_018.pdf", - "key": "document" + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1200752/files/MQW7_018.pdf" } ], - "citeable": true, - "_collections": [ - "Literature" + "external_system_identifiers": [ + { + "value": "1509577", + "schema": "Inspire" + }, + { + 
"value": "002842486CER", + "schema": "SPIRES" + } ], + "curated": true, + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } + ], + "control_number": 1200752, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1200752" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:16.980315", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -29,15 +43,15 @@ "title": "High and very high energy gamma-ray emission from binaries" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "MQW7", - "page_start": "018", - "year": 2008, - "artid": "018", - "journal_title": "PoS" + "value": "SIS POS MQW7-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-17", "authors": [ { "affiliations": [ @@ -45,53 +59,72 @@ "value": "Grenoble Observ." } ], - "full_name": "Dubus, G." 
+ "full_name": "Dubus, G" } ], - "external_system_identifiers": [ - { - "value": "1200752", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509577", - "schema": "Inspire" + "journal_volume": "MQW7", + "page_start": "018", + "journal_title": "PoS", + "artid": "018", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], - "curated": true + "imprints": [ + { + "date": "2009" + } + ], + "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ + { + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1200753/files/MQW7_019.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509578", + "schema": "Inspire" + }, { - "url": "http://cds.cern.ch/record/1200753/files/MQW7_019.pdf", - "key": "document" + "value": "002842487CER", + "schema": "SPIRES" } ], - "citeable": true, - "_collections": [ - "Literature" + "curated": true, + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1200753, "collaborations": [ { "value": "Fermi LAT" } ], + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1200753" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.101983", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -102,15 +135,18 @@ "title": "Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "MQW7", - "page_start": "019", - "year": 2008, - "artid": "019", - "journal_title": "PoS" + "value": "SIS POS MQW7-2009" + }, + { + "value": "No authors" + }, + { + "value": "Inspire" } ], + 
"legacy_creation_date": "2009-08-17", "authors": [ { "affiliations": [ @@ -118,48 +154,67 @@ "value": "SLAC" } ], - "full_name": "Dubois, R." + "full_name": "Dubois, R" } ], - "external_system_identifiers": [ - { - "value": "1200753", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509578", - "schema": "Inspire" + "journal_volume": "MQW7", + "page_start": "019", + "journal_title": "PoS", + "artid": "019", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], - "curated": true + "imprints": [ + { + "date": "2008" + } + ], + "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ + { + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1200754/files/MQW7_020.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509579", + "schema": "Inspire" + }, { - "url": "http://cds.cern.ch/record/1200754/files/MQW7_020.pdf", - "key": "document" + "value": "002842488CER", + "schema": "SPIRES" } ], - "citeable": true, - "_collections": [ - "Literature" + "curated": true, + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1200754, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1200754" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.153496", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -167,15 +222,15 @@ "title": "Hadronic models of high-energy radiation from microquasars: recent developments" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "MQW7", - "page_start": "020", - "year": 2008, - "artid": "020", - "journal_title": "PoS" + 
"value": "SIS POS MQW7-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-17", "authors": [ { "affiliations": [ @@ -186,48 +241,67 @@ "value": "La Plata U." } ], - "full_name": "Romero, G. E." + "full_name": "Romero, G E" } ], - "external_system_identifiers": [ - { - "value": "1200754", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509579", - "schema": "Inspire" + "journal_volume": "MQW7", + "page_start": "020", + "journal_title": "PoS", + "artid": "020", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], - "curated": true + "imprints": [ + { + "date": "2008" + } + ], + "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ { - "url": "http://cds.cern.ch/record/1203280/files/NIC20X_104.pdf", - "key": "document" + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203280/files/NIC20X_104.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509595", + "schema": "Inspire" + }, + { + "value": "002844587CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1203280, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203280" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.209845", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -235,15 +309,15 @@ "title": "Studying matrix elements for the neutrinoless double beta decay of 150Nd via the 150Sm(t,3He)150Pm* and 150Nd(3He,t)150Pm* reactions" } ], - "publication_info": [ + "_private_notes": [ { - 
"journal_volume": "NIC X", - "page_start": "104", - "year": 2008, - "artid": "104", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -257,7 +331,7 @@ "value": "Michigan State U., JINA" } ], - "full_name": "Guess, C. J." + "full_name": "Guess, C J" }, { "affiliations": [ @@ -268,7 +342,7 @@ "value": "Michigan State U., JINA" } ], - "full_name": "Austin, S. M." + "full_name": "Austin, S M" }, { "affiliations": [ @@ -276,7 +350,7 @@ "value": "Michigan State U., NSCL" } ], - "full_name": "Bazin, D." + "full_name": "Bazin, D" }, { "affiliations": [ @@ -290,7 +364,7 @@ "value": "Michigan State U., JINA" } ], - "full_name": "Brown, B. A." + "full_name": "Brown, B A" }, { "affiliations": [ @@ -301,7 +375,7 @@ "value": "Mainz U." } ], - "full_name": "Caesar, C." + "full_name": "Caesar, C" }, { "affiliations": [ @@ -315,7 +389,7 @@ "value": "Michigan State U., JINA" } ], - "full_name": "Deaven, J. M." + "full_name": "Deaven, J M" }, { "affiliations": [ @@ -326,7 +400,7 @@ "value": "Mainz U." } ], - "full_name": "Herlitzius, C." + "full_name": "Herlitzius, C" }, { "affiliations": [ @@ -340,7 +414,7 @@ "value": "Michigan State U., JINA" } ], - "full_name": "Hitt, G. W." + "full_name": "Hitt, G W" }, { "affiliations": [ @@ -354,7 +428,7 @@ "value": "Michigan State U., JINA" } ], - "full_name": "Meharchand, R. T." + "full_name": "Meharchand, R T" }, { "affiliations": [ @@ -365,7 +439,7 @@ "value": "Michigan State U., JINA" } ], - "full_name": "Perdikakis, G." + "full_name": "Perdikakis, G" }, { "affiliations": [ @@ -373,7 +447,7 @@ "value": "Niigata U., Grad. Sch. Sci. Tech." } ], - "full_name": "Shimbara, Y." + "full_name": "Shimbara, Y" }, { "affiliations": [ @@ -384,7 +458,7 @@ "value": "Michigan State U., JINA" } ], - "full_name": "Tur, C." 
+ "full_name": "Tur, C" }, { "affiliations": [ @@ -398,48 +472,67 @@ "value": "Michigan State U., JINA" } ], - "full_name": "Zegers, R. G. T." + "full_name": "Zegers, R G T" } ], - "external_system_identifiers": [ - { - "value": "1203280", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509595", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "104", + "journal_title": "PoS", + "artid": "104", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ + { + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203281/files/NIC20X_107.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509596", + "schema": "Inspire" + }, { - "url": "http://cds.cern.ch/record/1203281/files/NIC20X_107.pdf", - "key": "document" + "value": "002844588CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1203281, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203281" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.271385", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -447,15 +540,15 @@ "title": "Untangling supernova-neutrino oscillations with beta-beam data" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "NIC X", - "page_start": "107", - "year": 2008, - "artid": "107", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, + { + "value": 
"Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -463,7 +556,7 @@ "value": "Ghent U." } ], - "full_name": "Jachowicz, N." + "full_name": "Jachowicz, N" }, { "affiliations": [ @@ -471,7 +564,7 @@ "value": "North Carolina State U." } ], - "full_name": "McLaughlin, G. C." + "full_name": "McLaughlin, G C" }, { "affiliations": [ @@ -479,48 +572,67 @@ "value": "Orsay, IPN" } ], - "full_name": "Volpe, C." + "full_name": "Volpe, C" } ], - "external_system_identifiers": [ - { - "value": "1203281", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509596", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "107", + "journal_title": "PoS", + "artid": "107", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ { - "url": "http://cds.cern.ch/record/1203361/files/NIC20X_109.pdf", - "key": "document" + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203361/files/NIC20X_109.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509597", + "schema": "Inspire" + }, + { + "value": "002844668CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1203361, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203361" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.324204", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -528,15 +640,15 
@@ "title": "Neutrino oscillations in non-spherical supernova explosions" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "NIC X", - "page_start": "109", - "year": 2008, - "artid": "109", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -544,7 +656,7 @@ "value": "Tokyo U." } ], - "full_name": "Kawagoe, S." + "full_name": "Kawagoe, S" }, { "affiliations": [ @@ -552,7 +664,7 @@ "value": "Tokyo U." } ], - "full_name": "Takiwaki, T." + "full_name": "Takiwaki, T" }, { "affiliations": [ @@ -560,48 +672,67 @@ "value": "Natl. Astron. Observ. of Japan" } ], - "full_name": "Kotake, K." + "full_name": "Kotake, K" } ], - "external_system_identifiers": [ - { - "value": "1203361", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509597", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "109", + "journal_title": "PoS", + "artid": "109", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ { - "url": "http://cds.cern.ch/record/1203362/files/NIC20X_116.pdf", - "key": "document" + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203362/files/NIC20X_116.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509598", + "schema": "Inspire" + }, + { + "value": "002844669CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1203362, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203362" + }, "acquisition_source": { "source": "CDS", - "datetime": 
"2017-12-07T15:54:17.378354", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -609,15 +740,15 @@ "title": "Neutrino Emission from Stellar Collapse including Hadron-Quark Mixed Phase" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "NIC X", - "page_start": "116", - "year": 2008, - "artid": "116", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -625,7 +756,7 @@ "value": "Waseda U." } ], - "full_name": "Nakazato, K." + "full_name": "Nakazato, K" }, { "affiliations": [ @@ -633,7 +764,7 @@ "value": "Numazu Coll. Tech." } ], - "full_name": "Sumiyoshi, K." + "full_name": "Sumiyoshi, K" }, { "affiliations": [ @@ -641,48 +772,67 @@ "value": "Waseda U." } ], - "full_name": "Yamada, s." 
+ "full_name": "Yamada, s" } ], - "external_system_identifiers": [ - { - "value": "1203362", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509598", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "116", + "journal_title": "PoS", + "artid": "116", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ { - "url": "http://cds.cern.ch/record/1203363/files/NICX_122.pdf", - "key": "document" + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203363/files/NICX_122.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509599", + "schema": "Inspire" + }, + { + "value": "002844670CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1203363, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203363" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.431432", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -690,15 +840,15 @@ "title": "Short neutrino burst from failed supernovae as a probe of dense matter with hyperon mixture" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "NIC X", - "page_start": "122", - "year": 2008, - "artid": "122", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -706,7 +856,7 @@ "value": "Numazu 
Coll. Tech." } ], - "full_name": "Sumiyoshi, K." + "full_name": "Sumiyoshi, K" }, { "affiliations": [ @@ -714,7 +864,7 @@ "value": "Hokkaido U." } ], - "full_name": "Ishizuka, C." + "full_name": "Ishizuka, C" }, { "affiliations": [ @@ -722,7 +872,7 @@ "value": "Kyoto U., Yukawa Inst., Kyoto" } ], - "full_name": "Ohnishi, A." + "full_name": "Ohnishi, A" }, { "affiliations": [ @@ -730,7 +880,7 @@ "value": "Waseda U." } ], - "full_name": "Yamada, S." + "full_name": "Yamada, S" }, { "affiliations": [ @@ -738,48 +888,67 @@ "value": "Tokyo U. of Sci." } ], - "full_name": "Suzuki, H." + "full_name": "Suzuki, H" } ], - "external_system_identifiers": [ - { - "value": "1203363", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509599", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "122", + "journal_title": "PoS", + "artid": "122", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ + { + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203364/files/NICX_123.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509600", + "schema": "Inspire" + }, { - "url": "http://cds.cern.ch/record/1203364/files/NICX_123.pdf", - "key": "document" + "value": "002844671CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1203364, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203364" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.486814", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + 
"datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -787,15 +956,15 @@ "title": "Neutrino Nucleus Reactions and Nucleosynthesis in Stars" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "NIC X", - "page_start": "123", - "year": 2008, - "artid": "123", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -803,7 +972,7 @@ "value": "Tokyo U." } ], - "full_name": "Suzuki, T." + "full_name": "Suzuki, T" }, { "affiliations": [ @@ -811,7 +980,7 @@ "value": "Natl. Astron. Observ. of Japan" } ], - "full_name": "Yoshida, T." + "full_name": "Yoshida, T" }, { "affiliations": [ @@ -819,7 +988,7 @@ "value": "JAEA, Ibaraki" } ], - "full_name": "Chiba, S." + "full_name": "Chiba, S" }, { "affiliations": [ @@ -827,7 +996,7 @@ "value": "Aizu U." } ], - "full_name": "Honma, M." + "full_name": "Honma, M" }, { "affiliations": [ @@ -835,7 +1004,7 @@ "value": "Chiba Inst. Tech." } ], - "full_name": "Higashiyama, K." + "full_name": "Higashiyama, K" }, { "affiliations": [ @@ -843,7 +1012,7 @@ "value": "Tokyo U." } ], - "full_name": "Umeda, H." + "full_name": "Umeda, H" }, { "affiliations": [ @@ -851,7 +1020,7 @@ "value": "Tokyo U." } ], - "full_name": "Nomoto, K." + "full_name": "Nomoto, K" }, { "affiliations": [ @@ -862,7 +1031,7 @@ "value": "Natl. Astron. Observ. of Japan" } ], - "full_name": "Kajino, T." + "full_name": "Kajino, T" }, { "affiliations": [ @@ -870,48 +1039,67 @@ "value": "Tokyo U." } ], - "full_name": "Otsuka, T." 
+ "full_name": "Otsuka, T" } ], - "external_system_identifiers": [ - { - "value": "1203364", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509600", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "123", + "journal_title": "PoS", + "artid": "123", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ + { + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203365/files/NICX_243.pdf" + } + ], + "external_system_identifiers": [ { - "url": "http://cds.cern.ch/record/1203365/files/NICX_243.pdf", - "key": "document" + "value": "1509601", + "schema": "Inspire" + }, + { + "value": "002844672CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1203365, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203365" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.541467", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -922,15 +1110,15 @@ "title": "A new approach to neutrino transport in 3D simulations of core-collapse supernovae" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "NIC X", - "page_start": "243", - "year": 2008, - "artid": "243", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -938,7 +1126,7 @@ "value": "Basel U." 
} ], - "full_name": "Whitehouse, S." + "full_name": "Whitehouse, S" }, { "affiliations": [ @@ -946,48 +1134,67 @@ "value": "Basel U." } ], - "full_name": "Liebendörfer, M." + "full_name": "Liebend\u00f6rfer, M" } ], - "external_system_identifiers": [ - { - "value": "1203365", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509601", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "243", + "journal_title": "PoS", + "artid": "243", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ { - "url": "http://cds.cern.ch/record/1203366/files/NIC20X_128.pdf", - "key": "document" + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203366/files/NIC20X_128.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509602", + "schema": "Inspire" + }, + { + "value": "002844673CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1203366, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203366" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.595372", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -995,15 +1202,15 @@ "title": "Neutrino-driven winds and nucleosynthesis" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "NIC X", - "page_start": "128", - "year": 2008, - "artid": "128", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, 
+ { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -1014,7 +1221,7 @@ "value": "Darmstadt, GSI" } ], - "full_name": "Arcones, A." + "full_name": "Arcones, A" }, { "affiliations": [ @@ -1022,7 +1229,7 @@ "value": "Darmstadt, GSI" } ], - "full_name": "Martínez-Pinedo, G." + "full_name": "Mart\u00ednez-Pinedo, G" }, { "affiliations": [ @@ -1030,7 +1237,7 @@ "value": "TRIUMF" } ], - "full_name": "Schwenk, A." + "full_name": "Schwenk, A" }, { "affiliations": [ @@ -1041,7 +1248,7 @@ "value": "Caltech" } ], - "full_name": "O’Connor, E." + "full_name": "O\u2019Connor, E" }, { "affiliations": [ @@ -1052,7 +1259,7 @@ "value": "Darmstadt, GSI" } ], - "full_name": "Langanke, K." + "full_name": "Langanke, K" }, { "affiliations": [ @@ -1060,7 +1267,7 @@ "value": "Indiana U." } ], - "full_name": "Horowitz, C. J." + "full_name": "Horowitz, C J" }, { "affiliations": [ @@ -1068,48 +1275,67 @@ "value": "Garching, Max Planck Inst." } ], - "full_name": "Janka, H. T." 
+ "full_name": "Janka, H T" } ], - "external_system_identifiers": [ - { - "value": "1203366", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509602", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "128", + "journal_title": "PoS", + "artid": "128", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ { - "url": "http://cds.cern.ch/record/1203367/files/NICX_146.pdf", - "key": "document" + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203367/files/NICX_146.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509603", + "schema": "Inspire" + }, + { + "value": "002844674CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1203367, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203367" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.650283", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -1117,15 +1343,15 @@ "title": "Nucleosynthesis in the Neutrino Driven Wind of Protoneutron Stars" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "NIC X", - "page_start": "146", - "year": 2008, - "artid": "146", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -1133,7 +1359,7 @@ "value": "UC, Santa Cruz, Astron. 
Astrophys." } ], - "full_name": "Roberts, L." + "full_name": "Roberts, L" }, { "affiliations": [ @@ -1141,7 +1367,7 @@ "value": "UC, Santa Cruz, Astron. Astrophys." } ], - "full_name": "Woosley, S." + "full_name": "Woosley, S" }, { "affiliations": [ @@ -1149,7 +1375,7 @@ "value": "Minnesota U." } ], - "full_name": "Heger, A." + "full_name": "Heger, A" }, { "affiliations": [ @@ -1157,48 +1383,67 @@ "value": "LLNL, Livermore" } ], - "full_name": "Hoffman, R." + "full_name": "Hoffman, R" } ], - "external_system_identifiers": [ - { - "value": "1203367", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509603", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "146", + "journal_title": "PoS", + "artid": "146", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ { - "url": "http://cds.cern.ch/record/1203369/files/NICX_226.pdf", - "key": "document" + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203369/files/NICX_226.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509604", + "schema": "Inspire" + }, + { + "value": "002844676CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": "CC-BY-NC-SA-3.0" + } ], + "control_number": 1203369, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203369" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.704338", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ 
-1209,15 +1454,15 @@ "title": "Neutrino effects in cosmology with A primordial magnetic field" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "NIC X", - "page_start": "226", - "year": 2008, - "artid": "226", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -1228,7 +1473,7 @@ "value": "Natl. Astron. Observ. of Japan" } ], - "full_name": "Kojima, K." + "full_name": "Kojima, K" }, { "affiliations": [ @@ -1236,7 +1481,7 @@ "value": "Nagoya U." } ], - "full_name": "Ichiki, K." + "full_name": "Ichiki, K" }, { "affiliations": [ @@ -1247,7 +1492,7 @@ "value": "Natl. Astron. Observ. of Japan" } ], - "full_name": "Kajino, T." + "full_name": "Kajino, T" }, { "affiliations": [ @@ -1258,48 +1503,67 @@ "value": "Natl. Astron. Observ. of Japan" } ], - "full_name": "Mathews, G. J." + "full_name": "Mathews, G J" } ], - "external_system_identifiers": [ - { - "value": "1203369", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509604", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "226", + "journal_title": "PoS", + "artid": "226", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true }, { - "refereed": true, - "core": true, - "preprint_date": "2008", - "documents": [ + "urls": [ { - "url": "http://cds.cern.ch/record/1203370/files/NICX_239.pdf", - "key": "document" + "description": "Published version from PoS", + "value": "http://cds.cern.ch/record/1203370/files/NICX_239.pdf" + } + ], + "external_system_identifiers": [ + { + "value": "1509605", + "schema": "Inspire" + }, + { + "value": "002844677CER", + "schema": "SPIRES" } ], "curated": true, - "_collections": [ - "Literature" + "license": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", + "license": 
"CC-BY-NC-SA-3.0" + } ], + "control_number": 1203370, + "self": { + "$ref": "https://labs.inspirehep.net/api/literature/1203370" + }, "acquisition_source": { "source": "CDS", - "datetime": "2017-12-07T15:54:17.758624", "method": "hepcrawl", - "submission_number": "None" + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" }, "inspire_categories": [ { - "term": "Astrophysics" + "term": "Other" } ], "titles": [ @@ -1307,15 +1571,15 @@ "title": "A Strong Constraint on the Neutrino Mass from the Formation of Large Scale Structure in the Presence of the Primordial Magnetic Field" } ], - "publication_info": [ + "_private_notes": [ { - "journal_volume": "NIC X", - "page_start": "239", - "year": 2008, - "artid": "239", - "journal_title": "PoS" + "value": "SIS POS NIC X-2009" + }, + { + "value": "Inspire" } ], + "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -1323,7 +1587,7 @@ "value": "Natl. Astron. Observ. of Japan" } ], - "full_name": "Yamazaki, D. G." + "full_name": "Yamazaki, D G" }, { "affiliations": [ @@ -1331,7 +1595,7 @@ "value": "Tokyo U." } ], - "full_name": "Ichiki, K." + "full_name": "Ichiki, K" }, { "affiliations": [ @@ -1339,7 +1603,7 @@ "value": "Natl. Astron. Observ. of Japan" } ], - "full_name": "Kajino, T." + "full_name": "Kajino, T" }, { "affiliations": [ @@ -1347,23 +1611,28 @@ "value": "Notre Dame U." } ], - "full_name": "Mathews, G. J." 
+ "full_name": "Mathews, G J" } ], - "external_system_identifiers": [ - { - "value": "1203370", - "schema": "CDS" - }, + "publication_info": [ { - "value": "1509605", - "schema": "Inspire" + "journal_volume": "NIC X", + "page_start": "239", + "journal_title": "PoS", + "artid": "239", + "year": 2008 } ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ + "article", "conference paper" ], + "imprints": [ + { + "date": "2008" + } + ], "citeable": true } ] From fff7c95552aa6dc3aac7fc62650b0ea2c91ea7fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Tue, 12 Dec 2017 17:10:51 +0100 Subject: [PATCH 07/21] remove unused import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- tests/functional/cds/test_cds.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/functional/cds/test_cds.py b/tests/functional/cds/test_cds.py index a864d534..32e4e23d 100644 --- a/tests/functional/cds/test_cds.py +++ b/tests/functional/cds/test_cds.py @@ -16,7 +16,6 @@ from hepcrawl.testlib.celery_monitor import CeleryMonitor from hepcrawl.testlib.fixtures import ( - get_test_suite_path, expected_json_results_from_file, clean_dir, ) From 4895b0798ba64bd0708c6f28182a96a9e18521ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 13 Dec 2017 10:36:34 +0100 Subject: [PATCH 08/21] fix failure on lack of last runs file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes a bug that would raise a TypeError when no last runs would have been present. 
Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/oaipmh_spider.py | 2 ++ tests/unit/test_oaipmh.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index 30121365..ce948a27 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -189,6 +189,8 @@ def _save_run(self, started_at): @property def _resume_from(self): last_run = self._load_last_run() + if not last_run: + return None resume_at = last_run['until_date'] or last_run['last_run_finished_at'] date_parsed = dateparser.parse(resume_at) return self.granularity.format(date_parsed) diff --git a/tests/unit/test_oaipmh.py b/tests/unit/test_oaipmh.py index faa173ec..a899aed0 100644 --- a/tests/unit/test_oaipmh.py +++ b/tests/unit/test_oaipmh.py @@ -82,11 +82,16 @@ def test_store_and_load_last_run(spider, cleanup): assert expected == result -def test_load_inexisting(spider): +def test_load_nonexistent(spider): last_run = spider._load_last_run() assert last_run == None +def test_resume_from_nonexistent_no_error(spider): + resume_from = spider._resume_from + assert resume_from == None + + @pytest.mark.parametrize('until_date,last_run,expected,granularity', [ ('2017-12-08T13:54:00.0', '2017-12-08T13:54:00.0', '2017-12-08', _Granularity.DATE), ('2017-12-08T13:54:00.0', '2017-12-08T13:54:00.0', '2017-12-08T13:54:00Z', _Granularity.SECOND), From b7c3fc4a08bc43a53bf9d55c3812133e0ae4fc7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 13 Dec 2017 16:40:52 +0100 Subject: [PATCH 09/21] remove ignoring the exception on item validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the docs the spider will only be closed after CLOSESPIDER_ERRORCOUNT number of exceptions, which by default allows infinitely many. Thus this is not needed, and it's better if we know if there are validation issues.
Signed-off-by: Szymon Łopaciuk --- hepcrawl/pipelines.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 100cda5d..b30ff6c7 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -116,11 +116,8 @@ def process_item(self, item, spider): hep_record = self._post_enhance_item(item, spider) - try: - validate(hep_record, 'hep') - spider.logger.debug('Validated item by Inspire Schemas.') - except Exception as err: - spider.logger.error('ERROR in validating {}: {}'.format(hep_record, err)) + validate(hep_record, 'hep') + spider.logger.debug('Validated item by Inspire Schemas.') self.results_data.append(hep_record) From bb5c83490d8c41369cd14addf50e23c261540e11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 13 Dec 2017 16:50:25 +0100 Subject: [PATCH 10/21] style fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/cds_spider.py | 6 ++++-- hepcrawl/spiders/oaipmh_spider.py | 36 +++++++++++++++++-------------- tests/functional/cds/test_cds.py | 2 +- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index ed88cf9a..dd0ef0aa 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -17,7 +17,9 @@ from .oaipmh_spider import OAIPMHSpider from ..utils import ParsedItem -logger = logging.getLogger(__name__) + +LOGGER = logging.getLogger(__name__) + class CDSSpider(OAIPMHSpider): """Spider for crawling the CERN Document Server OAI-PMH XML files. 
@@ -49,7 +51,7 @@ def __init__(self, def parse_record(self, selector): selector.remove_namespaces() - record = create_record(selector.xpath('.//record').extract()[0]) + record = selector.xpath('.//record').extract_first() app = Flask('hepcrawl') app.config.update( self.settings.getdict('MARC_TO_HEP_SETTINGS', {}) diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index ce948a27..9a99e6b4 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of hepcrawl. -# Copyright (C) 2015, 2016, 2017 CERN. +# Copyright (C) 2017 CERN. # # hepcrawl is a free software; you can redistribute it and/or modify it # under the terms of the Revised BSD License; see LICENSE file for @@ -26,7 +26,8 @@ from scrapy.selector import Selector from . import StatefulSpider -logger = logging.getLogger(__name__) + +LOGGER = logging.getLogger(__name__) class _Granularity(Enum): @@ -51,15 +52,18 @@ class OAIPMHSpider(StatefulSpider): name = 'OAI-PMH' granularity = _Granularity.DATE - def __init__(self, - url, - metadata_prefix='oai_dc', - oai_set=None, - alias=None, - from_date=None, - until_date=None, - granularity=_Granularity.DATE, - record_class=Record, *args, **kwargs): + def __init__( + self, + url, + metadata_prefix='oai_dc', + oai_set=None, + alias=None, + from_date=None, + until_date=None, + granularity=_Granularity.DATE, + record_class=Record, + *args, **kwargs + ): super(OAIPMHSpider, self).__init__(*args, **kwargs) self.url = url self.metadata_prefix = metadata_prefix @@ -74,7 +78,7 @@ def start_requests(self): self.from_date = self.from_date or self._resume_from started_at = datetime.utcnow() - logger.info("Starting harvesting of {url} with set={set} and " + LOGGER.info("Starting harvesting of {url} with set={set} and " "metadataPrefix={metadata_prefix}, from={from_date}, " "until={until_date}".format( url=self.url, @@ -90,7 +94,7 @@ def start_requests(self): now 
= datetime.utcnow() self._save_run(started_at) - logger.info("Harvesting completed. Next harvesting will resume from {}" + LOGGER.info("Harvesting completed. Next harvesting will resume from {}" .format(self.until_date or self.granularity.format(now))) def parse_record(self, record): @@ -115,7 +119,7 @@ def parse(self, response): 'until': self.until_date, }) except NoRecordsMatch as err: - logger.warning(err) + LOGGER.warning(err) raise StopIteration() for record in records: response = XmlResponse(self.url, encoding='utf-8', body=record.raw) @@ -149,7 +153,7 @@ def _load_last_run(self): try: with open(file_path) as f: last_run = json.load(f) - logger.info('Last run file loaded: {}'.format(repr(last_run))) + LOGGER.info('Last run file loaded: {}'.format(repr(last_run))) return last_run except IOError: return None @@ -175,7 +179,7 @@ def _save_run(self, started_at): 'last_run_finished_at': datetime.utcnow().isoformat(), } file_path = self._last_run_file_path() - logger.info("Last run file saved to {}".format(file_path)) + LOGGER.info("Last run file saved to {}".format(file_path)) try: makedirs(path.dirname(file_path)) except OSError as exc: diff --git a/tests/functional/cds/test_cds.py b/tests/functional/cds/test_cds.py index 32e4e23d..ae314225 100644 --- a/tests/functional/cds/test_cds.py +++ b/tests/functional/cds/test_cds.py @@ -79,7 +79,7 @@ def test_cds( app=celery_app, monitor_timeout=5, monitor_iter_limit=100, - events_limit=2, + events_limit=1, crawler_instance=crawler, project=config['CRAWLER_PROJECT'], spider='CDS', From acf9125b288228d1e26cf3eeea0ac91090e42463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 14 Dec 2017 09:16:37 +0100 Subject: [PATCH 11/21] bump inspire-dojson~=57.0,>=57.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/cds_spider.py | 4 +- hepcrawl/spiders/oaipmh_spider.py | 1 - 
.../functional/cds/fixtures/cds_expected.json | 768 +++++++----------- 3 files changed, 274 insertions(+), 499 deletions(-) diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index dd0ef0aa..f5359536 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -12,7 +12,7 @@ import logging from flask.app import Flask from dojson.contrib.marc21.utils import create_record -from inspire_dojson.hep import hep +from inspire_dojson import marcxml2record from .oaipmh_spider import OAIPMHSpider from ..utils import ParsedItem @@ -57,7 +57,7 @@ def parse_record(self, selector): self.settings.getdict('MARC_TO_HEP_SETTINGS', {}) ) with app.app_context(): - json_record = hep.do(record) + json_record = marcxml2record(record) base_uri = self.settings['SCHEMA_BASE_URI'] json_record['$schema'] = base_uri + 'hep.json' return ParsedItem(record=json_record, record_format='hep') diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index 9a99e6b4..0f3adf6d 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -19,7 +19,6 @@ from os import path, makedirs from sickle import Sickle -from sickle.models import Record from sickle.oaiexceptions import NoRecordsMatch from scrapy.http import Request, XmlResponse diff --git a/tests/functional/cds/fixtures/cds_expected.json b/tests/functional/cds/fixtures/cds_expected.json index d9c4360b..a4cf0cb5 100644 --- a/tests/functional/cds/fixtures/cds_expected.json +++ b/tests/functional/cds/fixtures/cds_expected.json @@ -1,57 +1,36 @@ [ { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1200752/files/MQW7_018.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1200752/files/MQW7_018.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509577", - "schema": "Inspire" - }, - { - "value": "002842486CER", - "schema": "SPIRES" + "key": 
"MQW7_018.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1200752, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1200752" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "High and very high energy gamma-ray emission from binaries" } ], "_private_notes": [ { - "value": "SIS POS MQW7-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1200752" } ], - "legacy_creation_date": "2009-08-17", "authors": [ { "affiliations": [ @@ -73,80 +52,62 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2009" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:03.875113" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1200753/files/MQW7_019.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1200753/files/MQW7_019.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509578", - "schema": "Inspire" - }, - { - "value": "002842487CER", - "schema": "SPIRES" + "key": "MQW7_019.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1200753, "collaborations": [ { "value": "Fermi LAT" } ], - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1200753" - }, - 
"acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars" }, { + "source": "CDS", "title": "Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars" } ], "_private_notes": [ { - "value": "SIS POS MQW7-2009" - }, - { - "value": "No authors" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1200753" } ], - "legacy_creation_date": "2009-08-17", "authors": [ { "affiliations": [ @@ -168,69 +129,53 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:03.951904" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1200754/files/MQW7_020.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1200754/files/MQW7_020.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509579", - "schema": "Inspire" - }, - { - "value": "002842488CER", - "schema": "SPIRES" + "key": "MQW7_020.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1200754, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1200754" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, 
"inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Hadronic models of high-energy radiation from microquasars: recent developments" } ], "_private_notes": [ { - "value": "SIS POS MQW7-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1200754" } ], - "legacy_creation_date": "2009-08-17", "authors": [ { "affiliations": [ @@ -255,69 +200,53 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:03.984541" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203280/files/NIC20X_104.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1203280/files/NIC20X_104.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509595", - "schema": "Inspire" - }, - { - "value": "002844587CER", - "schema": "SPIRES" + "key": "NIC20X_104.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203280, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203280" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Studying matrix elements for the neutrinoless double beta decay of 150Nd via the 150Sm(t,3He)150Pm* and 150Nd(3He,t)150Pm* reactions" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" 
- }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203280" } ], - "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -486,69 +415,53 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.019463" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203281/files/NIC20X_107.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1203281/files/NIC20X_107.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509596", - "schema": "Inspire" - }, - { - "value": "002844588CER", - "schema": "SPIRES" + "key": "NIC20X_107.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203281, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203281" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Untangling supernova-neutrino oscillations with beta-beam data" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203281" } ], - "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -586,69 +499,53 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - 
"citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.058926" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203361/files/NIC20X_109.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1203361/files/NIC20X_109.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509597", - "schema": "Inspire" - }, - { - "value": "002844668CER", - "schema": "SPIRES" + "key": "NIC20X_109.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203361, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203361" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Neutrino oscillations in non-spherical supernova explosions" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203361" } ], - "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -686,69 +583,53 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.091842" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203362/files/NIC20X_116.pdf", + "source": "CDS", "description": "Published version from PoS", 
- "value": "http://cds.cern.ch/record/1203362/files/NIC20X_116.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509598", - "schema": "Inspire" - }, - { - "value": "002844669CER", - "schema": "SPIRES" + "key": "NIC20X_116.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203362, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203362" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Neutrino Emission from Stellar Collapse including Hadron-Quark Mixed Phase" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203362" } ], - "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -786,69 +667,53 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.125345" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203363/files/NICX_122.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1203363/files/NICX_122.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509599", - "schema": "Inspire" - }, - { - "value": "002844670CER", - "schema": "SPIRES" + "key": "NICX_122.pdf" } ], "curated": true, - "license": [ - { - "url": 
"http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203363, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203363" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Short neutrino burst from failed supernovae as a probe of dense matter with hyperon mixture" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203363" } ], - "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -902,69 +767,53 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.158736" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203364/files/NICX_123.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1203364/files/NICX_123.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509600", - "schema": "Inspire" - }, - { - "value": "002844671CER", - "schema": "SPIRES" + "key": "NICX_123.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203364, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203364" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - 
"submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Neutrino Nucleus Reactions and Nucleosynthesis in Stars" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203364" } ], - "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -1053,72 +902,57 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.193230" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203365/files/NICX_243.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1203365/files/NICX_243.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509601", - "schema": "Inspire" - }, - { - "value": "002844672CER", - "schema": "SPIRES" + "key": "NICX_243.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203365, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203365" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Neutrino transport in 3D simulations of core-collapse supernovae" }, { + "source": "CDS", "title": "A new approach 
to neutrino transport in 3D simulations of core-collapse supernovae" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203365" } ], - "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -1134,7 +968,7 @@ "value": "Basel U." } ], - "full_name": "Liebend\u00f6rfer, M" + "full_name": "Liebendörfer, M" } ], "publication_info": [ @@ -1148,69 +982,53 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.228093" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203366/files/NIC20X_128.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1203366/files/NIC20X_128.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509602", - "schema": "Inspire" - }, - { - "value": "002844673CER", - "schema": "SPIRES" + "key": "NIC20X_128.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203366, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203366" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Neutrino-driven winds and nucleosynthesis" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203366" } ], - 
"legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -1229,7 +1047,7 @@ "value": "Darmstadt, GSI" } ], - "full_name": "Mart\u00ednez-Pinedo, G" + "full_name": "Martínez-Pinedo, G" }, { "affiliations": [ @@ -1248,7 +1066,7 @@ "value": "Caltech" } ], - "full_name": "O\u2019Connor, E" + "full_name": "O’Connor, E" }, { "affiliations": [ @@ -1289,69 +1107,53 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.261882" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203367/files/NICX_146.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1203367/files/NICX_146.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509603", - "schema": "Inspire" - }, - { - "value": "002844674CER", - "schema": "SPIRES" + "key": "NICX_146.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203367, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203367" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Nucleosynthesis in the Neutrino Driven Wind of Protoneutron Stars" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203367" } ], - "legacy_creation_date": "2009-08-27", "authors": [ { 
"affiliations": [ @@ -1397,72 +1199,57 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.296986" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203369/files/NICX_226.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1203369/files/NICX_226.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509604", - "schema": "Inspire" - }, - { - "value": "002844676CER", - "schema": "SPIRES" + "key": "NICX_226.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203369, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203369" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "Neutrino effect in cosmology with the primordial magnetic field" }, { + "source": "CDS", "title": "Neutrino effects in cosmology with A primordial magnetic field" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203369" } ], - "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -1517,69 +1304,53 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + 
"acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.330649" + } }, { - "urls": [ + "core": true, + "documents": [ { + "url": "http://cds.cern.ch/record/1203370/files/NICX_239.pdf", + "source": "CDS", "description": "Published version from PoS", - "value": "http://cds.cern.ch/record/1203370/files/NICX_239.pdf" - } - ], - "external_system_identifiers": [ - { - "value": "1509605", - "schema": "Inspire" - }, - { - "value": "002844677CER", - "schema": "SPIRES" + "key": "NICX_239.pdf" } ], "curated": true, - "license": [ - { - "url": "http://creativecommons.org/licenses/by-nc-sa/3.0/", - "license": "CC-BY-NC-SA-3.0" - } + "_collections": [ + "Literature" ], - "control_number": 1203370, - "self": { - "$ref": "https://labs.inspirehep.net/api/literature/1203370" - }, - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, "inspire_categories": [ { - "term": "Other" + "source": "cds", + "term": "Astrophysics" } ], "titles": [ { + "source": "CDS", "title": "A Strong Constraint on the Neutrino Mass from the Formation of Large Scale Structure in the Presence of the Primordial Magnetic Field" } ], "_private_notes": [ { - "value": "SIS POS NIC X-2009" - }, - { - "value": "Inspire" + "source": "CDS", + "value": "CDS-1203370" } ], - "legacy_creation_date": "2009-08-27", "authors": [ { "affiliations": [ @@ -1625,14 +1396,19 @@ ], "$schema": "http://localhost/schemas/records/hep.json", "document_type": [ - "article", "conference paper" ], + "citeable": true, "imprints": [ { "date": "2008" } ], - "citeable": true + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:04.366880" + } } ] From 9a4f2858ac54cf731ad6c7f965a67511c8b69fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 14 
Dec 2017 09:18:34 +0100 Subject: [PATCH 12/21] remove record_class field, as Record is default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/oaipmh_spider.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index 0f3adf6d..e92c5b28 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -60,7 +60,6 @@ def __init__( from_date=None, until_date=None, granularity=_Granularity.DATE, - record_class=Record, *args, **kwargs ): super(OAIPMHSpider, self).__init__(*args, **kwargs) @@ -71,7 +70,6 @@ def __init__( self.alias = alias or self._make_alias() self.from_date = from_date self.until_date = until_date - self.record_class = record_class def start_requests(self): self.from_date = self.from_date or self._resume_from @@ -106,10 +104,7 @@ def parse_record(self, record): raise NotImplementedError() def parse(self, response): - sickle = Sickle(self.url, class_mapping={ - 'ListRecords': self.record_class, - 'GetRecord': self.record_class, - }) + sickle = Sickle(self.url) try: records = sickle.ListRecords(**{ 'metadataPrefix': self.metadata_prefix, From 077c1f172e99489cdbb2b10f95877b2ed9385ae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 14 Dec 2017 09:23:36 +0100 Subject: [PATCH 13/21] use os.path.join in cds_spider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/cds_spider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index f5359536..edcdeb12 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -11,8 +11,8 @@ import logging from flask.app import Flask -from dojson.contrib.marc21.utils import create_record from
inspire_dojson import marcxml2record +from os.path import join as path_join from .oaipmh_spider import OAIPMHSpider from ..utils import ParsedItem @@ -59,5 +59,5 @@ def parse_record(self, selector): with app.app_context(): json_record = marcxml2record(record) base_uri = self.settings['SCHEMA_BASE_URI'] - json_record['$schema'] = base_uri + 'hep.json' + json_record['$schema'] = path_join(base_uri, 'hep.json') return ParsedItem(record=json_record, record_format='hep') From 054aa0b4d2eb2b716f439f99d3112d083e95ee09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 14 Dec 2017 09:34:25 +0100 Subject: [PATCH 14/21] remove url from the last_run file hash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/oaipmh_spider.py | 4 +--- tests/unit/test_oaipmh.py | 11 +++++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index e92c5b28..119ba470 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -67,7 +67,6 @@ def __init__( self.metadata_prefix = metadata_prefix self.set = oai_set self.granularity = granularity - self.alias = alias or self._make_alias() self.from_date = from_date self.until_date = until_date @@ -121,8 +120,7 @@ def parse(self, response): yield self.parse_record(selector) def _make_alias(self): - return '{url}?metadataPrefix={metadata_prefix}&set={set}'.format( - url=self.url, + return 'metadataPrefix={metadata_prefix}&set={set}'.format( metadata_prefix=self.metadata_prefix, set=self.set ) diff --git a/tests/unit/test_oaipmh.py b/tests/unit/test_oaipmh.py index a899aed0..aeea0c57 100644 --- a/tests/unit/test_oaipmh.py +++ b/tests/unit/test_oaipmh.py @@ -17,6 +17,9 @@ from scrapy.utils.project import get_project_settings +LAST_RUN_TEST_FILE_SHA1 = '4fabe0a2d2f3cb58e656f307b6290b3edd46acd6' + + def 
override_dynamic_fields(run): if 'last_run_finished_at' in run: run['last_run_finished_at'] = '2017-12-08T23:55:54.794969' @@ -26,7 +29,7 @@ def override_dynamic_fields(run): @pytest.fixture(scope='function') def cleanup(): yield - remove('/tmp/last_runs/OAI-PMH/2cea86bbc1d329b4273a29dc603fb8c0bb91439c.json') + remove('/tmp/last_runs/OAI-PMH/{}.json'.format(LAST_RUN_TEST_FILE_SHA1)) rmdir('/tmp/last_runs/OAI-PMH') rmdir('/tmp/last_runs') @@ -43,7 +46,7 @@ def settings(): @pytest.fixture def spider(settings): - spider = OAIPMHSpider('http://export.arxiv.org/oai2', settings=settings) + spider = OAIPMHSpider('http://0.0.0.0/oai2', settings=settings) spider.from_date = '2017-12-08' spider.set = 'physics:hep-th' spider.metadata_prefix = 'marcxml' @@ -51,7 +54,7 @@ def spider(settings): def test_last_run_file_path(spider): - expected = '/tmp/last_runs/OAI-PMH/2cea86bbc1d329b4273a29dc603fb8c0bb91439c.json' + expected = '/tmp/last_runs/OAI-PMH/{}.json'.format(LAST_RUN_TEST_FILE_SHA1) result = spider._last_run_file_path() assert expected == result @@ -65,7 +68,7 @@ def test_store_and_load_last_run(spider, cleanup): expected = override_dynamic_fields({ 'spider': 'OAI-PMH', - 'url': 'http://export.arxiv.org/oai2', + 'url': 'http://0.0.0.0/oai2', 'metadata_prefix': 'marcxml', 'set': 'physics:hep-th', 'granularity': 'YYYY-MM-DD', From b3159f715bb92eceb6338b9a294b3b523b3fe414 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 14 Dec 2017 09:45:57 +0100 Subject: [PATCH 15/21] remove granularity, default to YYYY-MM-DD for now MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/oaipmh_spider.py | 20 ++------------------ tests/unit/test_oaipmh.py | 29 +---------------------------- 2 files changed, 3 insertions(+), 46 deletions(-) diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index 119ba470..1a057dd2 100644 --- 
a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -10,7 +10,6 @@ """Generic spider for OAI-PMH servers.""" import logging -from enum import Enum from errno import EEXIST from datetime import datetime from dateutil import parser as dateparser @@ -29,17 +28,6 @@ LOGGER = logging.getLogger(__name__) -class _Granularity(Enum): - DATE = 'YYYY-MM-DD' - SECOND = 'YYYY-MM-DDThh:mm:ssZ' - - def format(self, datetime_object): - if self == self.DATE: - return datetime_object.strftime('%Y-%m-%d') - if self == self.SECOND: - return datetime_object.strftime('%Y-%m-%dT%H:%M:%SZ') - - class OAIPMHSpider(StatefulSpider): """ Implements a spider for the OAI-PMH protocol by using the Python sickle library. @@ -49,7 +37,6 @@ class OAIPMHSpider(StatefulSpider): next harvest. """ name = 'OAI-PMH' - granularity = _Granularity.DATE def __init__( self, @@ -59,14 +46,12 @@ def __init__( alias=None, from_date=None, until_date=None, - granularity=_Granularity.DATE, *args, **kwargs ): super(OAIPMHSpider, self).__init__(*args, **kwargs) self.url = url self.metadata_prefix = metadata_prefix self.set = oai_set - self.granularity = granularity self.from_date = from_date self.until_date = until_date @@ -91,7 +76,7 @@ def start_requests(self): self._save_run(started_at) LOGGER.info("Harvesting completed. 
Next harvesting will resume from {}" - .format(self.until_date or self.granularity.format(now))) + .format(self.until_date or now.strftime('%Y-%m-%d'))) def parse_record(self, record): """ @@ -164,7 +149,6 @@ def _save_run(self, started_at): 'url': self.url, 'metadata_prefix': self.metadata_prefix, 'set': self.set, - 'granularity': self.granularity.value, 'from_date': self.from_date, 'until_date': self.until_date, 'last_run_started_at': started_at.isoformat(), @@ -189,4 +173,4 @@ def _resume_from(self): return None resume_at = last_run['until_date'] or last_run['last_run_finished_at'] date_parsed = dateparser.parse(resume_at) - return self.granularity.format(date_parsed) + return date_parsed.strftime('%Y-%m-%d') diff --git a/tests/unit/test_oaipmh.py b/tests/unit/test_oaipmh.py index aeea0c57..9d1e03e9 100644 --- a/tests/unit/test_oaipmh.py +++ b/tests/unit/test_oaipmh.py @@ -13,7 +13,7 @@ from os import remove, rmdir import pytest -from hepcrawl.spiders.oaipmh_spider import OAIPMHSpider, _Granularity +from hepcrawl.spiders.oaipmh_spider import OAIPMHSpider from scrapy.utils.project import get_project_settings @@ -71,7 +71,6 @@ def test_store_and_load_last_run(spider, cleanup): 'url': 'http://0.0.0.0/oai2', 'metadata_prefix': 'marcxml', 'set': 'physics:hep-th', - 'granularity': 'YYYY-MM-DD', 'from_date': '2017-12-08', 'until_date': None, 'last_run_started_at': now.isoformat(), @@ -93,29 +92,3 @@ def test_load_nonexistent(spider): def test_resume_from_nonexistent_no_error(spider): resume_from = spider._resume_from assert resume_from == None - - -@pytest.mark.parametrize('until_date,last_run,expected,granularity', [ - ('2017-12-08T13:54:00.0', '2017-12-08T13:54:00.0', '2017-12-08', _Granularity.DATE), - ('2017-12-08T13:54:00.0', '2017-12-08T13:54:00.0', '2017-12-08T13:54:00Z', _Granularity.SECOND), - ('2017-12-08', '2017-12-08', '2017-12-08', _Granularity.DATE), - ('2017-12-08', '2017-12-08', '2017-12-08T00:00:00Z', _Granularity.SECOND), - (None, 
'2017-12-10T13:54:00.0', '2017-12-10', _Granularity.DATE), - (None, '2017-12-10', '2017-12-10T00:00:00Z', _Granularity.SECOND), -]) -def test_resume_from(spider, until_date, last_run, expected, granularity, cleanup): - spider.until_date = until_date - spider.granularity = granularity - spider._save_run(started_at=datetime.utcnow()) - - with open(spider._last_run_file_path(), 'r') as f: - run_record = json.load(f) - - run_record['last_run_finished_at'] = last_run - - with open(spider._last_run_file_path(), 'w+') as f: - json.dump(run_record, f) - - result = spider._resume_from - - assert expected == result From 10804f70184857f7d45b0af232665641e7878fc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 14 Dec 2017 09:56:31 +0100 Subject: [PATCH 16/21] refactor tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- tests/functional/cds/test_cds.py | 12 +++--------- tests/unit/test_oaipmh.py | 5 ++--- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/tests/functional/cds/test_cds.py b/tests/functional/cds/test_cds.py index ae314225..eb25cd27 100644 --- a/tests/functional/cds/test_cds.py +++ b/tests/functional/cds/test_cds.py @@ -20,7 +20,7 @@ clean_dir, ) from hepcrawl.testlib.tasks import app as celery_app -from hepcrawl.testlib.utils import get_crawler_instance +from hepcrawl.testlib.utils import get_crawler_instance, deep_sort @pytest.fixture(scope='function', autouse=True) @@ -92,13 +92,7 @@ def test_cds( override_generated_fields(expected) for expected in expected_results ] - gotten_results = sorted( - gotten_results, - key=lambda x: x['document_type'] - ) - expected_results = sorted( - expected_results, - key=lambda x: x['document_type'] - ) + gotten_results = deep_sort(gotten_results) + expected_results = deep_sort(expected_results) assert gotten_results == expected_results diff --git a/tests/unit/test_oaipmh.py b/tests/unit/test_oaipmh.py index 
9d1e03e9..497ff25b 100644 --- a/tests/unit/test_oaipmh.py +++ b/tests/unit/test_oaipmh.py @@ -14,6 +14,7 @@ import pytest from hepcrawl.spiders.oaipmh_spider import OAIPMHSpider +from hepcrawl.testlib.fixtures import clean_dir from scrapy.utils.project import get_project_settings @@ -29,9 +30,7 @@ def override_dynamic_fields(run): @pytest.fixture(scope='function') def cleanup(): yield - remove('/tmp/last_runs/OAI-PMH/{}.json'.format(LAST_RUN_TEST_FILE_SHA1)) - rmdir('/tmp/last_runs/OAI-PMH') - rmdir('/tmp/last_runs') + clean_dir('/tmp/last_runs/') @pytest.fixture From 5851258429f557f9dea6cd80491f380084427baa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 14 Dec 2017 10:17:31 +0100 Subject: [PATCH 17/21] stricter error catching when loading last_runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create an exception for when a last runs file doesn't exist. Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/oaipmh_spider.py | 29 +- tests/functional/cds/fixtures/cds.xml | 1480 ------------------------- tests/unit/test_oaipmh.py | 9 +- 3 files changed, 22 insertions(+), 1496 deletions(-) delete mode 100644 tests/functional/cds/fixtures/cds.xml diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index 1a057dd2..d785b214 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/oaipmh_spider.py @@ -10,7 +10,7 @@ """Generic spider for OAI-PMH servers.""" import logging -from errno import EEXIST +from errno import EEXIST as FILE_EXISTS, ENOENT as NO_SUCH_FILE_OR_DIR from datetime import datetime from dateutil import parser as dateparser import hashlib @@ -28,6 +28,12 @@ LOGGER = logging.getLogger(__name__) +class NoLastRunToLoad(Exception): + """Error raised when there was a problem with loading the last_runs file""" + def __init__(self, file_path): + self.message = "Failed to load file at {}".format(file_path) + + class OAIPMHSpider(StatefulSpider): """
Implements a spider for the OAI-PMH protocol by using the Python sickle library. @@ -132,8 +138,10 @@ def _load_last_run(self): last_run = json.load(f) LOGGER.info('Last run file loaded: {}'.format(repr(last_run))) return last_run - except IOError: - return None + except IOError as exc: + if exc.errno == NO_SUCH_FILE_OR_DIR: + raise NoLastRunToLoad(file_path) + raise def _save_run(self, started_at): """Store last run information @@ -159,18 +167,17 @@ def _save_run(self, started_at): try: makedirs(path.dirname(file_path)) except OSError as exc: - if exc.errno == EEXIST: - pass - else: + if exc.errno != FILE_EXISTS: raise with open(file_path, 'w') as f: json.dump(last_run_info, f, indent=4) @property def _resume_from(self): - last_run = self._load_last_run() - if not last_run: + try: + last_run = self._load_last_run() + resume_at = last_run['until_date'] or last_run['last_run_finished_at'] + date_parsed = dateparser.parse(resume_at) + return date_parsed.strftime('%Y-%m-%d') + except NoLastRunToLoad: return None - resume_at = last_run['until_date'] or last_run['last_run_finished_at'] - date_parsed = dateparser.parse(resume_at) - return date_parsed.strftime('%Y-%m-%d') diff --git a/tests/functional/cds/fixtures/cds.xml b/tests/functional/cds/fixtures/cds.xml deleted file mode 100644 index 9bec8576..00000000 --- a/tests/functional/cds/fixtures/cds.xml +++ /dev/null @@ -1,1480 +0,0 @@ - - - -2017-12-07T15:05:26Zhttp://cds.cern.ch/oai2d -
oai:cds.cern.ch:12007522017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1200752 - SzGeCERN - 20171116090930.0 - - oai:cds.cern.ch:1200752 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509577 - - - eng - - - Dubus, G - Grenoble Observ. - - - High and very high energy gamma-ray emission from binaries - - - 2009 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS MQW7-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - 018 - PoS - MQW7 - 2008 - - - http://cds.cern.ch/record/1200752/files/MQW7_018.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090817 - - - 1129423 - 018 - izmir20080901 - - - PUBLIC - - - 002842486CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12007532017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1200753 - SzGeCERN - 20171116090930.0 - - oai:cds.cern.ch:1200753 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509578 - - - eng - - - Dubois, R - SLAC - - - GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars - - - Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars - Other title - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS MQW7-2009 - - - No authors - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - for the Fermi LAT Collaboration - - - 019 - PoS - MQW7 - 2008 - - - http://cds.cern.ch/record/1200753/files/MQW7_019.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090817 - - - 1129423 - 019 - izmir20080901 - - - PUBLIC - - - 002842487CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12007542017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1200754 - SzGeCERN - 20171116090930.0 - - oai:cds.cern.ch:1200754 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509579 - - - eng - - - Romero, G E - Villa Elisa, Inst. Argentino Radioastron. - La Plata U. - - - Hadronic models of high-energy radiation from microquasars: recent developments - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS MQW7-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - 020 - PoS - MQW7 - 2008 - - - http://cds.cern.ch/record/1200754/files/MQW7_020.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090817 - - - 1129423 - 020 - izmir20080901 - - - PUBLIC - - - 002842488CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12032802017-11-16T08:09:52Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203280 - SzGeCERN - 20171116090952.0 - - oai:cds.cern.ch:1203280 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509595 - - - eng - - - Guess, C J - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - Studying matrix elements for the neutrinoless double beta decay of 150Nd via the 150Sm(t,3He)150Pm* and 150Nd(3He,t)150Pm* reactions - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Austin, S M - Michigan State U., NSCL - Michigan State U., JINA - - - Bazin, D - Michigan State U., NSCL - - - Brown, B A - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - Caesar, C - Michigan State U., NSCL - Mainz U. - - - Deaven, J M - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - Herlitzius, C - Michigan State U., NSCL - Mainz U. - - - Hitt, G W - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - Meharchand, R T - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - Perdikakis, G - Michigan State U., NSCL - Michigan State U., JINA - - - Shimbara, Y - Niigata U., Grad. Sch. Sci. Tech. - - - Tur, C - Michigan State U., NSCL - Michigan State U., JINA - - - Zegers, R G T - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - 104 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203280/files/NIC20X_104.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 104 - mackinacisland20080727 - - - PUBLIC - - - 002844587CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12032812017-11-16T08:09:55Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203281 - SzGeCERN - 20171116090955.0 - - oai:cds.cern.ch:1203281 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509596 - - - eng - - - Jachowicz, N - Ghent U. - - - Untangling supernova-neutrino oscillations with beta-beam data - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - McLaughlin, G C - North Carolina State U. - - - Volpe, C - Orsay, IPN - - - 107 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203281/files/NIC20X_107.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 107 - mackinacisland20080727 - - - PUBLIC - - - 002844588CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033612017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203361 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203361 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509597 - - - eng - - - Kawagoe, S - Tokyo U. - - - Neutrino oscillations in non-spherical supernova explosions - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Takiwaki, T - Tokyo U. - - - Kotake, K - Natl. Astron. Observ. of Japan - - - 109 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203361/files/NIC20X_109.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 109 - mackinacisland20080727 - - - PUBLIC - - - 002844668CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033622017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203362 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203362 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509598 - - - eng - - - Nakazato, K - Waseda U. - - - Neutrino Emission from Stellar Collapse including Hadron-Quark Mixed Phase - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Sumiyoshi, K - Numazu Coll. Tech. - - - Yamada, s - Waseda U. - - - 116 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203362/files/NIC20X_116.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 116 - mackinacisland20080727 - - - PUBLIC - - - 002844669CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033632017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203363 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203363 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509599 - - - eng - - - Sumiyoshi, K - Numazu Coll. Tech. - - - Short neutrino burst from failed supernovae as a probe of dense matter with hyperon mixture - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Ishizuka, C - Hokkaido U. - - - Ohnishi, A - Kyoto U., Yukawa Inst., Kyoto - - - Yamada, S - Waseda U. - - - Suzuki, H - Tokyo U. of Sci. - - - 122 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203363/files/NICX_122.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 122 - mackinacisland20080727 - - - PUBLIC - - - 002844670CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033642017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203364 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203364 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509600 - - - eng - - - Suzuki, T - Tokyo U. - - - Neutrino Nucleus Reactions and Nucleosynthesis in Stars - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Yoshida, T - Natl. Astron. Observ. of Japan - - - Chiba, S - JAEA, Ibaraki - - - Honma, M - Aizu U. - - - Higashiyama, K - Chiba Inst. Tech. - - - Umeda, H - Tokyo U. - - - Nomoto, K - Tokyo U. - - - Kajino, T - Tokyo U. - Natl. Astron. Observ. of Japan - - - Otsuka, T - Tokyo U. - - - 123 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203364/files/NICX_123.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 123 - mackinacisland20080727 - - - PUBLIC - - - 002844671CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033652017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203365 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203365 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509601 - - - eng - - - Whitehouse, S - Basel U. - - - Neutrino transport in 3D simulations of core-collapse supernovae - - - A new approach to neutrino transport in 3D simulations of core-collapse supernovae - Other title - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Liebendörfer, M - Basel U. - - - 243 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203365/files/NICX_243.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 243 - mackinacisland20080727 - - - PUBLIC - - - 002844672CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033662017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203366 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203366 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509602 - - - eng - - - Arcones, A - Damstadt, Tech. Hochsch. - Darmstadt, GSI - - - Neutrino-driven winds and nucleosynthesis - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Martínez-Pinedo, G - Darmstadt, GSI - - - Schwenk, A - TRIUMF - - - O’Connor, E - TRIUMF - Caltech - - - Langanke, K - Damstadt, Tech. Hochsch. - Darmstadt, GSI - - - Horowitz, C J - Indiana U. - - - Janka, H T - Garching, Max Planck Inst. - - - 128 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203366/files/NIC20X_128.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 128 - mackinacisland20080727 - - - PUBLIC - - - 002844673CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033672017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203367 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203367 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509603 - - - eng - - - Roberts, L - UC, Santa Cruz, Astron. Astrophys. - - - Nucleosynthesis in the Neutrino Driven Wind of Protoneutron Stars - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Woosley, S - UC, Santa Cruz, Astron. Astrophys. - - - Heger, A - Minnesota U. - - - Hoffman, R - LLNL, Livermore - - - 146 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203367/files/NICX_146.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 146 - mackinacisland20080727 - - - PUBLIC - - - 002844674CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033692017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203369 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203369 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509604 - - - eng - - - Kojima, K - Tokyo U. - Natl. Astron. Observ. of Japan - - - Neutrino effect in cosmology with the primordial magnetic field - - - Neutrino effects in cosmology with A primordial magnetic field - Other title - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Ichiki, K - Nagoya U. - - - Kajino, T - Tokyo U. - Natl. Astron. Observ. of Japan - - - Mathews, G J - Notre Dame U. - Natl. Astron. Observ. of Japan - - - 226 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203369/files/NICX_226.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 226 - mackinacisland20080727 - - - PUBLIC - - - 002844676CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033702017-11-16T08:09:47Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203370 - SzGeCERN - 20171116090947.0 - - oai:cds.cern.ch:1203370 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509605 - - - eng - - - Yamazaki, D G - Natl. Astron. Observ. of Japan - - - A Strong Constraint on the Neutrino Mass from the Formation of Large Scale Structure in the Presence of the Primordial Magnetic Field - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Ichiki, K - Tokyo U. - - - Kajino, T - Natl. Astron. Observ. of Japan - - - Mathews, G J - Notre Dame U. - - - 239 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203370/files/NICX_239.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 239 - mackinacisland20080727 - - - PUBLIC - - - 002844677CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
-
- diff --git a/tests/unit/test_oaipmh.py b/tests/unit/test_oaipmh.py index 497ff25b..42420f1b 100644 --- a/tests/unit/test_oaipmh.py +++ b/tests/unit/test_oaipmh.py @@ -10,10 +10,9 @@ from datetime import datetime import json from mock import patch -from os import remove, rmdir import pytest -from hepcrawl.spiders.oaipmh_spider import OAIPMHSpider +from hepcrawl.spiders.oaipmh_spider import OAIPMHSpider, NoLastRunToLoad from hepcrawl.testlib.fixtures import clean_dir from scrapy.utils.project import get_project_settings @@ -58,7 +57,7 @@ def test_last_run_file_path(spider): assert expected == result -def test_store_and_load_last_run(spider, cleanup): +def test_load_last_run(spider, cleanup): now = datetime.utcnow() spider._save_run(started_at=now) @@ -84,8 +83,8 @@ def test_store_and_load_last_run(spider, cleanup): def test_load_nonexistent(spider): - last_run = spider._load_last_run() - assert last_run == None + with pytest.raises(NoLastRunToLoad): + spider._load_last_run() def test_resume_from_nonexistent_no_error(spider): From 23c3d90475ce0d6fa1971ddcae9f3e91e7f8cd75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 14 Dec 2017 10:47:07 +0100 Subject: [PATCH 18/21] leave only a few test records, remove the rest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- .../functional/cds/fixtures/cds_expected.json | 1196 ----------------- .../cds/fixtures/http_server/records/cds.xml | 1195 ---------------- 2 files changed, 2391 deletions(-) diff --git a/tests/functional/cds/fixtures/cds_expected.json b/tests/functional/cds/fixtures/cds_expected.json index a4cf0cb5..51d219e1 100644 --- a/tests/functional/cds/fixtures/cds_expected.json +++ b/tests/functional/cds/fixtures/cds_expected.json @@ -214,1201 +214,5 @@ "submission_number": "None", "datetime": "2017-12-14T08:10:03.984541" } - }, - { - "core": true, - "documents": [ - { - "url": 
"http://cds.cern.ch/record/1203280/files/NIC20X_104.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NIC20X_104.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "Studying matrix elements for the neutrinoless double beta decay of 150Nd via the 150Sm(t,3He)150Pm* and 150Nd(3He,t)150Pm* reactions" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203280" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Michigan U." - }, - { - "value": "Michigan State U., JINA" - } - ], - "full_name": "Guess, C J" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Michigan State U., JINA" - } - ], - "full_name": "Austin, S M" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - } - ], - "full_name": "Bazin, D" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Michigan U." - }, - { - "value": "Michigan State U., JINA" - } - ], - "full_name": "Brown, B A" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Mainz U." - } - ], - "full_name": "Caesar, C" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Michigan U." - }, - { - "value": "Michigan State U., JINA" - } - ], - "full_name": "Deaven, J M" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Mainz U." - } - ], - "full_name": "Herlitzius, C" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Michigan U." - }, - { - "value": "Michigan State U., JINA" - } - ], - "full_name": "Hitt, G W" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Michigan U." 
- }, - { - "value": "Michigan State U., JINA" - } - ], - "full_name": "Meharchand, R T" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Michigan State U., JINA" - } - ], - "full_name": "Perdikakis, G" - }, - { - "affiliations": [ - { - "value": "Niigata U., Grad. Sch. Sci. Tech." - } - ], - "full_name": "Shimbara, Y" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Michigan State U., JINA" - } - ], - "full_name": "Tur, C" - }, - { - "affiliations": [ - { - "value": "Michigan State U., NSCL" - }, - { - "value": "Michigan U." - }, - { - "value": "Michigan State U., JINA" - } - ], - "full_name": "Zegers, R G T" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "104", - "journal_title": "PoS", - "artid": "104", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.019463" - } - }, - { - "core": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/1203281/files/NIC20X_107.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NIC20X_107.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "Untangling supernova-neutrino oscillations with beta-beam data" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203281" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "Ghent U." - } - ], - "full_name": "Jachowicz, N" - }, - { - "affiliations": [ - { - "value": "North Carolina State U." 
- } - ], - "full_name": "McLaughlin, G C" - }, - { - "affiliations": [ - { - "value": "Orsay, IPN" - } - ], - "full_name": "Volpe, C" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "107", - "journal_title": "PoS", - "artid": "107", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.058926" - } - }, - { - "core": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/1203361/files/NIC20X_109.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NIC20X_109.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "Neutrino oscillations in non-spherical supernova explosions" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203361" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "Tokyo U." - } - ], - "full_name": "Kawagoe, S" - }, - { - "affiliations": [ - { - "value": "Tokyo U." - } - ], - "full_name": "Takiwaki, T" - }, - { - "affiliations": [ - { - "value": "Natl. Astron. Observ. 
of Japan" - } - ], - "full_name": "Kotake, K" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "109", - "journal_title": "PoS", - "artid": "109", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.091842" - } - }, - { - "core": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/1203362/files/NIC20X_116.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NIC20X_116.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "Neutrino Emission from Stellar Collapse including Hadron-Quark Mixed Phase" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203362" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "Waseda U." - } - ], - "full_name": "Nakazato, K" - }, - { - "affiliations": [ - { - "value": "Numazu Coll. Tech." - } - ], - "full_name": "Sumiyoshi, K" - }, - { - "affiliations": [ - { - "value": "Waseda U." 
- } - ], - "full_name": "Yamada, s" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "116", - "journal_title": "PoS", - "artid": "116", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.125345" - } - }, - { - "core": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/1203363/files/NICX_122.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NICX_122.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "Short neutrino burst from failed supernovae as a probe of dense matter with hyperon mixture" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203363" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "Numazu Coll. Tech." - } - ], - "full_name": "Sumiyoshi, K" - }, - { - "affiliations": [ - { - "value": "Hokkaido U." - } - ], - "full_name": "Ishizuka, C" - }, - { - "affiliations": [ - { - "value": "Kyoto U., Yukawa Inst., Kyoto" - } - ], - "full_name": "Ohnishi, A" - }, - { - "affiliations": [ - { - "value": "Waseda U." - } - ], - "full_name": "Yamada, S" - }, - { - "affiliations": [ - { - "value": "Tokyo U. of Sci." 
- } - ], - "full_name": "Suzuki, H" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "122", - "journal_title": "PoS", - "artid": "122", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.158736" - } - }, - { - "core": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/1203364/files/NICX_123.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NICX_123.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "Neutrino Nucleus Reactions and Nucleosynthesis in Stars" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203364" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "Tokyo U." - } - ], - "full_name": "Suzuki, T" - }, - { - "affiliations": [ - { - "value": "Natl. Astron. Observ. of Japan" - } - ], - "full_name": "Yoshida, T" - }, - { - "affiliations": [ - { - "value": "JAEA, Ibaraki" - } - ], - "full_name": "Chiba, S" - }, - { - "affiliations": [ - { - "value": "Aizu U." - } - ], - "full_name": "Honma, M" - }, - { - "affiliations": [ - { - "value": "Chiba Inst. Tech." - } - ], - "full_name": "Higashiyama, K" - }, - { - "affiliations": [ - { - "value": "Tokyo U." - } - ], - "full_name": "Umeda, H" - }, - { - "affiliations": [ - { - "value": "Tokyo U." - } - ], - "full_name": "Nomoto, K" - }, - { - "affiliations": [ - { - "value": "Tokyo U." - }, - { - "value": "Natl. Astron. Observ. of Japan" - } - ], - "full_name": "Kajino, T" - }, - { - "affiliations": [ - { - "value": "Tokyo U." 
- } - ], - "full_name": "Otsuka, T" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "123", - "journal_title": "PoS", - "artid": "123", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.193230" - } - }, - { - "core": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/1203365/files/NICX_243.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NICX_243.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "Neutrino transport in 3D simulations of core-collapse supernovae" - }, - { - "source": "CDS", - "title": "A new approach to neutrino transport in 3D simulations of core-collapse supernovae" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203365" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "Basel U." - } - ], - "full_name": "Whitehouse, S" - }, - { - "affiliations": [ - { - "value": "Basel U." 
- } - ], - "full_name": "Liebendörfer, M" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "243", - "journal_title": "PoS", - "artid": "243", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.228093" - } - }, - { - "core": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/1203366/files/NIC20X_128.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NIC20X_128.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "Neutrino-driven winds and nucleosynthesis" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203366" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "Damstadt, Tech. Hochsch." - }, - { - "value": "Darmstadt, GSI" - } - ], - "full_name": "Arcones, A" - }, - { - "affiliations": [ - { - "value": "Darmstadt, GSI" - } - ], - "full_name": "Martínez-Pinedo, G" - }, - { - "affiliations": [ - { - "value": "TRIUMF" - } - ], - "full_name": "Schwenk, A" - }, - { - "affiliations": [ - { - "value": "TRIUMF" - }, - { - "value": "Caltech" - } - ], - "full_name": "O’Connor, E" - }, - { - "affiliations": [ - { - "value": "Damstadt, Tech. Hochsch." - }, - { - "value": "Darmstadt, GSI" - } - ], - "full_name": "Langanke, K" - }, - { - "affiliations": [ - { - "value": "Indiana U." - } - ], - "full_name": "Horowitz, C J" - }, - { - "affiliations": [ - { - "value": "Garching, Max Planck Inst." 
- } - ], - "full_name": "Janka, H T" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "128", - "journal_title": "PoS", - "artid": "128", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.261882" - } - }, - { - "core": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/1203367/files/NICX_146.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NICX_146.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "Nucleosynthesis in the Neutrino Driven Wind of Protoneutron Stars" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203367" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "UC, Santa Cruz, Astron. Astrophys." - } - ], - "full_name": "Roberts, L" - }, - { - "affiliations": [ - { - "value": "UC, Santa Cruz, Astron. Astrophys." - } - ], - "full_name": "Woosley, S" - }, - { - "affiliations": [ - { - "value": "Minnesota U." 
- } - ], - "full_name": "Heger, A" - }, - { - "affiliations": [ - { - "value": "LLNL, Livermore" - } - ], - "full_name": "Hoffman, R" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "146", - "journal_title": "PoS", - "artid": "146", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.296986" - } - }, - { - "core": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/1203369/files/NICX_226.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NICX_226.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "Neutrino effect in cosmology with the primordial magnetic field" - }, - { - "source": "CDS", - "title": "Neutrino effects in cosmology with A primordial magnetic field" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203369" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "Tokyo U." - }, - { - "value": "Natl. Astron. Observ. of Japan" - } - ], - "full_name": "Kojima, K" - }, - { - "affiliations": [ - { - "value": "Nagoya U." - } - ], - "full_name": "Ichiki, K" - }, - { - "affiliations": [ - { - "value": "Tokyo U." - }, - { - "value": "Natl. Astron. Observ. of Japan" - } - ], - "full_name": "Kajino, T" - }, - { - "affiliations": [ - { - "value": "Notre Dame U." - }, - { - "value": "Natl. Astron. Observ. 
of Japan" - } - ], - "full_name": "Mathews, G J" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "226", - "journal_title": "PoS", - "artid": "226", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.330649" - } - }, - { - "core": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/1203370/files/NICX_239.pdf", - "source": "CDS", - "description": "Published version from PoS", - "key": "NICX_239.pdf" - } - ], - "curated": true, - "_collections": [ - "Literature" - ], - "inspire_categories": [ - { - "source": "cds", - "term": "Astrophysics" - } - ], - "titles": [ - { - "source": "CDS", - "title": "A Strong Constraint on the Neutrino Mass from the Formation of Large Scale Structure in the Presence of the Primordial Magnetic Field" - } - ], - "_private_notes": [ - { - "source": "CDS", - "value": "CDS-1203370" - } - ], - "authors": [ - { - "affiliations": [ - { - "value": "Natl. Astron. Observ. of Japan" - } - ], - "full_name": "Yamazaki, D G" - }, - { - "affiliations": [ - { - "value": "Tokyo U." - } - ], - "full_name": "Ichiki, K" - }, - { - "affiliations": [ - { - "value": "Natl. Astron. Observ. of Japan" - } - ], - "full_name": "Kajino, T" - }, - { - "affiliations": [ - { - "value": "Notre Dame U." 
- } - ], - "full_name": "Mathews, G J" - } - ], - "publication_info": [ - { - "journal_volume": "NIC X", - "page_start": "239", - "journal_title": "PoS", - "artid": "239", - "year": 2008 - } - ], - "$schema": "http://localhost/schemas/records/hep.json", - "document_type": [ - "conference paper" - ], - "citeable": true, - "imprints": [ - { - "date": "2008" - } - ], - "acquisition_source": { - "source": "CDS", - "method": "hepcrawl", - "submission_number": "None", - "datetime": "2017-12-14T08:10:04.366880" - } } ] diff --git a/tests/functional/cds/fixtures/http_server/records/cds.xml b/tests/functional/cds/fixtures/http_server/records/cds.xml index 9bec8576..c23aee04 100644 --- a/tests/functional/cds/fixtures/http_server/records/cds.xml +++ b/tests/functional/cds/fixtures/http_server/records/cds.xml @@ -280,1201 +280,6 @@ Hidden -
oai:cds.cern.ch:12032802017-11-16T08:09:52Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203280 - SzGeCERN - 20171116090952.0 - - oai:cds.cern.ch:1203280 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509595 - - - eng - - - Guess, C J - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - Studying matrix elements for the neutrinoless double beta decay of 150Nd via the 150Sm(t,3He)150Pm* and 150Nd(3He,t)150Pm* reactions - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Austin, S M - Michigan State U., NSCL - Michigan State U., JINA - - - Bazin, D - Michigan State U., NSCL - - - Brown, B A - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - Caesar, C - Michigan State U., NSCL - Mainz U. - - - Deaven, J M - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - Herlitzius, C - Michigan State U., NSCL - Mainz U. - - - Hitt, G W - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - Meharchand, R T - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - Perdikakis, G - Michigan State U., NSCL - Michigan State U., JINA - - - Shimbara, Y - Niigata U., Grad. Sch. Sci. Tech. - - - Tur, C - Michigan State U., NSCL - Michigan State U., JINA - - - Zegers, R G T - Michigan State U., NSCL - Michigan U. - Michigan State U., JINA - - - 104 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203280/files/NIC20X_104.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 104 - mackinacisland20080727 - - - PUBLIC - - - 002844587CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12032812017-11-16T08:09:55Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203281 - SzGeCERN - 20171116090955.0 - - oai:cds.cern.ch:1203281 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509596 - - - eng - - - Jachowicz, N - Ghent U. - - - Untangling supernova-neutrino oscillations with beta-beam data - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - McLaughlin, G C - North Carolina State U. - - - Volpe, C - Orsay, IPN - - - 107 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203281/files/NIC20X_107.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 107 - mackinacisland20080727 - - - PUBLIC - - - 002844588CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033612017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203361 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203361 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509597 - - - eng - - - Kawagoe, S - Tokyo U. - - - Neutrino oscillations in non-spherical supernova explosions - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Takiwaki, T - Tokyo U. - - - Kotake, K - Natl. Astron. Observ. of Japan - - - 109 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203361/files/NIC20X_109.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 109 - mackinacisland20080727 - - - PUBLIC - - - 002844668CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033622017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203362 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203362 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509598 - - - eng - - - Nakazato, K - Waseda U. - - - Neutrino Emission from Stellar Collapse including Hadron-Quark Mixed Phase - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Sumiyoshi, K - Numazu Coll. Tech. - - - Yamada, s - Waseda U. - - - 116 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203362/files/NIC20X_116.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 116 - mackinacisland20080727 - - - PUBLIC - - - 002844669CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033632017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203363 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203363 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509599 - - - eng - - - Sumiyoshi, K - Numazu Coll. Tech. - - - Short neutrino burst from failed supernovae as a probe of dense matter with hyperon mixture - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Ishizuka, C - Hokkaido U. - - - Ohnishi, A - Kyoto U., Yukawa Inst., Kyoto - - - Yamada, S - Waseda U. - - - Suzuki, H - Tokyo U. of Sci. - - - 122 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203363/files/NICX_122.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 122 - mackinacisland20080727 - - - PUBLIC - - - 002844670CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033642017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203364 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203364 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509600 - - - eng - - - Suzuki, T - Tokyo U. - - - Neutrino Nucleus Reactions and Nucleosynthesis in Stars - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Yoshida, T - Natl. Astron. Observ. of Japan - - - Chiba, S - JAEA, Ibaraki - - - Honma, M - Aizu U. - - - Higashiyama, K - Chiba Inst. Tech. - - - Umeda, H - Tokyo U. - - - Nomoto, K - Tokyo U. - - - Kajino, T - Tokyo U. - Natl. Astron. Observ. of Japan - - - Otsuka, T - Tokyo U. - - - 123 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203364/files/NICX_123.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 123 - mackinacisland20080727 - - - PUBLIC - - - 002844671CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033652017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203365 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203365 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509601 - - - eng - - - Whitehouse, S - Basel U. - - - Neutrino transport in 3D simulations of core-collapse supernovae - - - A new approach to neutrino transport in 3D simulations of core-collapse supernovae - Other title - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Liebendörfer, M - Basel U. - - - 243 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203365/files/NICX_243.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 243 - mackinacisland20080727 - - - PUBLIC - - - 002844672CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033662017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203366 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203366 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509602 - - - eng - - - Arcones, A - Damstadt, Tech. Hochsch. - Darmstadt, GSI - - - Neutrino-driven winds and nucleosynthesis - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Martínez-Pinedo, G - Darmstadt, GSI - - - Schwenk, A - TRIUMF - - - O’Connor, E - TRIUMF - Caltech - - - Langanke, K - Damstadt, Tech. Hochsch. - Darmstadt, GSI - - - Horowitz, C J - Indiana U. - - - Janka, H T - Garching, Max Planck Inst. - - - 128 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203366/files/NIC20X_128.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 128 - mackinacisland20080727 - - - PUBLIC - - - 002844673CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033672017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203367 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203367 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509603 - - - eng - - - Roberts, L - UC, Santa Cruz, Astron. Astrophys. - - - Nucleosynthesis in the Neutrino Driven Wind of Protoneutron Stars - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Woosley, S - UC, Santa Cruz, Astron. Astrophys. - - - Heger, A - Minnesota U. - - - Hoffman, R - LLNL, Livermore - - - 146 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203367/files/NICX_146.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 146 - mackinacisland20080727 - - - PUBLIC - - - 002844674CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033692017-11-16T08:09:58Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203369 - SzGeCERN - 20171116090958.0 - - oai:cds.cern.ch:1203369 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509604 - - - eng - - - Kojima, K - Tokyo U. - Natl. Astron. Observ. of Japan - - - Neutrino effect in cosmology with the primordial magnetic field - - - Neutrino effects in cosmology with A primordial magnetic field - Other title - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Ichiki, K - Nagoya U. - - - Kajino, T - Tokyo U. - Natl. Astron. Observ. of Japan - - - Mathews, G J - Notre Dame U. - Natl. Astron. Observ. of Japan - - - 226 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203369/files/NICX_226.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 226 - mackinacisland20080727 - - - PUBLIC - - - 002844676CER - - - ARTICLE - - - ConferencePaper - - - Hidden - - -
oai:cds.cern.ch:12033702017-11-16T08:09:47Zcerncds:FULLTEXTforINSPIRE
- 00000coc 2200000uu 4500 - 1203370 - SzGeCERN - 20171116090947.0 - - oai:cds.cern.ch:1203370 - cerncds:FULLTEXT - forINSPIRE - - - Inspire - 1509605 - - - eng - - - Yamazaki, D G - Natl. Astron. Observ. of Japan - - - A Strong Constraint on the Neutrino Mass from the Formation of Large Scale Structure in the Presence of the Primordial Magnetic Field - - - 2008 - - - Open Access - CC-BY-NC-SA-3.0 - http://creativecommons.org/licenses/by-nc-sa/3.0/ - - - SIS POS NIC X-2009 - - - Inspire - - - SzGeCERN - Astrophysics and Astronomy - - - ARTICLE - - - Ichiki, K - Tokyo U. - - - Kajino, T - Natl. Astron. Observ. of Japan - - - Mathews, G J - Notre Dame U. - - - 239 - PoS - NIC X - 2008 - - - http://cds.cern.ch/record/1203370/files/NICX_239.pdf - Published version from PoS - - - n - 200933 - - - 13 - - - 20110201 - 1448 - CER01 - 20090827 - - - 1024674 - 239 - mackinacisland20080727 - - - PUBLIC - - - 002844677CER - - - ARTICLE - - - ConferencePaper - - - Hidden - -
From 332071f2f0af573d3bf64ff4b380994379e3cfb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 14 Dec 2017 10:50:40 +0100 Subject: [PATCH 19/21] tests: naming nad don't load directly from file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- tests/unit/test_oaipmh.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/unit/test_oaipmh.py b/tests/unit/test_oaipmh.py index 42420f1b..6db74b4c 100644 --- a/tests/unit/test_oaipmh.py +++ b/tests/unit/test_oaipmh.py @@ -8,7 +8,6 @@ # more details. from datetime import datetime -import json from mock import patch import pytest @@ -61,9 +60,6 @@ def test_load_last_run(spider, cleanup): now = datetime.utcnow() spider._save_run(started_at=now) - file_path = spider._last_run_file_path() - result = override_dynamic_fields(json.load(open(file_path))) - expected = override_dynamic_fields({ 'spider': 'OAI-PMH', 'url': 'http://0.0.0.0/oai2', @@ -75,14 +71,12 @@ def test_load_last_run(spider, cleanup): 'last_run_finished_at': '2017-12-08T13:55:00.000000', }) - assert expected == result - result = override_dynamic_fields(spider._load_last_run()) assert expected == result -def test_load_nonexistent(spider): +def test_load_last_run_nonexistent(spider): with pytest.raises(NoLastRunToLoad): spider._load_last_run() From a96f3c4140bd3a17846e67d850261a8127fb9ac4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 14 Dec 2017 11:24:26 +0100 Subject: [PATCH 20/21] make parse_record abstract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/oaipmh_spider.py | 3 +++ tests/unit/test_oaipmh.py | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/oaipmh_spider.py index d785b214..cc3b9452 100644 --- a/hepcrawl/spiders/oaipmh_spider.py 
+++ b/hepcrawl/spiders/oaipmh_spider.py @@ -9,6 +9,7 @@ """Generic spider for OAI-PMH servers.""" +import abc import logging from errno import EEXIST as FILE_EXISTS, ENOENT as NO_SUCH_FILE_OR_DIR from datetime import datetime @@ -42,6 +43,7 @@ class OAIPMHSpider(StatefulSpider): the initial starting date and will use it as `from_date` argument on the next harvest. """ + __metaclass__ = abc.ABCMeta name = 'OAI-PMH' def __init__( @@ -84,6 +86,7 @@ def start_requests(self): LOGGER.info("Harvesting completed. Next harvesting will resume from {}" .format(self.until_date or now.strftime('%Y-%m-%d'))) + @abc.abstractmethod def parse_record(self, record): """ This method need to be reimplemented in order to provide special parsing. diff --git a/tests/unit/test_oaipmh.py b/tests/unit/test_oaipmh.py index 6db74b4c..2366d72d 100644 --- a/tests/unit/test_oaipmh.py +++ b/tests/unit/test_oaipmh.py @@ -43,7 +43,11 @@ def settings(): @pytest.fixture def spider(settings): - spider = OAIPMHSpider('http://0.0.0.0/oai2', settings=settings) + class TestOAIPMHSpider(OAIPMHSpider): + def parse_record(self, record): + return None + + spider = TestOAIPMHSpider('http://0.0.0.0/oai2', settings=settings) spider.from_date = '2017-12-08' spider.set = 'physics:hep-th' spider.metadata_prefix = 'marcxml' From 6b7d886e29e97c27986c1d0ff4a4fb60f4220341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Tue, 16 Jan 2018 14:05:16 +0100 Subject: [PATCH 21/21] spiders: move Statetul and OAI to common module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/__init__.py | 8 -------- hepcrawl/spiders/alpha_spider.py | 2 +- hepcrawl/spiders/aps_spider.py | 2 +- hepcrawl/spiders/arxiv_spider.py | 2 +- hepcrawl/spiders/base_spider.py | 2 +- hepcrawl/spiders/brown_spider.py | 2 +- hepcrawl/spiders/cds_spider.py | 2 +- hepcrawl/spiders/common/__init__.py | 13 +++++++++++++ hepcrawl/spiders/{ => 
common}/oaipmh_spider.py | 2 +- hepcrawl/spiders/common/stateful_spider.py | 18 ++++++++++++++++++ hepcrawl/spiders/desy_spider.py | 2 +- hepcrawl/spiders/dnb_spider.py | 2 +- hepcrawl/spiders/edp_spider.py | 2 +- hepcrawl/spiders/elsevier_spider.py | 2 +- hepcrawl/spiders/hindawi_spider.py | 2 +- hepcrawl/spiders/infn_spider.py | 2 +- hepcrawl/spiders/iop_spider.py | 2 +- hepcrawl/spiders/magic_spider.py | 2 +- hepcrawl/spiders/mit_spider.py | 2 +- hepcrawl/spiders/phenix_spider.py | 2 +- hepcrawl/spiders/phil_spider.py | 2 +- hepcrawl/spiders/pos_spider.py | 2 +- hepcrawl/spiders/t2k_spider.py | 2 +- hepcrawl/spiders/wsp_spider.py | 2 +- tests/unit/test_oaipmh.py | 2 +- 25 files changed, 53 insertions(+), 30 deletions(-) create mode 100644 hepcrawl/spiders/common/__init__.py rename hepcrawl/spiders/{ => common}/oaipmh_spider.py (99%) create mode 100644 hepcrawl/spiders/common/stateful_spider.py diff --git a/hepcrawl/spiders/__init__.py b/hepcrawl/spiders/__init__.py index b931594e..2d6d6746 100644 --- a/hepcrawl/spiders/__init__.py +++ b/hepcrawl/spiders/__init__.py @@ -8,11 +8,3 @@ # more details. from __future__ import absolute_import, division, print_function - -from scrapy import Spider - - -class StatefulSpider(Spider): - def __init__(self, *args, **kwargs): - self.state = {} - super(StatefulSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/alpha_spider.py b/hepcrawl/spiders/alpha_spider.py index ce334056..8a9e285e 100644 --- a/hepcrawl/spiders/alpha_spider.py +++ b/hepcrawl/spiders/alpha_spider.py @@ -18,7 +18,7 @@ from scrapy import Request from scrapy.spiders import CrawlSpider -from . 
import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/aps_spider.py b/hepcrawl/spiders/aps_spider.py index 69e19010..0adf94b1 100644 --- a/hepcrawl/spiders/aps_spider.py +++ b/hepcrawl/spiders/aps_spider.py @@ -18,7 +18,7 @@ from scrapy import Request -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py index 64d076dc..a72b1e3b 100644 --- a/hepcrawl/spiders/arxiv_spider.py +++ b/hepcrawl/spiders/arxiv_spider.py @@ -16,7 +16,7 @@ from scrapy import Request, Selector from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..mappings import CONFERENCE_WORDS, THESIS_WORDS diff --git a/hepcrawl/spiders/base_spider.py b/hepcrawl/spiders/base_spider.py index 79748fde..0f596c68 100644 --- a/hepcrawl/spiders/base_spider.py +++ b/hepcrawl/spiders/base_spider.py @@ -16,7 +16,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/brown_spider.py b/hepcrawl/spiders/brown_spider.py index fe1c340d..dba3d27d 100644 --- a/hepcrawl/spiders/brown_spider.py +++ b/hepcrawl/spiders/brown_spider.py @@ -19,7 +19,7 @@ from scrapy import Request from scrapy.spiders import CrawlSpider -from . 
import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index edcdeb12..60d8d5be 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -14,7 +14,7 @@ from inspire_dojson import marcxml2record from os.path import join as path_join -from .oaipmh_spider import OAIPMHSpider +from .common import OAIPMHSpider from ..utils import ParsedItem diff --git a/hepcrawl/spiders/common/__init__.py b/hepcrawl/spiders/common/__init__.py new file mode 100644 index 00000000..5453444a --- /dev/null +++ b/hepcrawl/spiders/common/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2015, 2016, 2017, 2018 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +from __future__ import absolute_import, division, print_function + +from .oaipmh_spider import OAIPMHSpider +from .stateful_spider import StatefulSpider diff --git a/hepcrawl/spiders/oaipmh_spider.py b/hepcrawl/spiders/common/oaipmh_spider.py similarity index 99% rename from hepcrawl/spiders/oaipmh_spider.py rename to hepcrawl/spiders/common/oaipmh_spider.py index cc3b9452..3ea147a4 100644 --- a/hepcrawl/spiders/oaipmh_spider.py +++ b/hepcrawl/spiders/common/oaipmh_spider.py @@ -23,7 +23,7 @@ from scrapy.http import Request, XmlResponse from scrapy.selector import Selector -from . import StatefulSpider +from .stateful_spider import StatefulSpider LOGGER = logging.getLogger(__name__) diff --git a/hepcrawl/spiders/common/stateful_spider.py b/hepcrawl/spiders/common/stateful_spider.py new file mode 100644 index 00000000..3de5c613 --- /dev/null +++ b/hepcrawl/spiders/common/stateful_spider.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. 
+# Copyright (C) 2015, 2016, 2017, 2018 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +from __future__ import absolute_import, division, print_function + +from scrapy import Spider + + +class StatefulSpider(Spider): + def __init__(self, *args, **kwargs): + self.state = {} + super(StatefulSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index 05b35282..8da15c41 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -18,7 +18,7 @@ from scrapy import Request from six.moves import urllib -from . import StatefulSpider +from .common import StatefulSpider from ..utils import ( ftp_list_files, ftp_connection_info, diff --git a/hepcrawl/spiders/dnb_spider.py b/hepcrawl/spiders/dnb_spider.py index 5f243b94..b4a4540c 100644 --- a/hepcrawl/spiders/dnb_spider.py +++ b/hepcrawl/spiders/dnb_spider.py @@ -14,7 +14,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py index c051c8ee..5c1fc8db 100644 --- a/hepcrawl/spiders/edp_spider.py +++ b/hepcrawl/spiders/edp_spider.py @@ -19,7 +19,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..extractors.jats import Jats from ..items import HEPRecord from ..loaders import HEPLoader diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py index e2d4e919..f4d97b12 100644 --- a/hepcrawl/spiders/elsevier_spider.py +++ b/hepcrawl/spiders/elsevier_spider.py @@ -23,7 +23,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . 
import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py index 5f81f5b4..7c14ab41 100644 --- a/hepcrawl/spiders/hindawi_spider.py +++ b/hepcrawl/spiders/hindawi_spider.py @@ -14,7 +14,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py index 2e093ab1..04240307 100644 --- a/hepcrawl/spiders/infn_spider.py +++ b/hepcrawl/spiders/infn_spider.py @@ -19,7 +19,7 @@ from scrapy.http import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py index fbca3ae5..5b1f2826 100644 --- a/hepcrawl/spiders/iop_spider.py +++ b/hepcrawl/spiders/iop_spider.py @@ -18,7 +18,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..extractors.nlm import NLM from ..items import HEPRecord from ..loaders import HEPLoader diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py index 8dfd5d51..27f79b80 100644 --- a/hepcrawl/spiders/magic_spider.py +++ b/hepcrawl/spiders/magic_spider.py @@ -16,7 +16,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . 
import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py index 21804873..e24fcfb0 100644 --- a/hepcrawl/spiders/mit_spider.py +++ b/hepcrawl/spiders/mit_spider.py @@ -21,7 +21,7 @@ from scrapy.http import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py index aa54bd98..3e8b990b 100644 --- a/hepcrawl/spiders/phenix_spider.py +++ b/hepcrawl/spiders/phenix_spider.py @@ -16,7 +16,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ParsedItem diff --git a/hepcrawl/spiders/phil_spider.py b/hepcrawl/spiders/phil_spider.py index 06f52da2..b1b76284 100644 --- a/hepcrawl/spiders/phil_spider.py +++ b/hepcrawl/spiders/phil_spider.py @@ -17,7 +17,7 @@ from scrapy import Request from scrapy.spiders import CrawlSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 024eff6b..a85e3e5c 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -18,7 +18,7 @@ from scrapy import Request, Selector -from . 
import StatefulSpider +from .common import StatefulSpider from ..dateutils import create_valid_date from ..items import HEPRecord from ..loaders import HEPLoader diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py index db18eb1e..a165bbed 100644 --- a/hepcrawl/spiders/t2k_spider.py +++ b/hepcrawl/spiders/t2k_spider.py @@ -16,7 +16,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader from ..utils import ( diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 280b6875..5a5776ec 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -18,7 +18,7 @@ from scrapy import Request from scrapy.spiders import XMLFeedSpider -from . import StatefulSpider +from .common import StatefulSpider from ..parsers import JatsParser from ..utils import ( ftp_list_files, diff --git a/tests/unit/test_oaipmh.py b/tests/unit/test_oaipmh.py index 2366d72d..f1715cce 100644 --- a/tests/unit/test_oaipmh.py +++ b/tests/unit/test_oaipmh.py @@ -11,7 +11,7 @@ from mock import patch import pytest -from hepcrawl.spiders.oaipmh_spider import OAIPMHSpider, NoLastRunToLoad +from hepcrawl.spiders.common.oaipmh_spider import OAIPMHSpider, NoLastRunToLoad from hepcrawl.testlib.fixtures import clean_dir from scrapy.utils.project import get_project_settings