create an OAI-PMH spider to use in the CDS spider
Also remove old tests. Fixes inspirehep#197.

Co-authored-by: Samuele Kaplun <[email protected]>
Signed-off-by: Szymon Łopaciuk <[email protected]>
kaplun authored and szymonlopaciuk committed Jan 16, 2018
1 parent 9824bd9 commit fad1b50
Showing 10 changed files with 187 additions and 439 deletions.
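
The new OAIPMHSpider added below (hepcrawl/spiders/oaipmh_spider.py) is meant to be reusable: any OAI-PMH endpoint can be harvested by subclassing it and overriding parse_record(), exactly as the reworked CDS spider does. A hypothetical minimal subclass is sketched here for orientation; the endpoint URL, metadata prefix and set name are made up and are not part of this commit.

from hepcrawl.spiders.oaipmh_spider import OAIPMHSpider


class ExampleOAISpider(OAIPMHSpider):
    """Hypothetical spider for some other OAI-PMH endpoint (illustration only)."""

    name = 'example_oai'

    def __init__(self, from_date=None, **kwargs):
        super(ExampleOAISpider, self).__init__(
            url='https://repository.example.org/oai2d',  # made-up endpoint
            metadata_prefix='oai_dc',                     # made-up prefix
            set='physics',                                # made-up set
            from_date=from_date,
            **kwargs
        )

    def parse_record(self, record):
        # record is a sickle.models.Record; return whatever item the pipeline needs.
        return {'identifier': record.header.identifier}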
22 changes: 22 additions & 0 deletions hepcrawl/downloaders.py
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016, 2017 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Additional downloaders."""


from scrapy.http import Response


class DummyDownloadHandler(object):
def __init__(self, *args, **kwargs):
pass

def download_request(self, request, spider):
url = request.url
return Response(url, request=request)
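
For context, the handler above never touches the network; it hands Scrapy an empty Response so the scheduler and middleware chain still run, while the actual OAI-PMH traffic is performed by Sickle inside the spider's parse() callback. A minimal sketch of its behaviour, assuming the oaipmh+https scheme is registered as in the settings.py change further down; this snippet is illustrative and not part of the commit.

from scrapy.http import Request

from hepcrawl.downloaders import DummyDownloadHandler

# A request using the custom scheme that settings.py maps to the dummy handler.
request = Request('oaipmh+https://cds.cern.ch/oai2d')
handler = DummyDownloadHandler()
response = handler.download_request(request, spider=None)

assert response.url == 'oaipmh+https://cds.cern.ch/oai2d'
assert response.body == b''  # nothing was actually downloaded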
7 changes: 5 additions & 2 deletions hepcrawl/pipelines.py
@@ -116,8 +116,11 @@ def process_item(self, item, spider):

hep_record = self._post_enhance_item(item, spider)

validate(hep_record, 'hep')
spider.logger.debug('Validated item by Inspire Schemas.')
try:
validate(hep_record, 'hep')
spider.logger.debug('Validated item by Inspire Schemas.')
except Exception as err:
spider.logger.error('ERROR in validating {}: {}'.format(hep_record, err))

self.results_data.append(hep_record)

2 changes: 1 addition & 1 deletion hepcrawl/scrapy.cfg
@@ -14,7 +14,7 @@
default = hepcrawl.settings

[deploy]
url = http://scrapyd:6800/
url = http://localhost:6800/
project = hepcrawl
#username = scrapy
#password = secret
8 changes: 8 additions & 0 deletions hepcrawl/settings.py
@@ -19,6 +19,8 @@

from __future__ import absolute_import, division, print_function

from scrapy.settings import default_settings

import os


@@ -71,6 +73,12 @@
'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100,
}

DOWNLOAD_HANDLERS_BASE = dict(default_settings.DOWNLOAD_HANDLERS_BASE)
DOWNLOAD_HANDLERS_BASE.update({
'oaipmh+http': 'hepcrawl.downloaders.DummyDownloadHandler',
'oaipmh+https': 'hepcrawl.downloaders.DummyDownloadHandler',
})

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
2 changes: 1 addition & 1 deletion hepcrawl/spiders/arxiv_spider.py
@@ -46,7 +46,7 @@ class ArxivSpider(StatefulSpider, XMLFeedSpider):
"""

name = 'arXiv'
iterator = 'xml'
iterator = 'iternodes'
itertag = 'OAI-PMH:record'
namespaces = [
("OAI-PMH", "http://www.openarchives.org/OAI/2.0/")
68 changes: 32 additions & 36 deletions hepcrawl/spiders/cds_spider.py
@@ -9,8 +9,11 @@

"""Spider for the CERN Document Server OAI-PMH interface"""

from scrapy.spider import XMLFeedSpider
import logging
from scrapy import Request
from scrapy.http import XmlResponse
from scrapy.selector import Selector
from flask.app import Flask
from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire
from harvestingkit.bibrecord import (
create_record as create_bibrec,
@@ -19,19 +22,19 @@
from dojson.contrib.marc21.utils import create_record
from inspire_dojson.hep import hep

from . import StatefulSpider
from .oaipmh_spider import OAIPMHSpider
from ..utils import ParsedItem

logger = logging.getLogger(__name__)

class CDSSpider(StatefulSpider, XMLFeedSpider):
class CDSSpider(OAIPMHSpider):
"""Spider for crawling the CERN Document Server OAI-PMH XML files.
Example:
Using OAI-PMH XML files::
$ scrapy crawl \\
cds \\
-a "source_file=file://$PWD/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml"
$ scrapy crawl CDS \\
-a "set=forINSPIRE" -a "from_date=2017-10-10"
It uses `HarvestingKit <https://pypi.python.org/pypi/HarvestingKit>`_ to
translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. It then
@@ -40,36 +43,29 @@ class CDSSpider(StatefulSpider, XMLFeedSpider):
"""

name = 'CDS'
iterator = 'xml'
itertag = 'OAI-PMH:record'
namespaces = [
('OAI-PMH', 'http://www.openarchives.org/OAI/2.0/'),
('marc', 'http://www.loc.gov/MARC21/slim'),
]

def __init__(self, source_file=None, **kwargs):
super(CDSSpider, self).__init__(**kwargs)
self.source_file = source_file
def __init__(self, from_date=None, set="forINSPIRE", *args, **kwargs):
super(CDSSpider, self).__init__(url='http://cds.cern.ch/oai2d', metadata_prefix='marcxml', set=set, from_date=from_date, **kwargs)

def start_requests(self):
yield Request(self.source_file)

def parse_node(self, response, node):
node.remove_namespaces()
cds_bibrec, ok, errs = create_bibrec(
node.xpath('.//record').extract()[0]
)
if not ok:
raise RuntimeError("Cannot parse record %s: %s", node, errs)
self.logger.info("Here's the record: %s" % cds_bibrec)
inspire_bibrec = CDS2Inspire(cds_bibrec).get_record()
marcxml_record = record_xml_output(inspire_bibrec)
record = create_record(marcxml_record)
json_record = hep.do(record)
base_uri = self.settings['SCHEMA_BASE_URI']
json_record['$schema'] = base_uri + 'hep.json'
parsed_item = ParsedItem(
record=json_record,
record_format='hep',
def parse_record(self, record):
response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
selector = Selector(response, type='xml')
selector.remove_namespaces()
try:
cds_bibrec, ok, errs = create_bibrec(selector.xpath('.//record').extract()[0])
if not ok:
raise RuntimeError("Cannot parse record %s: %s", record, errs)
self.logger.info("Here's the record: %s" % cds_bibrec)
inspire_bibrec = CDS2Inspire(cds_bibrec).get_record()
marcxml_record = record_xml_output(inspire_bibrec)
record = create_record(marcxml_record)
app = Flask('hepcrawl')
app.config.update(
self.settings.getdict('MARC_TO_HEP_SETTINGS', {})
)
return parsed_item
with app.app_context():
json_record = hep.do(record)
return ParsedItem(record=json_record, record_format='hep')
except Exception:
logger.exception("Error when parsing record")
return None
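
The conversion chain in parse_record() above goes CDS MARCXML, then INSPIRE legacy MARCXML via HarvestingKit, then a dojson record, then HEP JSON via inspire_dojson. A stripped-down sketch of the same chain on a raw MARCXML string, outside Scrapy, is shown below; note that it omits the Flask application context (MARC_TO_HEP_SETTINGS) that the spider sets up around hep.do(), so it is an approximation rather than the committed behaviour.

from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire
from harvestingkit.bibrecord import create_record as create_bibrec, record_xml_output
from dojson.contrib.marc21.utils import create_record
from inspire_dojson.hep import hep


def cds_marcxml_to_hep(marcxml_string):
    """Convert a single CDS MARCXML record to a HEP JSON dict (sketch)."""
    cds_bibrec, ok, errs = create_bibrec(marcxml_string)
    if not ok:
        raise RuntimeError('Cannot parse record: {}'.format(errs))
    inspire_bibrec = CDS2Inspire(cds_bibrec).get_record()
    inspire_marcxml = record_xml_output(inspire_bibrec)
    return hep.do(create_record(inspire_marcxml))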
100 changes: 100 additions & 0 deletions hepcrawl/spiders/oaipmh_spider.py
@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016, 2017 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Generic spider for OAI-PMH servers."""

import logging
import sickle
from datetime import datetime

from sickle import Sickle
from sickle.models import Record
from sickle.oaiexceptions import NoRecordsMatch

from scrapy.http import Request
from scrapy.spiders import Spider

logger = logging.getLogger(__name__)

class OAIPMHSpider(Spider):
"""
Implements a spider for the OAI-PMH protocol by using the Python sickle library.
In case of successful harvest (OAI-PMH crawling) the spider will remember the initial starting
date and will use it as `from_date` argument on the next harvest.
"""
name = 'OAI-PMH'
state = {}

def __init__(self, url, metadata_prefix='marcxml', set=None, alias=None, from_date=None, until_date=None, granularity='YYYY-MM-DD', record_class=Record, *args, **kwargs):
super(OAIPMHSpider, self).__init__(*args, **kwargs)
self.url = url
self.metadata_prefix = metadata_prefix
self.set = set
self.granularity = granularity
self.alias = alias or self._make_alias()
self.from_date = from_date
logger.info("Current state:{}".format(self.state))
self.until_date = until_date
self.record_class = record_class

def start_requests(self):
self.from_date = self.from_date or self.state.get(self.alias)
logger.info("Current state 2:{}".format(self.state))
logger.info("Starting harvesting of {url} with set={set} and metadataPrefix={metadata_prefix}, from={from_date}, until={until_date}".format(
url=self.url,
set=self.set,
metadata_prefix=self.metadata_prefix,
from_date=self.from_date,
until_date=self.until_date
))
now = datetime.utcnow()
request = Request('oaipmh+{}'.format(self.url), self.parse)
yield request
self.state[self.alias] = self._format_date(now)
logger.info("Harvesting completed. Next harvesting will resume from {}".format(self.state[self.alias]))

def parse_record(self, record):
"""
        This method needs to be reimplemented in order to provide special parsing.
"""
return record.xml

def parse(self, response):
sickle = Sickle(self.url, class_mapping={
'ListRecords': self.record_class,
'GetRecord': self.record_class,
})
try:
records = sickle.ListRecords(**{
'metadataPrefix': self.metadata_prefix,
'set': self.set,
'from': self.from_date,
'until': self.until_date,
})
except NoRecordsMatch as err:
logger.warning(err)
raise StopIteration()
for record in records:
yield self.parse_record(record)

def _format_date(self, datetime_object):
if self.granularity == 'YYYY-MM-DD':
return datetime_object.strftime('%Y-%m-%d')
elif self.granularity == 'YYYY-MM-DDThh:mm:ssZ':
return datetime_object.strftime('%Y-%m-%dT%H:%M:%SZ')
else:
raise RuntimeError("Invalid granularity: %s" % self.granularity)

def _make_alias(self):
return '{url}-{metadata_prefix}-{set}'.format(
url=self.url,
metadata_prefix=self.metadata_prefix,
set=self.set
)
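
The resumption logic above keys the class-level state dict by an alias derived from the url, metadata prefix and set, and stores the harvest start time formatted according to granularity so it can be reused as from_date on the next run. A small illustration of the values involved, using the same arguments the CDS spider passes; the date is arbitrary.

from datetime import datetime

from hepcrawl.spiders.oaipmh_spider import OAIPMHSpider

spider = OAIPMHSpider(
    url='http://cds.cern.ch/oai2d',
    metadata_prefix='marcxml',
    set='forINSPIRE',
)

print(spider.alias)
# http://cds.cern.ch/oai2d-marcxml-forINSPIRE

# What gets stored in state[alias] after a successful harvest (default granularity):
print(spider._format_date(datetime(2018, 1, 16)))
# 2018-01-16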
1 change: 1 addition & 0 deletions setup.py
@@ -37,6 +37,7 @@
'python-dateutil>=2.4.2',
'python-scrapyd-api>=2.0.1',
'harvestingkit>=0.6.12',
'Sickle~=0.6,>=0.6.2',
]

tests_require = [
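
Sickle is the OAI-PMH client the new spider delegates to, hence the added install requirement. Roughly what OAIPMHSpider.parse() does is shown standalone below, using the CDS endpoint, set and date already appearing in this commit; the real spider also maps responses onto a configurable record_class, which this sketch skips.

from sickle import Sickle
from sickle.oaiexceptions import NoRecordsMatch

client = Sickle('http://cds.cern.ch/oai2d')
try:
    records = client.ListRecords(
        metadataPrefix='marcxml',
        set='forINSPIRE',
        **{'from': '2017-10-10'}  # 'from' is a Python keyword, hence the dict expansion
    )
    for record in records:
        print(record.header.identifier)
except NoRecordsMatch:
    print('Nothing to harvest in this window.')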