Skip to content

Commit

Permalink
cds: use the OAI-PMH spider to harvest CDS
Browse files Browse the repository at this point in the history
Also remove old tests. Fixes inspirehep#197.

Signed-off-by: Szymon Łopaciuk <[email protected]>
  • Loading branch information
kaplun authored and szymonlopaciuk committed Jan 26, 2018
1 parent 25a441b commit 4fb1ce6
Show file tree
Hide file tree
Showing 9 changed files with 601 additions and 562 deletions.
30 changes: 24 additions & 6 deletions docker-compose.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,15 @@ services:
arxiv-http-server.local:
condition: service_healthy

# Functional test runner for the CDS spider.
# Inherits its settings from the `service_base` anchor (defined outside this
# hunk) and runs only the tests under tests/functional/cds.
functional_cds:
<<: *service_base
command: py.test -vv tests/functional/cds
# Start only after both dependencies pass their healthchecks: scrapyd and the
# local nginx instance (cds-http-server.local) that serves the CDS fixtures.
depends_on:
scrapyd:
condition: service_healthy
cds-http-server.local:
condition: service_healthy

functional_pos:
<<: *service_base
command: py.test -vv tests/functional/pos
Expand Down Expand Up @@ -126,12 +135,6 @@ services:
- "CMD-SHELL"
- "curl https://localhost:443/"

functional_cds:
<<: *service_base
command: py.test -vv tests/functional/cds
links:
- scrapyd

arxiv-http-server.local:
image: nginx:stable-alpine
volumes:
Expand All @@ -147,6 +150,21 @@ services:
- "CMD-SHELL"
- "curl http://localhost:80/"

# Local nginx instance serving the CDS functional-test fixtures over HTTP.
cds-http-server.local:
image: nginx:stable-alpine
# proxy.conf becomes nginx's default site config; the fixture records
# directory is mounted as /etc/nginx/html/.
volumes:
- ${PWD}/tests/functional/cds/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
- ${PWD}/tests/functional/cds/fixtures/http_server/records:/etc/nginx/html/
# NOTE(review): this publishes host port 80; confirm no other service in this
# file publishes the same host port when run together.
ports:
- 80:80
# Healthy once `curl http://localhost:80/` succeeds inside the container;
# checked every 5s with a 5s timeout, up to 5 retries.
# NOTE(review): assumes curl is available in the nginx:stable-alpine image - confirm.
healthcheck:
timeout: 5s
interval: 5s
retries: 5
test:
- "CMD-SHELL"
- "curl http://localhost:80/"

rabbitmq:
image: rabbitmq
healthcheck:
Expand Down
75 changes: 26 additions & 49 deletions hepcrawl/spiders/cds_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,75 +9,52 @@

"""Spider for the CERN Document Server OAI-PMH interface"""

from dojson.contrib.marc21.utils import create_record
import logging
from flask.app import Flask
from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire
from harvestingkit.bibrecord import (
create_record as create_bibrec,
record_xml_output,
)
from inspire_dojson.hep import hep
from scrapy import Request
from scrapy.spider import XMLFeedSpider
from inspire_dojson import marcxml2record
from os.path import join as path_join

from . import StatefulSpider
from .common.oaipmh_spider import OAIPMHSpider
from ..utils import ParsedItem


class CDSSpider(StatefulSpider, XMLFeedSpider):
LOGGER = logging.getLogger(__name__)


class CDSSpider(OAIPMHSpider):
    """Spider for harvesting the CERN Document Server OAI-PMH interface.

    Example:
        Harvesting the ``forINSPIRE`` set from a given date::

            $ scrapy crawl CDS \\
                -a "sets=forINSPIRE" -a "from_date=2017-10-10"

    It uses `inspire-dojson <https://pypi.python.org/pypi/inspire-dojson>`_ to
    translate from CDS's MARCXML into the new INSPIRE Schema.
    """

    name = 'CDS'

    def __init__(self, *args, **kwargs):
        """Fill in the CDS defaults and delegate to the parent spider.

        ``url``, ``format`` and ``sets`` are only set when the caller did not
        pass them, so any of them can still be overridden (e.g. with
        ``-a "sets=..."`` on the command line). All positional and keyword
        arguments are forwarded unchanged to the parent constructor.
        """
        kwargs.setdefault('url', 'http://cds.cern.ch/oai2d')
        kwargs.setdefault('format', 'marcxml')
        kwargs.setdefault('sets', 'forINSPIRE')
        super(CDSSpider, self).__init__(*args, **kwargs)

    def get_record_identifier(self, record):
        """Extracts a unique identifier from a sickle record.

        Args:
            record: object exposing ``header.identifier``.

        Returns:
            The value of ``record.header.identifier``.
        """
        return record.header.identifier

    def parse_record(self, selector):
        """Convert one harvested MARCXML record into a ``hep`` parsed item.

        Args:
            selector: selector positioned on a harvested record; its
                namespaces are stripped in place before the ``record``
                element is extracted.

        Returns:
            ParsedItem: the converted record, with ``$schema`` set and
            ``record_format`` equal to ``'hep'``.
        """
        # Imported locally so the block is self-contained. ``posixpath.join``
        # always joins with '/', whereas ``os.path.join`` is platform
        # dependent and is not meant for building URLs.
        from posixpath import join as url_path_join

        selector.remove_namespaces()
        # NOTE(review): ``extract_first()`` yields None when no ``record``
        # element is present; confirm the caller never passes such selectors.
        record = selector.xpath('.//record').extract_first()

        # A throwaway Flask app supplies the application context under which
        # the conversion runs, configured from MARC_TO_HEP_SETTINGS.
        app = Flask('hepcrawl')
        app.config.update(
            self.settings.getdict('MARC_TO_HEP_SETTINGS', {})
        )
        with app.app_context():
            json_record = marcxml2record(record)

        base_uri = self.settings['SCHEMA_BASE_URI']
        json_record['$schema'] = url_path_join(base_uri, 'hep.json')
        return ParsedItem(record=json_record, record_format='hep')
2 changes: 1 addition & 1 deletion hepcrawl/spiders/common/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2017 CERN.
# Copyright (C) 2015, 2016, 2017, 2018 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
Expand Down
218 changes: 218 additions & 0 deletions tests/functional/cds/fixtures/cds_expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
[
{
"core": true,
"documents": [
{
"url": "http://cds.cern.ch/record/1200752/files/MQW7_018.pdf",
"source": "CDS",
"description": "Published version from PoS",
"key": "MQW7_018.pdf"
}
],
"curated": true,
"_collections": [
"Literature"
],
"inspire_categories": [
{
"source": "cds",
"term": "Astrophysics"
}
],
"titles": [
{
"source": "CDS",
"title": "High and very high energy gamma-ray emission from binaries"
}
],
"_private_notes": [
{
"source": "CDS",
"value": "CDS-1200752"
}
],
"authors": [
{
"affiliations": [
{
"value": "Grenoble Observ."
}
],
"full_name": "Dubus, G"
}
],
"publication_info": [
{
"journal_volume": "MQW7",
"page_start": "018",
"journal_title": "PoS",
"artid": "018",
"year": 2008
}
],
"$schema": "http://localhost/schemas/records/hep.json",
"document_type": [
"conference paper"
],
"citeable": true,
"imprints": [
{
"date": "2009"
}
],
"acquisition_source": {
"source": "CDS",
"method": "hepcrawl",
"submission_number": "None",
"datetime": "2017-12-14T08:10:03.875113"
}
},
{
"core": true,
"documents": [
{
"url": "http://cds.cern.ch/record/1200753/files/MQW7_019.pdf",
"source": "CDS",
"description": "Published version from PoS",
"key": "MQW7_019.pdf"
}
],
"curated": true,
"_collections": [
"Literature"
],
"collaborations": [
{
"value": "Fermi LAT"
}
],
"inspire_categories": [
{
"source": "cds",
"term": "Astrophysics"
}
],
"titles": [
{
"source": "CDS",
"title": "GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars"
},
{
"source": "CDS",
"title": "Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars"
}
],
"_private_notes": [
{
"source": "CDS",
"value": "CDS-1200753"
}
],
"authors": [
{
"affiliations": [
{
"value": "SLAC"
}
],
"full_name": "Dubois, R"
}
],
"publication_info": [
{
"journal_volume": "MQW7",
"page_start": "019",
"journal_title": "PoS",
"artid": "019",
"year": 2008
}
],
"$schema": "http://localhost/schemas/records/hep.json",
"document_type": [
"conference paper"
],
"citeable": true,
"imprints": [
{
"date": "2008"
}
],
"acquisition_source": {
"source": "CDS",
"method": "hepcrawl",
"submission_number": "None",
"datetime": "2017-12-14T08:10:03.951904"
}
},
{
"core": true,
"documents": [
{
"url": "http://cds.cern.ch/record/1200754/files/MQW7_020.pdf",
"source": "CDS",
"description": "Published version from PoS",
"key": "MQW7_020.pdf"
}
],
"curated": true,
"_collections": [
"Literature"
],
"inspire_categories": [
{
"source": "cds",
"term": "Astrophysics"
}
],
"titles": [
{
"source": "CDS",
"title": "Hadronic models of high-energy radiation from microquasars: recent developments"
}
],
"_private_notes": [
{
"source": "CDS",
"value": "CDS-1200754"
}
],
"authors": [
{
"affiliations": [
{
"value": "Villa Elisa, Inst. Argentino Radioastron."
},
{
"value": "La Plata U."
}
],
"full_name": "Romero, G E"
}
],
"publication_info": [
{
"journal_volume": "MQW7",
"page_start": "020",
"journal_title": "PoS",
"artid": "020",
"year": 2008
}
],
"$schema": "http://localhost/schemas/records/hep.json",
"document_type": [
"conference paper"
],
"citeable": true,
"imprints": [
{
"date": "2008"
}
],
"acquisition_source": {
"source": "CDS",
"method": "hepcrawl",
"submission_number": "None",
"datetime": "2017-12-14T08:10:03.984541"
}
}
]
Loading

0 comments on commit 4fb1ce6

Please sign in to comment.