CDS spider: drop HarvestingKit (inspirehep#199)

Harvests CDS through dojson directly: closes inspirehep#199.

Signed-off-by: Szymon Łopaciuk <[email protected]>
kaplun · Jan 16, 2018 · adb1906 · adb1906
1 parent 80efc44
commit adb1906
Show file tree

Hide file tree

Showing 2 changed files with 683 additions and 432 deletions.
diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py
@@ -10,13 +10,7 @@
 """Spider for the CERN Document Server OAI-PMH interface"""
 
 import logging
-from scrapy import Request
 from flask.app import Flask
-from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire
-from harvestingkit.bibrecord import (
-    create_record as create_bibrec,
-    record_xml_output,
-)
 from dojson.contrib.marc21.utils import create_record
 from inspire_dojson.hep import hep
 
@@ -34,10 +28,8 @@ class CDSSpider(OAIPMHSpider):
             $ scrapy crawl CDS \\
                 -a "oai_set=forINSPIRE" -a "from_date=2017-10-10"
 
-    It uses `HarvestingKit <https://pypi.python.org/pypi/HarvestingKit>`_ to
-    translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. It then
-    employs `inspire-dojson <https://pypi.python.org/pypi/inspire-dojson>`_ to
-    transform the legacy INSPIRE MARCXML into the new INSPIRE Schema.
+    It uses `inspire-dojson <https://pypi.python.org/pypi/inspire-dojson>`_ to
+    translate from CDS's MARCXML into the new INSPIRE Schema.
     """
 
     name = 'CDS'
@@ -57,23 +49,13 @@ def __init__(self,
 
     def parse_record(self, selector):
         selector.remove_namespaces()
-        try:
-            cds_bibrec, ok, errs = create_bibrec(selector.xpath('.//record').extract()[0])
-            if not ok:
-                raise RuntimeError("Cannot parse record %s: %s", selector, errs)
-            self.logger.info("Here's the record: %s" % cds_bibrec)
-            inspire_bibrec = CDS2Inspire(cds_bibrec).get_record()
-            marcxml_record = record_xml_output(inspire_bibrec)
-            record = create_record(marcxml_record)
-            app = Flask('hepcrawl')
-            app.config.update(
-                self.settings.getdict('MARC_TO_HEP_SETTINGS', {})
-            )
-            with app.app_context():
-                json_record = hep.do(record)
-                base_uri = self.settings['SCHEMA_BASE_URI']
-                json_record['$schema'] = base_uri + 'hep.json'
-            return ParsedItem(record=json_record, record_format='hep')
-        except Exception:
-            logger.exception("Error when parsing record")
-            return None
+        record = create_record(selector.xpath('.//record').extract()[0])
+        app = Flask('hepcrawl')
+        app.config.update(
+            self.settings.getdict('MARC_TO_HEP_SETTINGS', {})
+        )
+        with app.app_context():
+            json_record = hep.do(record)
+            base_uri = self.settings['SCHEMA_BASE_URI']
+            json_record['$schema'] = base_uri + 'hep.json'
+        return ParsedItem(record=json_record, record_format='hep')