Merge pull request inspirehep#214 from david-caro/avoid_timing_out
oai: retrieve all the records at once
david-caro authored Jan 23, 2018
2 parents 47ffa38 + e171dd2 commit 940f8d0
Showing 2 changed files with 31 additions and 12 deletions.
20 changes: 14 additions & 6 deletions hepcrawl/spiders/cds_spider.py
@@ -9,15 +9,16 @@

 """Spider for the CERN Document Server OAI-PMH interface"""

-from scrapy.spider import XMLFeedSpider
-from scrapy import Request
-from dojson.contrib.marc21.utils import create_record
+from flask.app import Flask
 from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire
 from harvestingkit.bibrecord import (
     create_record as create_bibrec,
     record_xml_output,
 )
+from dojson.contrib.marc21.utils import create_record
 from inspire_dojson.hep import hep
+from scrapy import Request
+from scrapy.spider import XMLFeedSpider

 from . import StatefulSpider
 from ..utils import ParsedItem
@@ -65,9 +66,16 @@ def parse_node(self, response, node):
         inspire_bibrec = CDS2Inspire(cds_bibrec).get_record()
         marcxml_record = record_xml_output(inspire_bibrec)
         record = create_record(marcxml_record)
-        json_record = hep.do(record)
-        base_uri = self.settings['SCHEMA_BASE_URI']
-        json_record['$schema'] = base_uri + 'hep.json'
+
+        app = Flask('hepcrawl')
+        app.config.update(
+            self.settings.getdict('MARC_TO_HEP_SETTINGS', {})
+        )
+        with app.app_context():
+            json_record = hep.do(record)
+            base_uri = self.settings['SCHEMA_BASE_URI']
+            json_record['$schema'] = base_uri + 'hep.json'
+
         parsed_item = ParsedItem(
             record=json_record,
             record_format='hep',
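
Note: the key change in this file is the Flask application context wrapped around hep.do. Some inspire_dojson conversion rules read Flask configuration at run time, so calling them outside an app context fails. A minimal standalone sketch of that pattern, with an illustrative config key (LEGACY_BASE_URL and the URL are placeholders, not values from hepcrawl's MARC_TO_HEP_SETTINGS):

    from flask import Flask, current_app

    app = Flask('hepcrawl')
    app.config.update({'LEGACY_BASE_URL': 'https://old.example.org'})  # placeholder key

    def rule_that_needs_config():
        # Without an active app context this raises
        # "RuntimeError: Working outside of application context".
        return current_app.config['LEGACY_BASE_URL']

    with app.app_context():
        print(rule_that_needs_config())  # works: config is reachable here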
23 changes: 17 additions & 6 deletions hepcrawl/spiders/common/oaipmh_spider.py
@@ -131,16 +131,27 @@ def parse_record(self, record):

     def parse(self, response):
         sickle = Sickle(self.url)
+        params = {
+            'metadataPrefix': self.format,
+            'set': response.meta['set'],
+            'from': response.meta['from_date'],
+            'until': self.until_date,
+        }
         try:
-            records = sickle.ListRecords(**{
-                'metadataPrefix': self.format,
-                'set': response.meta['set'],
-                'from': response.meta['from_date'],
-                'until': self.until_date,
-            })
+            records = sickle.ListRecords(**params)
         except NoRecordsMatch as err:
             LOGGER.warning(err)
             raise StopIteration()
+
+        # Avoid timing out the resumption token.
+        # TODO: implement a storage-based solution to be able to handle
+        # large amounts of records.
+        records = list(records)
+        LOGGER.info(
+            'Harvested %s records for params %s',
+            len(records),
+            params,
+        )
         for record in records:
             response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
             selector = Selector(response, type='xml')
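
Note: background on the eager list(records) above. Sickle's ListRecords returns a lazy iterator that only follows the OAI-PMH resumption token as records are consumed, so a slow consumer can let the token expire on the server. Draining the iterator up front fetches every page immediately. A sketch under assumed values (the endpoint URL, set name, and metadata prefix are placeholders):

    from sickle import Sickle
    from sickle.oaiexceptions import NoRecordsMatch

    sickle = Sickle('https://cds.cern.ch/oai2d')  # placeholder endpoint
    params = {
        'metadataPrefix': 'marcxml',  # placeholder format
        'set': 'forINSPIRE',          # placeholder set
        'from': '2018-01-01',         # 'from' is a Python keyword, hence the dict
    }
    try:
        # list() walks all resumption-token pages right away instead of lazily.
        records = list(sickle.ListRecords(**params))
    except NoRecordsMatch:
        records = []
    print('harvested %d records' % len(records))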
