diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 25af4fb3..2d9ee620 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -58,6 +58,15 @@ services: arxiv-http-server.local: condition: service_healthy + functional_cds: + <<: *service_base + command: py.test -vv tests/functional/cds + depends_on: + scrapyd: + condition: service_healthy + cds-http-server.local: + condition: service_healthy + functional_pos: <<: *service_base command: py.test -vv tests/functional/pos @@ -126,12 +135,6 @@ services: - "CMD-SHELL" - "curl https://localhost:443/" - functional_cds: - <<: *service_base - command: py.test -vv tests/functional/cds - links: - - scrapyd - arxiv-http-server.local: image: nginx:stable-alpine volumes: @@ -147,6 +150,21 @@ services: - "CMD-SHELL" - "curl http://localhost:80/" + cds-http-server.local: + image: nginx:stable-alpine + volumes: + - ${PWD}/tests/functional/cds/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf + - ${PWD}/tests/functional/cds/fixtures/http_server/records:/etc/nginx/html/ + ports: + - 80:80 + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD-SHELL" + - "curl http://localhost:80/" + rabbitmq: image: rabbitmq healthcheck: diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index 415a62f0..9ba997c7 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -9,75 +9,52 @@ """Spider for the CERN Document Server OAI-PMH interface""" -from dojson.contrib.marc21.utils import create_record +import logging from flask.app import Flask -from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire -from harvestingkit.bibrecord import ( - create_record as create_bibrec, - record_xml_output, -) -from inspire_dojson.hep import hep -from scrapy import Request -from scrapy.spider import XMLFeedSpider +from inspire_dojson import marcxml2record +from os.path import join as path_join -from . import StatefulSpider +from .common.oaipmh_spider import OAIPMHSpider from ..utils import ParsedItem -class CDSSpider(StatefulSpider, XMLFeedSpider): +LOGGER = logging.getLogger(__name__) + + +class CDSSpider(OAIPMHSpider): """Spider for crawling the CERN Document Server OAI-PMH XML files. Example: Using OAI-PMH XML files:: - $ scrapy crawl \\ - cds \\ - -a "source_file=file://$PWD/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml" + $ scrapy crawl CDS \\ + -a "sets=forINSPIRE" -a "from_date=2017-10-10" - It uses `HarvestingKit `_ to - translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. It then - employs `inspire-dojson `_ to - transform the legacy INSPIRE MARCXML into the new INSPIRE Schema. + It uses `inspire-dojson `_ to + translate from CDS's MARCXML into the new INSPIRE Schema. """ name = 'CDS' - iterator = 'xml' - itertag = 'OAI-PMH:record' - namespaces = [ - ('OAI-PMH', 'http://www.openarchives.org/OAI/2.0/'), - ('marc', 'http://www.loc.gov/MARC21/slim'), - ] - def __init__(self, source_file=None, **kwargs): - super(CDSSpider, self).__init__(**kwargs) - self.source_file = source_file + def __init__(self, *args, **kwargs): + kwargs.setdefault('url', 'http://cds.cern.ch/oai2d') + kwargs.setdefault('format', 'marcxml') + kwargs.setdefault('sets', 'forINSPIRE') + super(CDSSpider, self).__init__(*args, **kwargs) - def start_requests(self): - yield Request(self.source_file) - - def parse_node(self, response, node): - node.remove_namespaces() - cds_bibrec, ok, errs = create_bibrec( - node.xpath('.//record').extract()[0] - ) - if not ok: - raise RuntimeError("Cannot parse record %s: %s", node, errs) - self.logger.info("Here's the record: %s" % cds_bibrec) - inspire_bibrec = CDS2Inspire(cds_bibrec).get_record() - marcxml_record = record_xml_output(inspire_bibrec) - record = create_record(marcxml_record) + def get_record_identifier(self, record): + """Extracts a unique identifier from a sickle record.""" + return record.header.identifier + def parse_record(self, selector): + selector.remove_namespaces() + record = selector.xpath('.//record').extract_first() app = Flask('hepcrawl') app.config.update( self.settings.getdict('MARC_TO_HEP_SETTINGS', {}) ) with app.app_context(): - json_record = hep.do(record) + json_record = marcxml2record(record) base_uri = self.settings['SCHEMA_BASE_URI'] - json_record['$schema'] = base_uri + 'hep.json' - - parsed_item = ParsedItem( - record=json_record, - record_format='hep', - ) - return parsed_item + json_record['$schema'] = path_join(base_uri, 'hep.json') + return ParsedItem(record=json_record, record_format='hep') diff --git a/hepcrawl/spiders/common/__init__.py b/hepcrawl/spiders/common/__init__.py index 69c31530..4de1e2b0 100644 --- a/hepcrawl/spiders/common/__init__.py +++ b/hepcrawl/spiders/common/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of hepcrawl. -# Copyright (C) 2017 CERN. +# Copyright (C) 2015, 2016, 2017, 2018 CERN. # # hepcrawl is a free software; you can redistribute it and/or modify it # under the terms of the Revised BSD License; see LICENSE file for diff --git a/tests/functional/cds/fixtures/cds_expected.json b/tests/functional/cds/fixtures/cds_expected.json new file mode 100644 index 00000000..51d219e1 --- /dev/null +++ b/tests/functional/cds/fixtures/cds_expected.json @@ -0,0 +1,218 @@ +[ + { + "core": true, + "documents": [ + { + "url": "http://cds.cern.ch/record/1200752/files/MQW7_018.pdf", + "source": "CDS", + "description": "Published version from PoS", + "key": "MQW7_018.pdf" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "inspire_categories": [ + { + "source": "cds", + "term": "Astrophysics" + } + ], + "titles": [ + { + "source": "CDS", + "title": "High and very high energy gamma-ray emission from binaries" + } + ], + "_private_notes": [ + { + "source": "CDS", + "value": "CDS-1200752" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Grenoble Observ." + } + ], + "full_name": "Dubus, G" + } + ], + "publication_info": [ + { + "journal_volume": "MQW7", + "page_start": "018", + "journal_title": "PoS", + "artid": "018", + "year": 2008 + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true, + "imprints": [ + { + "date": "2009" + } + ], + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:03.875113" + } + }, + { + "core": true, + "documents": [ + { + "url": "http://cds.cern.ch/record/1200753/files/MQW7_019.pdf", + "source": "CDS", + "description": "Published version from PoS", + "key": "MQW7_019.pdf" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "collaborations": [ + { + "value": "Fermi LAT" + } + ], + "inspire_categories": [ + { + "source": "cds", + "term": "Astrophysics" + } + ], + "titles": [ + { + "source": "CDS", + "title": "GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars" + }, + { + "source": "CDS", + "title": "Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars" + } + ], + "_private_notes": [ + { + "source": "CDS", + "value": "CDS-1200753" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "SLAC" + } + ], + "full_name": "Dubois, R" + } + ], + "publication_info": [ + { + "journal_volume": "MQW7", + "page_start": "019", + "journal_title": "PoS", + "artid": "019", + "year": 2008 + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true, + "imprints": [ + { + "date": "2008" + } + ], + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:03.951904" + } + }, + { + "core": true, + "documents": [ + { + "url": "http://cds.cern.ch/record/1200754/files/MQW7_020.pdf", + "source": "CDS", + "description": "Published version from PoS", + "key": "MQW7_020.pdf" + } + ], + "curated": true, + "_collections": [ + "Literature" + ], + "inspire_categories": [ + { + "source": "cds", + "term": "Astrophysics" + } + ], + "titles": [ + { + "source": "CDS", + "title": "Hadronic models of high-energy radiation from microquasars: recent developments" + } + ], + "_private_notes": [ + { + "source": "CDS", + "value": "CDS-1200754" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Villa Elisa, Inst. Argentino Radioastron." + }, + { + "value": "La Plata U." + } + ], + "full_name": "Romero, G E" + } + ], + "publication_info": [ + { + "journal_volume": "MQW7", + "page_start": "020", + "journal_title": "PoS", + "artid": "020", + "year": 2008 + } + ], + "$schema": "http://localhost/schemas/records/hep.json", + "document_type": [ + "conference paper" + ], + "citeable": true, + "imprints": [ + { + "date": "2008" + } + ], + "acquisition_source": { + "source": "CDS", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2017-12-14T08:10:03.984541" + } + } +] diff --git a/tests/functional/cds/fixtures/cds_smoke_records_expected.json b/tests/functional/cds/fixtures/cds_smoke_records_expected.json deleted file mode 100644 index f6f6a7f8..00000000 --- a/tests/functional/cds/fixtures/cds_smoke_records_expected.json +++ /dev/null @@ -1,153 +0,0 @@ -[ - { - "$schema": "http://localhost/schemas/records/hep.json", - "_collections": [ - "Literature" - ], - "accelerator_experiments": [ - { - "legacy_name": "CERN-SPS---" - } - ], - "acquisition_source": { - "datetime": "2017-10-04T14:07:59.746165", - "method": "hepcrawl", - "source": "CDS", - "submission_number": "None" - }, - "core": true, - "curated": true, - "corporate_author": [ - "European Organization for Nuclear Research" - ], - "documents": [ - { - "url": "http://cds.cern.ch/record/21099/files/CM-P00077286-e.pdf", - "key": "document" - }, - { - "url": "http://cds.cern.ch/record/21099/files/CM-P00078235-f.pdf", - "key": "1_document" - } - ], - "document_type": [ - "article" - ], - "external_system_identifiers": [ - { - "schema": "Inspire", - "value": "1614043" - }, - { - "schema": "ADMADM", - "value": "0003711" - }, - { - "schema": "CDS", - "value": "21099" - } - ], - "inspire_categories": [ - { - "term": "Accelerators" - } - ], - "languages": [ - "fr" - ], - "preprint_date": "1967-05-30", - "report_numbers": [ - { - "value": "CERN/0702" - }, - { - "value": "CM-P00077286-e" - }, - { - "value": "CM-P00078235-f" - } - ], - "titles": [ - { - "title": "Addendum to the Report on the Design Study of a 300 GeV Proton Synchrotron (CERN/563) (AR/Int. SG/64-15)" - }, - { - "title": "Suppl\u00e9ment au Rapport sur le projet du synchrotron \u00e0 prontons de 300 GeV (CERN/563) (Ar/Int. SG/64-15)" - } - ] - }, - { - "$schema": "http://localhost/schemas/records/hep.json", - "_collections": [ - "Literature" - ], - "accelerator_experiments": [ - { - "legacy_name": "CERN-LEP---" - } - ], - "acquisition_source": { - "datetime": "2017-10-04T14:07:59.783028", - "method": "hepcrawl", - "source": "CDS", - "submission_number": "None" - }, - "core": true, - "curated": true, - "documents": [ - { - "url": "http://cds.cern.ch/record/60936/files/CM-P00098683-f.pdf", - "key": "document" - }, - { - "url": "http://cds.cern.ch/record/60936/files/CERN-SPC-426.pdf", - "key": "1_document" - } - ], - "document_type": [ - "article" - ], - "external_system_identifiers": [ - { - "schema": "ADMADM", - "value": "0009846" - }, - { - "schema": "Inspire", - "value": "1614044" - }, - { - "schema": "CDS", - "value": "60936" - } - ], - "inspire_categories": [ - { - "term": "Accelerators" - } - ], - "languages": [ - "fr" - ], - "preprint_date": "1978-10-06", - "report_numbers": [ - { - "value": "CERN/SPC/0426" - }, - { - "value": "CM-P00095369-e" - }, - { - "value": "CM-P00098683-f" - } - ], - "titles": [ - { - "title": "LEP Studies 1979 to 1981" - }, - { - "title": "Les Etudes sur le LEP de 1979 -1981" - } - ] - } -] diff --git a/tests/functional/cds/fixtures/http_server/conf/proxy.conf b/tests/functional/cds/fixtures/http_server/conf/proxy.conf new file mode 100644 index 00000000..68d70722 --- /dev/null +++ b/tests/functional/cds/fixtures/http_server/conf/proxy.conf @@ -0,0 +1,12 @@ +server { + listen 80; + server_name localhost; + charset_types text/xml; + charset UTF-8; + + location /oai2d { + if ($args ~ from=2017-11-15&verb=ListRecords&set=forINSPIRE&metadataPrefix=marcxml) { + rewrite ^.*$ /cds.xml permanent; + } + } +} diff --git a/tests/functional/cds/fixtures/http_server/records/cds.xml b/tests/functional/cds/fixtures/http_server/records/cds.xml new file mode 100644 index 00000000..c23aee04 --- /dev/null +++ b/tests/functional/cds/fixtures/http_server/records/cds.xml @@ -0,0 +1,285 @@ + + + +2017-12-07T15:05:26Zhttp://cds.cern.ch/oai2d +
oai:cds.cern.ch:12007522017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1200752 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200752 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509577 + + + eng + + + Dubus, G + Grenoble Observ. + + + High and very high energy gamma-ray emission from binaries + + + 2009 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + 018 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200752/files/MQW7_018.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 018 + izmir20080901 + + + PUBLIC + + + 002842486CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12007532017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1200753 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200753 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509578 + + + eng + + + Dubois, R + SLAC + + + GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars + + + Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars + Other title + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + No authors + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + for the Fermi LAT Collaboration + + + 019 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200753/files/MQW7_019.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 019 + izmir20080901 + + + PUBLIC + + + 002842487CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
oai:cds.cern.ch:12007542017-11-16T08:09:30Zcerncds:FULLTEXTforINSPIRE
+ 00000coc 2200000uu 4500 + 1200754 + SzGeCERN + 20171116090930.0 + + oai:cds.cern.ch:1200754 + cerncds:FULLTEXT + forINSPIRE + + + Inspire + 1509579 + + + eng + + + Romero, G E + Villa Elisa, Inst. Argentino Radioastron. + La Plata U. + + + Hadronic models of high-energy radiation from microquasars: recent developments + + + 2008 + + + Open Access + CC-BY-NC-SA-3.0 + http://creativecommons.org/licenses/by-nc-sa/3.0/ + + + SIS POS MQW7-2009 + + + Inspire + + + SzGeCERN + Astrophysics and Astronomy + + + ARTICLE + + + 020 + PoS + MQW7 + 2008 + + + http://cds.cern.ch/record/1200754/files/MQW7_020.pdf + Published version from PoS + + + n + 200933 + + + 13 + + + 20110201 + 1448 + CER01 + 20090817 + + + 1129423 + 020 + izmir20080901 + + + PUBLIC + + + 002842488CER + + + ARTICLE + + + ConferencePaper + + + Hidden + + +
+
+ diff --git a/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml b/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml deleted file mode 100644 index b3c521f6..00000000 --- a/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml +++ /dev/null @@ -1,246 +0,0 @@ - - - - 2017-10-04T07:25:58Z - http://cds.cern.ch/oai2d - - -
- oai:cds.cern.ch:21099 - 2017-07-27T21:14:27Z - cerncds:FULLTEXT - forINSPIRE -
- - - 00000coc 2200000uu 4500 - 21099 - SzGeCERN - 20170803223648.0 - - oai:cds.cern.ch:21099 - cerncds:FULLTEXT - forINSPIRE - - - ADMADM - 0003711 - - - Inspire - 1614043 - - - eng - - - fre - - - COUNCIL-0702 - - - CERN/0702 - - - CM-P00077286-e - - - CM-P00078235-f - - - European Organization for Nuclear Research - - - Addendum to the Report on the Design Study of a 300 GeV Proton Synchrotron (CERN/563) (AR/Int. SG/64-15) - - - Supplément au Rapport sur le projet du synchrotron à prontons de 300 GeV (CERN/563) (Ar/Int. SG/64-15) - Titre français - - - 1967 - - - 1967-05-30 - - - SISARC-2009 - - - CLAS1 - - - Inspire - - - SzGeCERN - Accelerators and Storage Rings - - - Design Report - - - CERN - - - CERN SPS - - - http://cds.cern.ch/record/21099/files/CM-P00077286-e.pdf - English - - - http://cds.cern.ch/record/21099/files/CM-P00078235-f.pdf - French - - - n - 200319 - - - 60 - - - 20031203 - 0855 - MAN01 - 19990126 - - - PUBLIC - - - 000003711MAN - - - COUNCIL - - - -
- -
- oai:cds.cern.ch:60936 - 2017-07-27T21:14:28Z - cerncds:FULLTEXT - forINSPIRE -
- - - 00000coc 2200000uu 4500 - 60936 - SzGeCERN - 20170803223648.0 - - oai:cds.cern.ch:60936 - cerncds:FULLTEXT - forINSPIRE - - - ADMADM - 0009846 - - - Inspire - 1614044 - - - eng - - - fre - - - CERN/SPC/0426 - - - CM-P00095369-e - - - CM-P00098683-f - - - 19781023 - 104th Meeting of Scientific Policy Committee - CERN, Geneva, Switzerland - 23 - 24 Oct 1978 - 1978 - cern19781023 - 104 - CH - 19781024 - - - LEP Studies 1979 to 1981 - - - Les Etudes sur le LEP de 1979 -1981 - Titre français - - - 1978 - - - 1978-10-06 - - - 78/140/5 - - - SISARC-2009 - - - CLAS1 - - - Inspire - - - SzGeCERN - Accelerators and Storage Rings - - - Design Report - - - CERN - - - CERN LEP - - - http://cds.cern.ch/record/60936/files/CM-P00098683-f.pdf - French - - - http://cds.cern.ch/record/60936/files/CERN-SPC-426.pdf - English - - - n - 200319 - - - 62 - - - 20031203 - 0901 - MAN01 - 19990126 - - - PUBLIC - - - 000009846MAN - - - SPC - - - -
-
-
diff --git a/tests/functional/cds/test_cds.py b/tests/functional/cds/test_cds.py index 93c60ce3..ffe9f8ab 100644 --- a/tests/functional/cds/test_cds.py +++ b/tests/functional/cds/test_cds.py @@ -7,20 +7,29 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. -"""Functional tests for ArXiv spider""" +"""Functional tests for CDS spider""" from __future__ import absolute_import, division, print_function +import os import pytest -from hepcrawl.testlib.tasks import app as celery_app from hepcrawl.testlib.celery_monitor import CeleryMonitor -from hepcrawl.testlib.utils import get_crawler_instance, deep_sort from hepcrawl.testlib.fixtures import ( - get_test_suite_path, expected_json_results_from_file, clean_dir, ) +from hepcrawl.testlib.tasks import app as celery_app +from hepcrawl.testlib.utils import get_crawler_instance, deep_sort + + +@pytest.fixture(scope='function', autouse=True) +def cleanup(): + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + yield + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) def override_generated_fields(record): @@ -32,44 +41,39 @@ def override_generated_fields(record): return record -@pytest.fixture(scope="function") -def set_up_local_environment(): - package_location = get_test_suite_path( - 'cds', - 'fixtures', - 'oai_harvested', - 'cds_smoke_records.xml', - test_suite='functional', - ) - - yield { +def get_configuration(): + return { 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', 'CRAWLER_ARGUMENTS': { - 'source_file': 'file://' + package_location, + 'from_date': '2017-11-15', + 'sets': 'forINSPIRE', + 'url': 'http://cds-http-server.local/oai2d', } } - clean_dir() - @pytest.mark.parametrize( - 'expected_results', + 'expected_results, config', [ - expected_json_results_from_file( - 'cds', - 'fixtures', - 'cds_smoke_records_expected.json', + ( + expected_json_results_from_file( + 'cds', + 'fixtures', + 'cds_expected.json', + ), + get_configuration(), ), ], ids=[ 'smoke', ] ) -def test_cds(set_up_local_environment, expected_results): - crawler = get_crawler_instance( - set_up_local_environment.get('CRAWLER_HOST_URL') - ) +def test_cds( + expected_results, + config, +): + crawler = get_crawler_instance(config['CRAWLER_HOST_URL']) results = CeleryMonitor.do_crawl( app=celery_app, @@ -77,23 +81,10 @@ def test_cds(set_up_local_environment, expected_results): monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), + project=config['CRAWLER_PROJECT'], spider='CDS', settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) - - results = deep_sort( - sorted( - results, - key=lambda result: result['titles'][0]['title'], - ) - ) - expected_results = deep_sort( - sorted( - expected_results, - key=lambda result: result['titles'][0]['title'], - ) + **config['CRAWLER_ARGUMENTS'] ) gotten_results = [override_generated_fields(result) for result in results] @@ -101,70 +92,7 @@ def test_cds(set_up_local_environment, expected_results): override_generated_fields(expected) for expected in expected_results ] - assert gotten_results == expected_results - - -@pytest.mark.parametrize( - 'expected_results', - [ - expected_json_results_from_file( - 'cds', - 'fixtures', - 'cds_smoke_records_expected.json', - ), - ], - ids=[ - 'crawl_twice', - ] -) -def test_cds_crawl_twice(set_up_local_environment, expected_results): - crawler = get_crawler_instance( - set_up_local_environment.get('CRAWLER_HOST_URL') - ) - - results = CeleryMonitor.do_crawl( - app=celery_app, - monitor_timeout=5, - monitor_iter_limit=20, - events_limit=1, - crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), - spider='CDS', - settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) - - results = deep_sort( - sorted( - results, - key=lambda result: result['titles'][0]['title'], - ) - ) - expected_results = deep_sort( - sorted( - expected_results, - key=lambda result: result['titles'][0]['title'], - ) - ) - - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [ - override_generated_fields(expected) for expected in expected_results - ] + gotten_results = deep_sort(gotten_results) + expected_results = deep_sort(expected_results) assert gotten_results == expected_results - - results = CeleryMonitor.do_crawl( - app=celery_app, - monitor_timeout=5, - monitor_iter_limit=20, - crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), - spider='CDS', - settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') - ) - - gotten_results = [override_generated_fields(result) for result in results] - - assert gotten_results == []