From 8978e8307d919d766bee39c925d5a40089231110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Fri, 12 Jan 2018 14:34:55 +0100 Subject: [PATCH] IOP Spider: improve and add tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds test records from IOP and fixes some simple issues with the IOP spider, to make the tests pass. Introduces functional tests for the IOP spider. Signed-off-by: Szymon Łopaciuk --- .travis.yml | 1 + docker-compose.test.yml | 7 + hepcrawl/extractors/nlm.py | 6 +- hepcrawl/spiders/iop_spider.py | 1 + hepcrawl/tohep.py | 2 +- tests/functional/iop/fixtures/test_iop.json | 175 ++++++++++++++++++ tests/functional/iop/fixtures/test_iop.xml | 106 +++++++++++ tests/functional/iop/test_iop.py | 107 +++++++++++ .../pdf/{test_143_3_336.pdf => 143_3_336.pdf} | Bin .../unit/responses/iop/xml/test_standard.xml | 2 +- tests/unit/test_iop.py | 8 +- 11 files changed, 407 insertions(+), 8 deletions(-) create mode 100644 tests/functional/iop/fixtures/test_iop.json create mode 100644 tests/functional/iop/fixtures/test_iop.xml create mode 100644 tests/functional/iop/test_iop.py rename tests/unit/responses/iop/pdf/{test_143_3_336.pdf => 143_3_336.pdf} (100%) diff --git a/.travis.yml b/.travis.yml index 6bc66b84..25210032 100644 --- a/.travis.yml +++ b/.travis.yml @@ -31,6 +31,7 @@ env: - SUITE=functional_desy - SUITE=functional_cds - SUITE=functional_pos + - SUITE=functional_iop matrix: fast_finish: true diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 9a1df2e0..e9ce1e45 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -70,6 +70,13 @@ services: http-server.local: condition: service_healthy + functional_iop: + <<: *service_base + command: py.test -vv tests/functional/iop + depends_on: + scrapyd: + condition: service_healthy + unit: <<: *service_base command: bash -c "py.test tests/unit -vv && make -C docs clean && make -C docs html && python setup.py sdist 
&& ls dist/*" diff --git a/hepcrawl/extractors/nlm.py b/hepcrawl/extractors/nlm.py index 4a193cf0..dddc1e13 100644 --- a/hepcrawl/extractors/nlm.py +++ b/hepcrawl/extractors/nlm.py @@ -146,10 +146,10 @@ def get_page_numbers(node): fpage = node.xpath(".//FirstPage/text()").extract_first() lpage = node.xpath(".//LastPage/text()").extract_first() - if fpage and lpage: + try: page_nr = str(int(lpage) - int(fpage) + 1) - else: - page_nr = '' + except (ValueError, TypeError): + page_nr = None return ( fpage, diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py index fbca3ae5..d5c918df 100644 --- a/hepcrawl/spiders/iop_spider.py +++ b/hepcrawl/spiders/iop_spider.py @@ -159,6 +159,7 @@ def add_document(self, file_path, hidden, fulltext): "fulltext": fulltext, "description": self.name.upper(), "url": file_path, + "key": os.path.basename(file_path), } return file_dict diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index a4529d7c..a10f49ba 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -243,7 +243,7 @@ def _filter_affiliation(affiliations): for author in crawler_record.get('authors', []): builder.add_author(builder.make_author( full_name=author['full_name'], - affiliations=_filter_affiliation(author['affiliations']), + affiliations=_filter_affiliation(author.get('affiliations', [])), )) for title in crawler_record.get('titles', []): diff --git a/tests/functional/iop/fixtures/test_iop.json b/tests/functional/iop/fixtures/test_iop.json new file mode 100644 index 00000000..1b620dd2 --- /dev/null +++ b/tests/functional/iop/fixtures/test_iop.json @@ -0,0 +1,175 @@ +[ + { + "refereed": true, + "documents": [ + { + "fulltext": true, + "description": "IOP", + "url": "file:///code/tests/functional/iop/fixtures/test_iop.xml", + "source": "iop", + "key": "test_iop.xml", + "hidden": true + } + ], + "curated": false, + "_collections": [ + "Literature" + ], + "dois": [ + { + "source": "iop", + "value": "10.1088/1742-6596/851/1/012001" + } + ], + 
"acquisition_source": { + "source": "iop", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2018-01-12T13:06:39.088639" + }, + "titles": [ + { + "source": "iop", + "title": "Response of optically stimulated luminescence dosimeters subjected to X-rays in diagnostic energy range" + } + ], + "copyright": [ + { + "holder": "Institute of Physics" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Department of Physics, Universiti Teknologi Malaysia, Johor, Bahru Johor, 81300, Malaysia. Centre for Energy Research and Training, Ahmadu Bello University, Zaria, Kaduna State, P.M.B. 1014, Nigeria." + } + ], + "full_name": "Musa, Y." + }, + { + "full_name": "Hashim, S." + }, + { + "full_name": "Karim, M. K A" + }, + { + "full_name": "Bakar, K.A." + }, + { + "full_name": "Ang, W.C." + }, + { + "full_name": "Salehhon, N." + } + ], + "publication_info": [ + { + "journal_volume": "851", + "page_start": "012001", + "year": 2017, + "journal_issue": "1", + "journal_title": "J. Phys.: Conf. Ser." + } + ], + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "iop", + "value": "The use of optically stimulated luminescence (OSL) for dosimetry applications has recently increased considerably due to availability of commercial OSL dosimeters (nanoDots) for clinical use. The OSL dosimeter has a great potential to be used in clinical dosimetry because of its prevailing advantages in both handling and application. However, utilising nanoDot OSLDs for dose measurement in diagnostic radiology can only be guaranteed when the performance and characteristics of the dosimeters are apposite. In the present work, we examined the response of commercially available nanoDot OSLD (Al$_{2}$O$_{3}$:C) subjected to X-rays in general radiography. The nanoDots response with respect to reproducibility, dose linearity and signal depletion were analysed using microStar reader (Landauer, Inc., Glenwood, IL). 
Irradiations were performed free-in-air using 70, 80 and 120 kV tube voltages and tube currents ranging from 10 \u2013 100 mAs. The results showed that the nanoDots exhibit good linearity and reproducibility when subjected to diagnostic X-rays, with coefficient of variations (CV) ranging between 2.3% to 3.5% representing a good reproducibility. The results also indicated average of 1% signal reduction per readout. Hence, the nanoDots showed a promising potential for dose measurement in general X-ray procedure." + } + ], + "imprints": [ + { + "date": "2017" + } + ], + "citeable": true + }, + { + "refereed": true, + "documents": [ + { + "fulltext": true, + "description": "IOP", + "url": "file:///code/tests/functional/iop/fixtures/test_iop.xml", + "source": "iop", + "key": "test_iop.xml", + "hidden": true + } + ], + "curated": false, + "_collections": [ + "Literature" + ], + "dois": [ + { + "source": "iop", + "value": "10.1088/1361-6560/aa6be8" + } + ], + "acquisition_source": { + "source": "iop", + "method": "hepcrawl", + "submission_number": "None", + "datetime": "2018-01-12T13:06:39.192671" + }, + "titles": [ + { + "source": "iop", + "title": "Magnetic resonance imaging with hyperpolarized agents: methods and applications" + } + ], + "copyright": [ + { + "holder": "Institute of Physics" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "Department of Medical Physics, University of Wisconsin\u2013Madison, Madison, WI, United States of America." + } + ], + "full_name": "Adamson, Erin B." + }, + { + "full_name": "Ludwig, Kai D." + }, + { + "full_name": "Mummy, David G." + }, + { + "full_name": "Fain, Sean B." + } + ], + "publication_info": [ + { + "page_end": "R123", + "journal_title": "Phys. Med. 
Biol.", + "journal_volume": "62", + "year": 2017, + "page_start": "R81", + "journal_issue": "13" + } + ], + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "iop", + "value": "In the past decade, hyperpolarized (HP) contrast agents have been under active development for MRI applications to address the twin challenges of functional and quantitative imaging. Both HP helium ($^{3}$He) and xenon ($^{129}$Xe) gases have reached the stage where they are under study in clinical research. HP $^{129}$Xe, in particular, is poised for larger scale clinical research to investigate asthma, chronic obstructive pulmonary disease, and fibrotic lung diseases. With advances in polarizer technology and unique capabilities for imaging of $^{129}$Xe gas exchange into lung tissue and blood, HP $^{129}$Xe MRI is attracting new attention. In parallel, HP $^{13}$C and $^{15}$N MRI methods have steadily advanced in a wide range of pre-clinical research applications for imaging metabolism in various cancers and cardiac disease. The HP [1-$^{13}$C] pyruvate MRI technique, in particular, has undergone phase I trials in prostate cancer and is poised for investigational new drug trials at multiple institutions in cancer and cardiac applications. This review treats the methodology behind both HP gases and HP $^{13}$C and $^{15}$N liquid state agents. Gas and liquid phase HP agents share similar technologies for achieving non-equilibrium polarization outside the field of the MRI scanner, strategies for image data acquisition, and translational challenges in moving from pre-clinical to clinical research. 
To cover the wide array of methods and applications, this review is organized by numerical section into (1) a brief introduction, (2) the physical and biological properties of the most common polarized agents with a brief summary of applications and methods of polarization, (3) methods for image acquisition and reconstruction specific to improving data acquisition efficiency for HP MRI, (4) the main physical properties that enable unique measures of physiology or metabolic pathways, followed by a more detailed review of the literature describing the use of HP agents to study: (5) metabolic pathways in cancer and cardiac disease and (6) lung function in both pre-clinical and clinical research studies, concluding with (7) some future directions and challenges, and (8) an overall summary." + } + ], + "imprints": [ + { + "date": "2017" + } + ], + "citeable": true + } +] diff --git a/tests/functional/iop/fixtures/test_iop.xml b/tests/functional/iop/fixtures/test_iop.xml new file mode 100644 index 00000000..bd99214d --- /dev/null +++ b/tests/functional/iop/fixtures/test_iop.xml @@ -0,0 +1,106 @@ + + + +
+ + Institute of Physics + J. Phys.: Conf. Ser. + 1742-6588 + 851 + 1 + + 2017 + May + 31 + + + Response of optically stimulated luminescence dosimeters subjected to X-rays in diagnostic energy range + 012001 + + EN + + + Y + Musa + Department of Physics, Universiti Teknologi Malaysia, Johor, Bahru Johor, 81300, Malaysia. +Centre for Energy Research and Training, Ahmadu Bello University, Zaria, Kaduna State, P.M.B. 1014, Nigeria. + + + S + Hashim + + + M + K A + Karim + + + K + A + Bakar + + + W + C + Ang + + + N + Salehhon + + + + 10.1088/1742-6596/851/1/012001 + + +The use of optically stimulated luminescence (OSL) for dosimetry applications has recently increased considerably due to availability of commercial OSL dosimeters (nanoDots) for clinical use. The OSL dosimeter has a great potential to be used in clinical dosimetry because of its prevailing advantages in both handling and application. However, utilising nanoDot OSLDs for dose measurement in diagnostic radiology can only be guaranteed when the performance and characteristics of the dosimeters are apposite. In the present work, we examined the response of commercially available nanoDot OSLD (Al2O3:C) subjected to X-rays in general radiography. The nanoDots response with respect to reproducibility, dose linearity and signal depletion were analysed using microStar reader (Landauer, Inc., Glenwood, IL). Irradiations were performed free-in-air using 70, 80 and 120 kV tube voltages and tube currents ranging from 10 – 100 mAs. The results showed that the nanoDots exhibit good linearity and reproducibility when subjected to diagnostic X-rays, with coefficient of variations (CV) ranging between 2.3% to 3.5% representing a good reproducibility. The results also indicated average of 1% signal reduction per readout. Hence, the nanoDots showed a promising potential for dose measurement in general X-ray procedure. + +
+
+ + Institute of Physics + Phys. Med. Biol. + 0031-9155 + 62 + 13 + + 2017 + May + 31 + + + Magnetic resonance imaging with hyperpolarized agents: methods and applications + R81 + R123 + EN + + + Erin + B + Adamson + Department of Medical Physics, University of Wisconsin–Madison, Madison, WI, United States of America. + + + Kai + D + Ludwig + + + David + G + Mummy + + + Sean + B + Fain + + + + 10.1088/1361-6560/aa6be8 + + +In the past decade, hyperpolarized (HP) contrast agents have been under active development for MRI applications to address the twin challenges of functional and quantitative imaging. Both HP helium (3He) and xenon (129Xe) gases have reached the stage where they are under study in clinical research. HP 129Xe, in particular, is poised for larger scale clinical research to investigate asthma, chronic obstructive pulmonary disease, and fibrotic lung diseases. With advances in polarizer technology and unique capabilities for imaging of 129Xe gas exchange into lung tissue and blood, HP 129Xe MRI is attracting new attention. In parallel, HP 13C and 15N MRI methods have steadily advanced in a wide range of pre-clinical research applications for imaging metabolism in various cancers and cardiac disease. The HP [1-13C] pyruvate MRI technique, in particular, has undergone phase I trials in prostate cancer and is poised for investigational new drug trials at multiple institutions in cancer and cardiac applications. This review treats the methodology behind both HP gases and HP 13C and 15N liquid state agents. Gas and liquid phase HP agents share similar technologies for achieving non-equilibrium polarization outside the field of the MRI scanner, strategies for image data acquisition, and translational challenges in moving from pre-clinical to clinical research. 
To cover the wide array of methods and applications, this review is organized by numerical section into (1) a brief introduction, (2) the physical and biological properties of the most common polarized agents with a brief summary of applications and methods of polarization, (3) methods for image acquisition and reconstruction specific to improving data acquisition efficiency for HP MRI, (4) the main physical properties that enable unique measures of physiology or metabolic pathways, followed by a more detailed review of the literature describing the use of HP agents to study: (5) metabolic pathways in cancer and cardiac disease and (6) lung function in both pre-clinical and clinical research studies, concluding with (7) some future directions and challenges, and (8) an overall summary. + +
+
diff --git a/tests/functional/iop/test_iop.py b/tests/functional/iop/test_iop.py new file mode 100644 index 00000000..08e3397d --- /dev/null +++ b/tests/functional/iop/test_iop.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2018 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Functional tests for IOP spider""" + +from __future__ import absolute_import, division, print_function + +import os +import pytest + +from hepcrawl.testlib.celery_monitor import CeleryMonitor +from hepcrawl.testlib.fixtures import ( + get_test_suite_path, + expected_json_results_from_file, + clean_dir, +) +from hepcrawl.testlib.tasks import app as celery_app +from hepcrawl.testlib.utils import get_crawler_instance + + +@pytest.fixture(scope='function', autouse=True) +def cleanup(): + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + yield + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' + record['acquisition_source']['submission_number'] = ( + u'5652c7f6190f11e79e8000224dabeaad' + ) + + return record + + +def get_configuration(): + package_location = get_test_suite_path( + 'iop', + 'fixtures', + 'test_iop.xml', + test_suite='functional', + ) + + return { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'xml_file': 'file://' + package_location, + } + } + + +@pytest.mark.parametrize( + 'expected_results, config', + [ + ( + expected_json_results_from_file( + 'iop', + 'fixtures', + 'test_iop.json', + ), + get_configuration(), + ), + ], + ids=[ + 'smoke', + ] +) +def test_iop(expected_results, config): + crawler = get_crawler_instance(config['CRAWLER_HOST_URL']) + + results = CeleryMonitor.do_crawl( + 
app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + events_limit=2, + crawler_instance=crawler, + project=config['CRAWLER_PROJECT'], + spider='iop', + settings={}, + **config['CRAWLER_ARGUMENTS'] + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] + + gotten_results = sorted( + gotten_results, + key=lambda x: x['titles'][0]['title'] + ) + expected_results = sorted( + expected_results, + key=lambda x: x['titles'][0]['title'] + ) + + assert gotten_results == expected_results diff --git a/tests/unit/responses/iop/pdf/test_143_3_336.pdf b/tests/unit/responses/iop/pdf/143_3_336.pdf similarity index 100% rename from tests/unit/responses/iop/pdf/test_143_3_336.pdf rename to tests/unit/responses/iop/pdf/143_3_336.pdf diff --git a/tests/unit/responses/iop/xml/test_standard.xml b/tests/unit/responses/iop/xml/test_standard.xml index 69fde3ce..74aad82f 100644 --- a/tests/unit/responses/iop/xml/test_standard.xml +++ b/tests/unit/responses/iop/xml/test_standard.xml @@ -76,7 +76,7 @@ j143/3/336 110.1309/AJCP4D7RXOBHLKGJ + IdType="doi">10.1309/AJCP4D7RXOBHLKGJ diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py index 1e48fb8a..9061da32 100644 --- a/tests/unit/test_iop.py +++ b/tests/unit/test_iop.py @@ -87,7 +87,7 @@ def test_free_keywords(record): def test_dois(record): """Test extracting dois.""" assert record["dois"] - assert record["dois"][0]["value"] == '110.1309/AJCP4D7RXOBHLKGJ' + assert record["dois"][0]["value"] == '10.1309/AJCP4D7RXOBHLKGJ' def test_collections(record): @@ -152,12 +152,13 @@ def test_copyrights(record): def test_files(record): """Test files dictionary.""" - pdf_filename = "test_143_3_336.pdf" + pdf_filename = "143_3_336.pdf" assert "documents" in record assert record["documents"][1]["hidden"] assert record["documents"][1]["fulltext"] assert record["documents"][1]["url"] == os.path.join(TEST_PDF_DIR, 
pdf_filename) + assert record["documents"][1]["key"] == pdf_filename @pytest.fixture @@ -195,13 +196,14 @@ def erratum_open_access_record(): def test_files_erratum_open_access_record(erratum_open_access_record): """Test files dict with open access journal with erratum article.""" - pdf_filename = "test_143_3_336.pdf" + pdf_filename = "143_3_336.pdf" assert "documents" in erratum_open_access_record assert not erratum_open_access_record["documents"][1]["hidden"] assert not erratum_open_access_record["documents"][1]["fulltext"] assert erratum_open_access_record["documents"][1]["url"] == ( os.path.join(TEST_PDF_DIR, pdf_filename) ) + assert erratum_open_access_record["documents"][1]["key"] == pdf_filename def test_not_published_record():