diff --git a/hepcrawl/parsers/nlm.py b/hepcrawl/parsers/nlm.py index 617b6e68..a2fbbef2 100644 --- a/hepcrawl/parsers/nlm.py +++ b/hepcrawl/parsers/nlm.py @@ -17,7 +17,7 @@ from inspire_schemas.api import LiteratureBuilder from inspire_utils.date import PartialDate -from inspire_utils.helpers import maybe_int +from inspire_utils.helpers import remove_tags from inspire_utils.name import ParsedName from ..utils import get_node @@ -99,7 +99,19 @@ def bulk_parse(cls, nlm_records, source=None): @property def abstract(self): - return self.root.xpath('normalize-space(./Abstract)').extract_first() + abstract_node = self.root.xpath('./Abstract') + + if not abstract_node: + return None + + abstract = self.normalize_space( + remove_tags( + abstract_node[0], + allowed_tags=['sup', 'sub'], + allowed_trees=['math'], + ) + ) + return abstract @property def title(self): diff --git a/tests/unit/responses/iop/expected.yaml b/tests/unit/responses/iop/expected.yaml index 335c3833..a4aee6c6 100644 --- a/tests/unit/responses/iop/expected.yaml +++ b/tests/unit/responses/iop/expected.yaml @@ -1,12 +1,10 @@ -abstract: Somatic BRAF mutation in colon cancer essentially excludes Lynch - syndrome. We compared BRAF V600E immunohistochemistry (IHC) with BRAF - mutation in core, biopsy, and whole-section slides to determine whether - IHC is similar and to assess the cost-benefit of IHC. Resection cases - (2009-2013) with absent MLH1 and PMS2 and prior BRAF mutation polymerase - chain reaction results were chosen (n = 57). To mimic biopsy specimens, - tissue microarrays (TMAs) were constructed. In addition, available biopsies - performed prior to the resection were available in 15 cases. BRAF V600E IHC - was performed and graded on TMAs, available biopsy specimens, and +abstract: This is a sample text containing maths, + f(x)1+xdx and superscript text. + Resection cases (2009-2013) with absent MLH1 and PMS2 and prior BRAF mutation + polymerase chain reaction results were chosen (n = 57). To mimic biopsy + specimens, tissue microarrays (TMAs) were constructed. In addition, available + biopsies performed prior to the resection were available in 15 cases. BRAF + V600E IHC was performed and graded on TMAs, available biopsy specimens, and whole-section slides. Mutation status was compared with IHC, and cost-benefit analysis was performed. BRAF V600E IHC was similar in TMAs, biopsy specimens, and whole-section slides, with only four (7%) showing diff --git a/tests/unit/responses/iop/xml/test_standard.xml b/tests/unit/responses/iop/xml/test_standard.xml index 2d91653a..4aa9e05a 100644 --- a/tests/unit/responses/iop/xml/test_standard.xml +++ b/tests/unit/responses/iop/xml/test_standard.xml @@ -112,7 +112,7 @@ - Somatic BRAF mutation in colon cancer essentially excludes Lynch syndrome. We compared BRAF V600E immunohistochemistry (IHC) with BRAF mutation in core, biopsy, and whole-section slides to determine whether IHC is similar and to assess the cost-benefit of IHC. + This is a sample text containing maths, f(x)1+xdx and superscript text. Resection cases (2009-2013) with absent MLH1 and PMS2 and prior BRAF mutation polymerase chain reaction results were chosen (n = 57). To mimic biopsy specimens, tissue microarrays (TMAs) were constructed. In addition, available biopsies performed prior to the resection were available in 15 cases. BRAF V600E IHC was performed and graded on TMAs, available biopsy specimens, and whole-section slides. Mutation status was compared with IHC, and cost-benefit analysis was performed. BRAF V600E IHC was similar in TMAs, biopsy specimens, and whole-section slides, with only four (7%) showing discordance between IHC and mutation status. Using BRAF V600E IHC in our Lynch syndrome screening algorithm, we found a 10% cost savings compared with mutational analysis. BRAF V600E IHC was concordant between TMAs, biopsy specimens, and whole-section slides, suggesting biopsy specimens are as useful as whole sections. IHC remained cost beneficial compared with mutational analysis, even though more patients needed additional molecular testing to exclude Lynch diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py index 91f16325..79fe52de 100644 --- a/tests/unit/test_iop.py +++ b/tests/unit/test_iop.py @@ -46,12 +46,6 @@ def record(): return parsed_item.record -def test_abstract(record): - """Test extracting abstract.""" - assert "abstract" in record - assert record["abstract"].startswith("Somatic BRAF mutation") - - def test_title(record): """Test extracting title.""" title = 'A Modified Lynch Syndrome Screening Algorithm in Colon Cancer: BRAF Immunohistochemistry Is Efficacious and Cost Beneficial.'