From 407ff757645ee4f1b5f4c6f2a40cdb1059a24d40 Mon Sep 17 00:00:00 2001
From: Miguel Garcia Garcia <miguel.garcia.garcia@cern.ch>
Date: Thu, 22 Aug 2024 11:22:25 +0200
Subject: [PATCH] Revert "parsers: addition of author xml parser"

* ref: https://github.com/cern-sis/issues-inspire/issues/546

This reverts commit 4d14037a7411201fda69ad1da3f59b56d38cc38e.
---
 inspire_utils/parsers/author_xml.py | 158 ----------------------------
 tests/test_parsers_author_xml.py    |  66 ------------
 2 files changed, 224 deletions(-)
 delete mode 100644 inspire_utils/parsers/author_xml.py
 delete mode 100644 tests/test_parsers_author_xml.py

diff --git a/inspire_utils/parsers/author_xml.py b/inspire_utils/parsers/author_xml.py
deleted file mode 100644
index 4565f52..0000000
--- a/inspire_utils/parsers/author_xml.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# This file is part of INSPIRE.
-# Copyright (C) 2014-2024 CERN.
-#
-# INSPIRE is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# INSPIRE is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
-#
-# In applying this license, CERN does not waive the privileges and immunities
-# granted to it by virtue of its status as an Intergovernmental Organization
-# or submit itself to any jurisdiction.
-
-
-from __future__ import (
-    absolute_import,
-    division,
-    print_function,
-)
-
-import re
-
-from inspire_schemas.api import LiteratureBuilder
-from scrapy.selector import Selector
-from six import binary_type
-from six.moves import zip
-
-from inspire_utils.name import normalize_name
-
-
-class AuthorXMLParser(object):
-    def __init__(self, xml_content):
-        self.xml_content = xml_content
-
-        if isinstance(self.xml_content, binary_type):
-            self.xml_content = self.xml_content.decode("utf-8")
-
-        # Probably the %auto-ignore comment exists, so we skip the
-        # first line. See: inspirehep/inspire-next/issues/2195
-        if "%auto-ignore" in self.xml_content:
-            self.xml_content = self.xml_content.split("\n", 1)[1]
-
-    def parse(self):
-        builder = LiteratureBuilder()
-        content = Selector(text=self.xml_content, type="xml")
-        content.remove_namespaces()
-        undefined_or_none_value_regex = re.compile("undefined|none", re.IGNORECASE)
-        undefined_or_empty_inspireid_value_regex = re.compile(
-            "undefined|inspire-\s*$", re.IGNORECASE  # noqa
-        )
-        undefined_value_regex = re.compile("undefined", re.IGNORECASE)
-        ror_path_value_regex = re.compile("https://ror.org/*")
-        remove_new_line_regex = re.compile("\s*\n\s*")  # noqa
-
-        # Goes through all the authors in the file
-        for author in content.xpath("//Person"):
-
-            ids = []
-            affiliations = []
-            affiliations_identifiers = []
-
-            # Gets all the author ids
-            for source, id in zip(
-                author.xpath(
-                    './authorIDs/authorID[@source!="" and text()!=""]/@source'
-                    '| ./authorids/authorid[@source!="" and text()!=""]/@source'
-                ).getall(),
-                author.xpath(
-                    './authorIDs/authorID[@source!="" and text()!=""]/text()'
-                    '| ./authorids/authorid[@source!="" and text()!=""]/text()'
-                ).getall(),
-            ):
-                source = re.sub(remove_new_line_regex, "", source)
-                id = re.sub(remove_new_line_regex, "", id)
-                if not re.match(undefined_value_regex, source) and not re.match(
-                    undefined_or_empty_inspireid_value_regex, id
-                ):
-                    if source == "CCID":
-                        ids.append(["CERN", id])
-                    elif source == "INSPIRE":
-                        ids.append(["{} ID".format(source), id])
-                    else:
-                        ids.append([source, id])
-
-            # Gets all the names for affiliated organizations using the organization ids from author
-            for affiliation in author.xpath(
-                "./authorAffiliations/authorAffiliation/@organizationid"
-            ).getall():
-                orgName = content.xpath(
-                    'string(//organizations/Organization[@id="{}"]/orgName[@source="spiresICN"'
-                    'or @source="INSPIRE" and text()!="" ]/text())'.format(
-                        affiliation
-                    )
-                ).get()
-
-                cleaned_org_name = re.sub(remove_new_line_regex, "", orgName)
-                if orgName and not re.match(
-                    undefined_or_none_value_regex, cleaned_org_name
-                ):
-                    affiliations.append(cleaned_org_name)
-
-                # Gets all the affiliations_identifiers for affiliated organizations
-                # using the organization ids from author
-                for value, source in zip(
-                    content.xpath(
-                        '//organizations/Organization[@id="{}"]/orgName[@source="ROR"'
-                        'or @source="GRID" and text()!=""]/text()'.format(
-                            affiliation
-                        )
-                    ).getall(),
-                    content.xpath(
-                        '//organizations/Organization[@id="{}"]/orgName[@source="ROR"'
-                        'or @source="GRID" and text()!=""]/@source'.format(
-                            affiliation
-                        )
-                    ).getall(),
-                ):
-                    source = re.sub(remove_new_line_regex, "", source)
-                    value = re.sub(remove_new_line_regex, "", value)
-                    if re.match(undefined_or_none_value_regex, source) or re.match(
-                        undefined_or_none_value_regex, value
-                    ):
-                        continue
-
-                    if source == "ROR" and not re.match(ror_path_value_regex, value):
-                        value = "https://ror.org/{}".format(value)
-
-                    affiliations_identifiers.append([source, value])
-
-            name = "{}, {}".format(
-                author.xpath(".//familyName/text()").get(),
-                author.xpath(".//givenName/text()").get(),
-            )
-            name_suffix = author.xpath(".//authorSuffix/text()").get()
-            if name_suffix:
-                name += ", {}".format(name_suffix)
-            name = normalize_name(name)
-
-            # builds the info to a correct format with litratureBuilder()
-            builder.add_author(
-                builder.make_author(
-                    full_name=name,
-                    affiliations=affiliations,
-                    ids=ids,
-                    affiliations_identifiers=affiliations_identifiers,
-                )
-            )
-
-        return builder.record.get("authors", [])
diff --git a/tests/test_parsers_author_xml.py b/tests/test_parsers_author_xml.py
deleted file mode 100644
index 56236c8..0000000
--- a/tests/test_parsers_author_xml.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# This file is part of INSPIRE.
-# Copyright (C) 2014-2024 CERN.
-#
-# INSPIRE is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# INSPIRE is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
-#
-# In applying this license, CERN does not waive the privileges and immunities
-# granted to it by virtue of its status as an Intergovernmental Organization
-# or submit itself to any jurisdiction.
-
-from __future__ import (
-    absolute_import,
-    division,
-    print_function,
-)
-
-from inspire_utils.parsers.author_xml import AuthorXMLParser
-
-
-def test_parsing_author_xml():
-    data = """
-    <collaborationauthorlist xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:cal="http://inspirehep.net/info/HepNames/tools/authors_xml/">
-    <cal:creationDate>2022-01-25</cal:creationDate>
-    <cal:publicationReference>Fermilab-PUB-2022-01-25</cal:publicationReference>
-    <cal:collaborations>
-    <cal:collaboration id="duneid">
-    <foaf:name>DUNE</foaf:name>
-    <cal:experimentNumber>DUNE</cal:experimentNumber>
-    </cal:collaboration>
-    </cal:collaborations>
-    <cal:authors>
-        <foaf:Person>
-            <foaf:name>Michael Finger</foaf:name>
-            <foaf:givenName>Michael</foaf:givenName>
-            <foaf:familyName>Finger</foaf:familyName>
-            <cal:authorNameNative lang=""/>
-            <cal:authorSuffix>Jr.</cal:authorSuffix>
-            <cal:authorStatus/>
-            <cal:authorNamePaper>M. Finger Jr.</cal:authorNamePaper>
-            <cal:authorAffiliations>
-            <cal:authorAffiliation organizationid="o27" connection=""/>
-            <cal:authorAffiliation organizationid="vo1" connection="AlsoAt"/>
-            </cal:authorAffiliations>
-            <cal:authorIDs>
-            <cal:authorID source="INSPIRE">INSPIRE-00171357</cal:authorID>
-            <cal:authorID source="CCID">391883</cal:authorID>
-            <cal:authorID source="ORCID">0000-0003-3155-2484</cal:authorID>
-            </cal:authorIDs>
-        </foaf:Person>
-    </cal:authors>
-    </collaborationauthorlist>
-    """
-    result = AuthorXMLParser(data).parse()
-    assert result[0]["full_name"] == "Finger, Michael, Jr."