From 36c7e2dd9f49b5979c559ad12b72dd29d1bdb064 Mon Sep 17 00:00:00 2001 From: PascalEgn Date: Mon, 5 Aug 2024 17:57:06 +0200 Subject: [PATCH] global: format to <100 line-width --- inspire_utils/config.py | 6 +- inspire_utils/date.py | 49 ++++--- inspire_utils/dedupers.py | 10 +- inspire_utils/helpers.py | 14 +- inspire_utils/logging.py | 6 +- inspire_utils/name.py | 209 ++++++++++++++++++---------- inspire_utils/parsers/arxiv.py | 129 ++++++++++------- inspire_utils/parsers/author_xml.py | 22 +-- inspire_utils/parsers/crossref.py | 13 +- inspire_utils/parsers/elsevier.py | 61 ++++---- inspire_utils/parsers/jats.py | 115 ++++++++------- inspire_utils/query.py | 19 +-- inspire_utils/record.py | 11 +- inspire_utils/urls.py | 3 +- inspire_utils/utils.py | 18 +-- ruff.toml | 3 +- tests/test_config.py | 15 +- tests/test_date.py | 4 +- tests/test_helpers.py | 9 +- tests/test_name.py | 61 ++++---- tests/test_parsers_arxiv.py | 16 ++- tests/test_parsers_crossref.py | 33 +++-- tests/test_parsers_elsevier.py | 39 +++--- tests/test_parsers_jats.py | 25 ++-- tests/test_record.py | 4 +- tests/test_urls.py | 4 +- 26 files changed, 533 insertions(+), 365 deletions(-) diff --git a/inspire_utils/config.py b/inspire_utils/config.py index e323fad..66d162f 100644 --- a/inspire_utils/config.py +++ b/inspire_utils/config.py @@ -19,7 +19,6 @@ # In applying this license, CERN does not waive the privileges and immunities # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. - """INSPIRE configuration loader. Inspired by the Flask configuration loader: @@ -47,10 +46,7 @@ def __init__(self, file_path, cause): cause (string): reason of failure, i.e. what exactly was the problem while parsing """ - message = six.text_type("Malformed config at {}: {}").format( - file_path, - cause - ) + message = six.text_type("Malformed config at {}: {}").format(file_path, cause) super(MalformedConfig, self).__init__(message) diff --git a/inspire_utils/date.py b/inspire_utils/date.py index 4216579..70ad648 100644 --- a/inspire_utils/date.py +++ b/inspire_utils/date.py @@ -19,7 +19,6 @@ # In applying this license, CERN does not waive the privileges and immunities # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. - """Utils to handle dates in INSPIRE.""" from __future__ import absolute_import, division, print_function @@ -48,13 +47,18 @@ class PartialDate(object): Raises: TypeError: when the date parts are not `int` s or `None`. ValueError: when the date is not valid. 
- """ + def __init__(self, year, month=None, day=None): - well_typed = all(isinstance(part, int) or part is None for part in (year, month, day)) + well_typed = all( + isinstance(part, int) or part is None for part in (year, month, day) + ) if not well_typed: - raise TypeError(u'arguments to {classname} must be of type int or None'.format( - classname=type(self).__name__)) + raise TypeError( + u'arguments to {classname} must be of type int or None'.format( + classname=type(self).__name__ + ) + ) if year is None or year < 1000: raise ValueError('year must be an int >= 1000') if day and not month: @@ -68,10 +72,18 @@ def __init__(self, year, month=None, day=None): self.day = day def __repr__(self): - return u'PartialDate(year={self.year}, month={self.month}, day={self.day})'.format(self=self) + return ( + u'PartialDate(year={self.year}, month={self.month}, day={self.day})'.format( + self=self + ) + ) def __eq__(self, other): - return self.year == other.year and self.month == other.month and self.day == other.day + return ( + self.year == other.year + and self.month == other.month + and self.day == other.day + ) def __lt__(self, other): self_month = self.month or 99 @@ -97,7 +109,6 @@ def loads(cls, string): Traceback (most recent call last): ... ValueError: month must be in 1..12 - """ date_parts = string.split('-') @@ -116,7 +127,6 @@ def dumps(self): Returns: str: normalized date, in the form ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY`` (depending on the information present in the date) - """ non_empty = itertools.takewhile(bool, (self.year, self.month, self.day)) # XXX: this only handles dates after 1000, which should be sufficient @@ -147,7 +157,6 @@ def parse(cls, date, **kwargs): Examples: >>> PartialDate.parse('30 Jun 1686') PartialDate(year=1686, month=6, day=30) - """ # In order to detect partial dates, parse twice with different defaults # and compare the results. @@ -180,7 +189,6 @@ def from_parts(cls, year, month=None, day=None): Examples: >>> PartialDate.from_parts('1686', 'June', '30') PartialDate(year=1686, month=6, day=30) - """ # XXX: 0 is not a valid year/month/day non_empty = itertools.takewhile( @@ -194,13 +202,18 @@ def pprint(self): Examples: >>> PartialDate(1686, 6, 30).pprint() u'Jun 30, 1686' - """ if not self.month: - return dates.format_date(datetime.date(self.year, 1, 1), 'yyyy', locale='en') + return dates.format_date( + datetime.date(self.year, 1, 1), 'yyyy', locale='en' + ) if not self.day: - return dates.format_date(datetime.date(self.year, self.month, 1), 'MMM, yyyy', locale='en') - return dates.format_date(datetime.date(self.year, self.month, self.day), 'MMM d, yyyy', locale='en') + return dates.format_date( + datetime.date(self.year, self.month, 1), 'MMM, yyyy', locale='en' + ) + return dates.format_date( + datetime.date(self.year, self.month, self.day), 'MMM d, yyyy', locale='en' + ) def normalize_date(date, **kwargs): @@ -232,7 +245,6 @@ def normalize_date(date, **kwargs): >>> normalize_date(None) >>> normalize_date('30 Jun 1686') '1686-06-30' - """ if date is None: return @@ -265,7 +277,10 @@ def earliest_date(dates): def fill_missing_date_parts(date): - """Sets missing day and/or month to 1. Useful to avoid errors when saving to DB.""" + """Sets missing day and/or month to 1. + + Useful to avoid errors when saving to DB. 
+ """ if date is None: return diff --git a/inspire_utils/dedupers.py b/inspire_utils/dedupers.py index d15edbb..f559a85 100644 --- a/inspire_utils/dedupers.py +++ b/inspire_utils/dedupers.py @@ -28,8 +28,8 @@ def dedupe_list(list_with_duplicates): """Remove duplicates from a list preserving the order. - We might be tempted to use the list(set(l)) idiom, but it doesn't preserve - the order, which hinders testability. + We might be tempted to use the list(set(l)) idiom, but it doesn't + preserve the order, which hinders testability. """ result = [] @@ -43,9 +43,11 @@ def dedupe_list(list_with_duplicates): def dedupe_list_of_dicts(ld): """Remove duplicates from a list of dictionaries preserving the order. - We can't use the generic list helper because a dictionary isn't hashable. - Adapted from http://stackoverflow.com/a/9427216/374865. + We can't use the generic list helper because a dictionary isn't + hashable. Adapted from + http://stackoverflow.com/a/9427216/374865. """ + def _freeze(o): """Recursively freezes a dict into an hashable object. diff --git a/inspire_utils/helpers.py b/inspire_utils/helpers.py index 36078c7..260caa2 100644 --- a/inspire_utils/helpers.py +++ b/inspire_utils/helpers.py @@ -58,7 +58,6 @@ def force_list(data): ['foo', 'bar'] >>> force_list(['foo', 'bar', 'baz']) ['foo', 'bar', 'baz'] - """ if data is None: return [] @@ -81,7 +80,6 @@ def maybe_float(el): Examples: >>> maybe_float('35.0499505') 35.0499505 - """ try: return float(el) @@ -101,7 +99,6 @@ def maybe_int(el): Examples: >>> maybe_int('10') 10 - """ try: return int(el) @@ -132,7 +129,8 @@ def remove_tags(dirty, allowed_tags=(), allowed_trees=(), strip=None): removed. Examples: - >>> tag = '
<div>Only this text remains.</div><div class="hidden"></div>'
+        >>> tag = ('<div>Only this text remains.</div>'
+        ...     '<div class="hidden"></div>
' >>> remove_tags(tag, allowed_tree=('b',), strip='@class="hidden"') u'Only this text remains.' >>> remove_tags(tag, allowed_tags=('b',), strip='@class="hidden"') @@ -141,7 +139,9 @@ def remove_tags(dirty, allowed_tags=(), allowed_trees=(), strip=None): u'Only this text remains.' """ if isinstance(dirty, six.string_types): - element = etree.fromstring(u''.join(('', dirty, ''))) + element = etree.fromstring( + u''.join(('', dirty, '')) + ) elif isinstance(dirty, etree._Element): element = dirty else: # assuming scrapy Selector @@ -156,7 +156,9 @@ def remove_tags(dirty, allowed_tags=(), allowed_trees=(), strip=None): return tail subtext = u''.join( - remove_tags(child, allowed_tags=allowed_tags, allowed_trees=allowed_trees, strip=strip) + remove_tags( + child, allowed_tags=allowed_tags, allowed_trees=allowed_trees, strip=strip + ) for child in element ) text = element.text or u'' diff --git a/inspire_utils/logging.py b/inspire_utils/logging.py index da252ae..16ef9be 100644 --- a/inspire_utils/logging.py +++ b/inspire_utils/logging.py @@ -36,13 +36,15 @@ def __getattr__(self, item): def error(self, message, *args, **kwargs): """Log error with stack trace and locals information. - By default, enables stack trace information in logging messages, so that stacktrace and locals appear in Sentry. + By default, enables stack trace information in logging messages, + so that stacktrace and locals appear in Sentry. """ kwargs.setdefault('extra', {}).setdefault('stack', True) return self.logger.error(message, *args, **kwargs) def getStackTraceLogger(*args, **kwargs): - """Returns a :class:`StackTrace` logger that wraps a Python logger instance.""" + """Returns a :class:`StackTrace` logger that wraps a Python logger + instance.""" logger = logging.getLogger(*args, **kwargs) return StackTraceLogger(logger) diff --git a/inspire_utils/name.py b/inspire_utils/name.py index 9fe5624..b883d08 100644 --- a/inspire_utils/name.py +++ b/inspire_utils/name.py @@ -38,20 +38,45 @@ _LASTNAME_NON_LASTNAME_SEPARATORS = [u' ', u', '] _NAMES_MAX_NUMBER_THRESHOLD = 5 -"""Threshold for skipping the combinatorial expansion of names (when generating name variations). """ +"""Threshold for skipping the combinatorial expansion of names (when generating +name variations).""" def _prepare_nameparser_constants(): """Prepare nameparser Constants. - Remove nameparser's titles and use our own and add as suffixes the roman numerals. - Configuration is the same for all names (i.e. instances). + Remove nameparser's titles and use our own and add as suffixes the + roman numerals. Configuration is the same for all names (i.e. + instances). """ constants = Constants() - roman_numeral_suffixes = [u'vi', u'vii', u'viii', u'ix', u'x', u'xi', - u'xii', u'xiii', u'xiv', u'xv'] - titles = [u'Dr', u'Prof', u'Professor', u'Sir', u'Editor', u'Ed', u'Mr', - u'Mrs', u'Ms', u'Chair', u'Co-Chair', u'Chairs', u'co-Chairs'] + roman_numeral_suffixes = [ + u'vi', + u'vii', + u'viii', + u'ix', + u'x', + u'xi', + u'xii', + u'xiii', + u'xiv', + u'xv', + ] + titles = [ + u'Dr', + u'Prof', + u'Professor', + u'Sir', + u'Editor', + u'Ed', + u'Mr', + u'Mrs', + u'Ms', + u'Chair', + u'Co-Chair', + u'Chairs', + u'co-Chairs', + ] constants.titles.remove(*constants.titles).add(*titles) constants.suffix_not_acronyms.add(*roman_numeral_suffixes) constants.suffixes_prefixes_titles.remove(*constants.suffixes_prefixes_titles) @@ -65,15 +90,18 @@ class ParsedName(object): After construction, the instance exposes the fields exposed by `HumanName` instance, i.e. 
`title`, `first`, `middle`, `last`, `suffix`. """ + constants = _prepare_nameparser_constants() - """The default constants configuration for `HumanName` to use for parsing all names.""" + """The default constants configuration for `HumanName` to use for parsing + all names.""" def __init__(self, name, constants=None, without_titles=False): """Create a ParsedName instance. Args: name (Union[str, HumanName]): The name to be parsed (must be non empty nor None). - constants (:class:`nameparser.config.Constants`): Configuration for `HumanName` instantiation. + constants (:class:`nameparser.config.Constants`): Configuration + for `HumanName` instantiation. (Can be None, if provided it overwrites the default one generated in :method:`prepare_nameparser_constants`.) """ @@ -93,7 +121,9 @@ def __init__(self, name, constants=None, without_titles=False): self._parsed_name.capitalize() - if ',' not in name and (not self.first_list or (self.first_list and '.' not in self.first_list[-1])): + if ',' not in name and ( + not self.first_list or (self.first_list and '.' not in self.first_list[-1]) + ): self.maybe_only_last_name = True else: self.maybe_only_last_name = False @@ -116,18 +146,26 @@ def first_initials(self): @property def first(self): - name = u'{} {}'.format(self._parsed_name.first, self._parsed_name.middle).strip() + name = u'{} {}'.format( + self._parsed_name.first, self._parsed_name.middle + ).strip() return name.strip('.') @property def first_initials_list(self): - names_no_dash_list = itertools.chain.from_iterable(name.split("-") for name in self.first_list) - names_split_on_dot = itertools.chain.from_iterable(name.split('.') for name in names_no_dash_list) + names_no_dash_list = itertools.chain.from_iterable( + name.split("-") for name in self.first_list + ) + names_split_on_dot = itertools.chain.from_iterable( + name.split('.') for name in names_no_dash_list + ) return [(name[0] + u'.') for name in names_split_on_dot if name] @property def first_list(self): - first_and_middle_names = self._parsed_name.first_list + self._parsed_name.middle_list + first_and_middle_names = ( + self._parsed_name.first_list + self._parsed_name.middle_list + ) names = [name for name in first_and_middle_names if name and name != '.'] return names @@ -156,9 +194,11 @@ def loads(cls, name): ValueError: when name is empty or None. """ if not isinstance(name, six.string_types): - raise TypeError(u'arguments to {classname} must be of type {string_types}'.format( - classname=cls.__name__, string_types=repr(six.string_types) - )) + raise TypeError( + u'arguments to {classname} must be of type {string_types}'.format( + classname=cls.__name__, string_types=repr(six.string_types) + ) + ) if not name or name.isspace(): raise ValueError('name must not be empty') @@ -166,12 +206,12 @@ def loads(cls, name): def dumps(self): """Dump the name to string, after normalizing it.""" + def _is_initial(author_name): return len(author_name) == 1 or u'.' in author_name def _ensure_dotted_initials(author_name): - if _is_initial(author_name) \ - and u'.' not in author_name: + if _is_initial(author_name) and u'.' 
not in author_name: seq = (author_name, u'.') author_name = u''.join(seq) return author_name @@ -183,20 +223,30 @@ def _ensure_dotted_suffixes(author_suffix): return author_suffix def _is_roman_numeral(suffix): - """Controls that the user's input only contains valid roman numerals""" - valid_roman_numerals = [u'M', u'D', u'C', u'L', u'X', - u'V', u'I', u'(', u')'] - return all(letters in valid_roman_numerals - for letters in suffix.upper()) - - first_and_middle_names = iter(_ensure_dotted_initials(name) for name in self.first_list) + """Controls that the user's input only contains valid roman + numerals.""" + valid_roman_numerals = [ + u'M', + u'D', + u'C', + u'L', + u'X', + u'V', + u'I', + u'(', + u')', + ] + return all(letters in valid_roman_numerals for letters in suffix.upper()) + + first_and_middle_names = iter( + _ensure_dotted_initials(name) for name in self.first_list + ) try: prev = next(first_and_middle_names) names_with_spaces = [prev] except StopIteration: - LOGGER.warning(u"Cannot process %s properly", - self._parsed_name.original) + LOGGER.warning(u"Cannot process %s properly", self._parsed_name.original) names_with_spaces = [] for name in first_and_middle_names: @@ -213,8 +263,8 @@ def _is_roman_numeral(suffix): suffix = _ensure_dotted_suffixes(self.suffix) final_name = u', '.join( - part for part in (self.last, normalized_names.strip(), suffix) - if part) + part for part in (self.last, normalized_names.strip(), suffix) if part + ) # Replace unicode curly apostrophe to normal apostrophe. final_name = final_name.replace(u'’', '\'') @@ -235,27 +285,16 @@ def pprint(self, initials_only=False): u'S. M. Lieber' >>> ParsedName('Downey, Robert Jr.').pprint(initials_only=True) u'R. Downey Jr.' - """ last_name = self.last suffixes = ', ' + self.suffix if self.suffix else '' - if initials_only and last_name != u'': - first_names = self.first_initials - else: - first_names = self.first + first_names = self.first_initials if initials_only and last_name != u'' else self.first return u'{} {}{}'.format(first_names, last_name, suffixes).strip() @classmethod - def from_parts( - cls, - first=None, - last=None, - middle=None, - suffix=None, - title=None - ): + def from_parts(cls, first=None, last=None, middle=None, suffix=None, title=None): name = HumanName() name.first = first name.middle = middle @@ -266,16 +305,19 @@ def from_parts( def generate_es_query(self, keyword="authors"): """Generates a query handling specifically authors. + Notes: There are three main cases: 1) ``a Smith`` This will just generate a ``match`` query on ``last_name`` 2) ``a John Smith`` - This will just generate a ``match`` query on ``last_name`` and a ``prefix`` query on ``first_name`` - and a ``match`` query on the initial ``J``. This will return results from ``Smith, John`` and ``Smith, J`` + This will just generate a ``match`` query on ``last_name`` and a ``prefix`` + query on ``first_name`` and a ``match`` query on the initial ``J``. + This will return results from ``Smith, John`` and ``Smith, J`` but not from ``Smith, Jane``. 3) ``a J Smith`` - This will just generate a ``match`` query on ``last_name`` and a match query on ``first_name.initials``. + This will just generate a ``match`` query on ``last_name`` and a match + query on ``first_name.initials``. Please note, cases such as ``J.D.`` have been properly handled by the tokenizer. """ nested_query = { @@ -381,9 +423,8 @@ def normalize_name(name): def _generate_non_lastnames_variations(non_lastnames): """Generate variations for all non-lastnames. - E.g. 
For 'John Richard', this method generates: [ - 'John', 'J', 'Richard', 'R', 'John Richard', 'John R', 'J Richard', 'J R', - ] + E.g. For 'John Richard', this method generates: [ 'John', 'J', + 'Richard', 'R', 'John Richard', 'John R', 'J Richard', 'J R', ] """ if not non_lastnames: return [] @@ -411,12 +452,18 @@ def _generate_lastnames_variations(lastnames): if not lastnames: return [] - split_lastnames = [split_lastname for lastname in lastnames for split_lastname in lastname.split('-')] + split_lastnames = [ + split_lastname + for lastname in lastnames + for split_lastname in lastname.split('-') + ] lastnames_variations = split_lastnames if len(split_lastnames) > 1: # Generate lastnames concatenation if there are more than one lastname after split. - lastnames_variations.append(u' '.join([lastname for lastname in split_lastnames])) + lastnames_variations.append( + u' '.join([lastname for lastname in split_lastnames]) + ) return lastnames_variations @@ -431,19 +478,23 @@ def generate_name_variations(name): list: All the name variations for the given name. Notes: - Uses `unidecode` for doing unicode characters transliteration to ASCII ones. This was chosen so that we can map - both full names of authors in HEP records and user's input to the same space and thus make exact queries work. + Uses `unidecode` for doing unicode characters transliteration to ASCII ones. + This was chosen so that we can map both full names of authors in HEP records + and user's input to the same space and thus make exact queries work. """ + def _update_name_variations_with_product(set_a, set_b): - name_variations.update([ - unidecode((names_variation[0] + - separator + - names_variation[1]).strip(''.join(_LASTNAME_NON_LASTNAME_SEPARATORS))).lower() - for names_variation - in product(set_a, set_b) - for separator - in _LASTNAME_NON_LASTNAME_SEPARATORS - ]) + name_variations.update( + [ + unidecode( + (names_variation[0] + separator + names_variation[1]).strip( + ''.join(_LASTNAME_NON_LASTNAME_SEPARATORS) + ) + ).lower() + for names_variation in product(set_a, set_b) + for separator in _LASTNAME_NON_LASTNAME_SEPARATORS + ] + ) parsed_name = ParsedName.loads(name) @@ -453,27 +504,32 @@ def _update_name_variations_with_product(set_a, set_b): name_variations = set() - # We need to filter out empty entries, since HumanName for this name `Perelstein,, Maxim` returns a first_list with - # an empty string element. + # We need to filter out empty entries, since HumanName for this name + # `Perelstein,, Maxim` returns a first_list with an empty string element. non_lastnames = [ non_lastname - for non_lastname - in parsed_name.first_list + parsed_name.suffix_list + for non_lastname in parsed_name.first_list + parsed_name.suffix_list if non_lastname ] - # This is needed because due to erroneous data (e.g. having many authors in a single authors field) ends up + # This is needed because due to erroneous data (e.g. having many + # authors in a single authors field) ends up # requiring a lot of memory (due to combinatorial expansion of all non lastnames). # The policy is to use the input as a name variation, since this data will have to be curated. 
- if len(non_lastnames) > _NAMES_MAX_NUMBER_THRESHOLD or len(parsed_name.last_list) > _NAMES_MAX_NUMBER_THRESHOLD: - LOGGER.warning('Skipping name variations generation - too many names in: "%s"', name) + if ( + len(non_lastnames) > _NAMES_MAX_NUMBER_THRESHOLD + or len(parsed_name.last_list) > _NAMES_MAX_NUMBER_THRESHOLD + ): + LOGGER.warning( + 'Skipping name variations generation - too many names in: "%s"', name + ) return [name] - non_lastnames_variations = \ - _generate_non_lastnames_variations(non_lastnames) + non_lastnames_variations = _generate_non_lastnames_variations(non_lastnames) lastnames_variations = _generate_lastnames_variations(parsed_name.last_list) - # Create variations where lastnames comes first and is separated from non lastnames either by space or comma. + # Create variations where lastnames comes first and is separated + # from non lastnames either by space or comma. _update_name_variations_with_product(lastnames_variations, non_lastnames_variations) # Second part of transformations - having the lastnames in the end. @@ -490,7 +546,8 @@ def format_name(name, initials_only=False, without_titles=False): Args: name (str): The name to format, in pretty much any format. - initials_only (bool): ``True`` if we want the first names to be displayed with only the initial followed by a dot. ``False`` otherwise. + initials_only (bool): ``True`` if we want the first names to be + displayed with only the initial followed by a dot. ``False`` otherwise. Examples: >>> format_name('Lieber, Stanley Martin') @@ -500,4 +557,8 @@ def format_name(name, initials_only=False, without_titles=False): >>> format_name('Downey, Robert Jr.', initials_only=True) u'R. Downey Jr.' """ - return ParsedName(name=name, without_titles=without_titles).loads(name).pprint(initials_only) + return ( + ParsedName(name=name, without_titles=without_titles) + .loads(name) + .pprint(initials_only) + ) diff --git a/inspire_utils/parsers/arxiv.py b/inspire_utils/parsers/arxiv.py index 6c24884..68d630d 100644 --- a/inspire_utils/parsers/arxiv.py +++ b/inspire_utils/parsers/arxiv.py @@ -47,15 +47,11 @@ ) RE_CONFERENCE = re.compile( - r'\b(%s)\b' % '|'.join( - [re.escape(word) for word in CONFERENCE_WORDS] - ), + r'\b(%s)\b' % '|'.join([re.escape(word) for word in CONFERENCE_WORDS]), re.I | re.U, ) RE_THESIS = re.compile( - r'\b(%s)\b' % '|'.join( - [re.escape(word) for word in THESIS_WORDS] - ), + r'\b(%s)\b' % '|'.join([re.escape(word) for word in THESIS_WORDS]), re.I | re.U, ) RE_PAGES = re.compile(r'(?i)(\d+)\s*pages?\b') @@ -71,18 +67,20 @@ def _handle_sqrt(node, l2tobj): def get_arxiv_latex_context_db(): default_db = get_default_latex_context_db() - arxiv_db = default_db.filter_context(keep_categories=["latex-base", "advanced-symbols"]) + arxiv_db = default_db.filter_context( + keep_categories=["latex-base", "advanced-symbols"] + ) arxiv_db.add_context_category( - "overrides", - prepend=True, - macros=[ - MacroTextSpec("sqrt", _handle_sqrt) - ] + "overrides", prepend=True, macros=[MacroTextSpec("sqrt", _handle_sqrt)] ) # adapted from https://github.com/phfaist/pylatexenc/issues/32 - arxiv_db.set_unknown_macro_spec(MacroTextSpec("", lambda node: node.latex_verbatim())) - arxiv_db.set_unknown_environment_spec(EnvironmentTextSpec("", lambda node: node.latex_verbatim())) + arxiv_db.set_unknown_macro_spec( + MacroTextSpec("", lambda node: node.latex_verbatim()) + ) + arxiv_db.set_unknown_environment_spec( + EnvironmentTextSpec("", lambda node: node.latex_verbatim()) + ) return arxiv_db @@ -98,6 +96,7 @@ class 
ArxivParser(object): source (Optional[str]): if provided, sets the ``source`` everywhere in the record. Otherwise, the source is extracted from the arXiv metadata. """ + _l2t = LatexNodes2Text( latex_context=get_arxiv_latex_context_db(), math_mode="verbatim", @@ -139,8 +138,9 @@ def parse(self): self.builder.add_arxiv_eprint(self.arxiv_eprint, self.arxiv_categories) self.builder.add_private_note(self.private_note) self.builder.add_document_type(self.document_type) - normalized_categories = [classify_field(arxiv_cat) - for arxiv_cat in self.arxiv_categories] + normalized_categories = [ + classify_field(arxiv_cat) for arxiv_cat in self.arxiv_categories + ] self.builder.add_inspire_categories(dedupe_list(normalized_categories), 'arxiv') return self.builder.record @@ -161,8 +161,13 @@ def _get_authors_and_collaborations(self, node): # take 'for the' out of the general phrases and dont use it in # affiliations collab_phrases = [ - 'consortium', ' collab ', 'collaboration', ' team', 'group', - ' on behalf of ', ' representing ', + 'consortium', + ' collab ', + 'collaboration', + ' team', + 'group', + ' on behalf of ', + ' representing ', ] inst_phrases = ['institute', 'university', 'department', 'center'] @@ -172,16 +177,23 @@ def _get_authors_and_collaborations(self, node): some_affiliation_contains_collaboration = False authors_and_affiliations = ( - self._get_author_names_and_affiliations(author) for author in author_selectors + self._get_author_names_and_affiliations(author) + for author in author_selectors ) next_author_and_affiliations = ( - self._get_author_names_and_affiliations(author) for author in author_selectors + self._get_author_names_and_affiliations(author) + for author in author_selectors ) next(next_author_and_affiliations) - for (forenames, keyname, affiliations), (next_forenames, next_keyname, _) in six.moves.zip_longest( - authors_and_affiliations, next_author_and_affiliations, - fillvalue=('end of author-list', '', None) + for (forenames, keyname, affiliations), ( + next_forenames, + next_keyname, + _, + ) in six.moves.zip_longest( + authors_and_affiliations, + next_author_and_affiliations, + fillvalue=('end of author-list', '', None), ): name_string = " %s %s " % (forenames, keyname) @@ -193,9 +205,7 @@ def _get_authors_and_collaborations(self, node): for aff in affiliations: affiliation_contains_collaboration = any( phrase in aff.lower() for phrase in collab_phrases - ) and not any( - phrase in aff.lower() for phrase in inst_phrases - ) + ) and not any(phrase in aff.lower() for phrase in inst_phrases) if affiliation_contains_collaboration: affiliations_with_collaborations.append(aff) some_affiliation_contains_collaboration = True @@ -214,12 +224,14 @@ def _get_authors_and_collaborations(self, node): coll, author_name = coll_cleanforthe(name_string) if author_name: surname, given_names = split_fullname(author_name) - authors.append({ - 'full_name': surname + ', ' + given_names, - 'surname': surname, - 'given_names': given_names, - 'affiliations': [], - }) + authors.append( + { + 'full_name': surname + ', ' + given_names, + 'surname': surname, + 'given_names': given_names, + 'affiliations': [], + } + ) if coll and coll not in collaborations: collaborations.append(coll) elif name_string.strip() == ':': @@ -228,30 +240,35 @@ def _get_authors_and_collaborations(self, node): if not some_affiliation_contains_collaboration: # everything up to now seems to be collaboration info for author_info in authors: - name_string = " %s %s " % \ - (author_info['given_names'], 
author_info['surname']) + name_string = " %s %s " % ( + author_info['given_names'], + author_info['surname'], + ) coll, author_name = coll_cleanforthe(name_string) if coll and coll not in collaborations: collaborations.append(coll) authors = [] else: - authors.append({ - 'full_name': keyname + ', ' + forenames, - 'surname': keyname, - 'given_names': forenames, - 'affiliations': affiliations_without_collaborations - }) + authors.append( + { + 'full_name': keyname + ', ' + forenames, + 'surname': keyname, + 'given_names': forenames, + 'affiliations': affiliations_without_collaborations, + } + ) if warning_tags: - warning = 'WARNING: Colon in authors before %s: Check author list for collaboration names!' % ', '.join(warning_tags) + warning = ( + 'WARNING: Colon in authors before %s: Check author list for collaboration names!' + % ', '.join(warning_tags) + ) else: warning = '' return authors, collaborations, warning @staticmethod def _get_author_names_and_affiliations(author_node): - forenames = u' '.join( - author_node.xpath('.//forenames//text()').extract() - ) + forenames = u' '.join(author_node.xpath('.//forenames//text()').extract()) keyname = u' '.join(author_node.xpath('.//keyname//text()').extract()) affiliations = author_node.xpath('.//affiliation//text()').extract() @@ -272,8 +289,12 @@ def abstract(self): @property def authors(self): authors, _, _ = self.authors_and_collaborations - parsed_authors = [self.builder.make_author( - full_name=auth["full_name"], raw_affiliations=auth["affiliations"]) for auth in authors] + parsed_authors = [ + self.builder.make_author( + full_name=auth["full_name"], raw_affiliations=auth["affiliations"] + ) + for auth in authors + ] return parsed_authors @@ -286,7 +307,9 @@ def collaborations(self): @property def dois(self): doi_values = self.root.xpath('.//doi/text()').extract() - doi_values_splitted = chain.from_iterable([re.split(RE_DOIS, doi) for doi in doi_values]) + doi_values_splitted = chain.from_iterable( + [re.split(RE_DOIS, doi) for doi in doi_values] + ) dois = [ {'doi': value, 'material': 'publication'} for value in doi_values_splitted ] @@ -328,7 +351,9 @@ def pubinfo_freetext(self): @property def title(self): - long_text_fixed = self.fix_long_text(self.root.xpath('.//title/text()').extract_first()) + long_text_fixed = self.fix_long_text( + self.root.xpath('.//title/text()').extract_first() + ) return self.latex_to_unicode(long_text_fixed) @staticmethod @@ -386,7 +411,9 @@ def arxiv_eprint(self): def arxiv_categories(self): categories = self.root.xpath('.//categories/text()').extract_first(default='[]') categories = categories.split() - categories_without_old = [normalize_arxiv_category(arxiv_cat) for arxiv_cat in categories] + categories_without_old = [ + normalize_arxiv_category(arxiv_cat) for arxiv_cat in categories + ] return dedupe_list(categories_without_old) @@ -409,7 +436,9 @@ def source(self): @property def authors_and_collaborations(self): if not hasattr(self, '_authors_and_collaborations'): - self._authors_and_collaborations = self._get_authors_and_collaborations(self.root) + self._authors_and_collaborations = self._get_authors_and_collaborations( + self.root + ) return self._authors_and_collaborations @classmethod diff --git a/inspire_utils/parsers/author_xml.py b/inspire_utils/parsers/author_xml.py index 9acc26f..4565f52 100644 --- a/inspire_utils/parsers/author_xml.py +++ b/inspire_utils/parsers/author_xml.py @@ -55,11 +55,11 @@ def parse(self): content.remove_namespaces() undefined_or_none_value_regex = 
re.compile("undefined|none", re.IGNORECASE) undefined_or_empty_inspireid_value_regex = re.compile( - "undefined|inspire-\s*$", re.IGNORECASE # noqa + "undefined|inspire-\s*$", re.IGNORECASE # noqa ) undefined_value_regex = re.compile("undefined", re.IGNORECASE) ror_path_value_regex = re.compile("https://ror.org/*") - remove_new_line_regex = re.compile("\s*\n\s*") # noqa + remove_new_line_regex = re.compile("\s*\n\s*") # noqa # Goes through all the authors in the file for author in content.xpath("//Person"): @@ -71,10 +71,12 @@ def parse(self): # Gets all the author ids for source, id in zip( author.xpath( - './authorIDs/authorID[@source!="" and text()!=""]/@source | ./authorids/authorid[@source!="" and text()!=""]/@source' + './authorIDs/authorID[@source!="" and text()!=""]/@source' + '| ./authorids/authorid[@source!="" and text()!=""]/@source' ).getall(), author.xpath( - './authorIDs/authorID[@source!="" and text()!=""]/text() | ./authorids/authorid[@source!="" and text()!=""]/text()' + './authorIDs/authorID[@source!="" and text()!=""]/text()' + '| ./authorids/authorid[@source!="" and text()!=""]/text()' ).getall(), ): source = re.sub(remove_new_line_regex, "", source) @@ -94,7 +96,8 @@ def parse(self): "./authorAffiliations/authorAffiliation/@organizationid" ).getall(): orgName = content.xpath( - 'string(//organizations/Organization[@id="{}"]/orgName[@source="spiresICN" or @source="INSPIRE" and text()!="" ]/text())'.format( + 'string(//organizations/Organization[@id="{}"]/orgName[@source="spiresICN"' + 'or @source="INSPIRE" and text()!="" ]/text())'.format( affiliation ) ).get() @@ -105,15 +108,18 @@ def parse(self): ): affiliations.append(cleaned_org_name) - # Gets all the affiliations_identifiers for affiliated organizations using the organization ids from author + # Gets all the affiliations_identifiers for affiliated organizations + # using the organization ids from author for value, source in zip( content.xpath( - '//organizations/Organization[@id="{}"]/orgName[@source="ROR" or @source="GRID" and text()!=""]/text()'.format( + '//organizations/Organization[@id="{}"]/orgName[@source="ROR"' + 'or @source="GRID" and text()!=""]/text()'.format( affiliation ) ).getall(), content.xpath( - '//organizations/Organization[@id="{}"]/orgName[@source="ROR" or @source="GRID" and text()!=""]/@source'.format( + '//organizations/Organization[@id="{}"]/orgName[@source="ROR"' + 'or @source="GRID" and text()!=""]/@source'.format( affiliation ) ).getall(), diff --git a/inspire_utils/parsers/crossref.py b/inspire_utils/parsers/crossref.py index d732ebe..1c66071 100644 --- a/inspire_utils/parsers/crossref.py +++ b/inspire_utils/parsers/crossref.py @@ -66,6 +66,7 @@ class CrossrefParser(object): source (Optional[str]): if provided, sets the ``source`` everywhere in the record. Otherwise, the source is extracted from the Crossref metadata. 
""" + def __init__(self, crossref_record, source=None): self.record = crossref_record.get("message") if not source: @@ -115,9 +116,7 @@ def subtitle(self): @property def dois(self): value = self.record.get("DOI") - dois = [ - {'doi': value, 'material': self.material} - ] + dois = [{'doi': value, 'material': self.material}] return dois @@ -133,7 +132,9 @@ def material(self): material = 'erratum' elif title.startswith("Addendum") or subtitle.startswith("Addendum"): material = 'addendum' - elif title.startswith("Publisher's Note") or subtitle.startswith("Publisher's Note"): + elif title.startswith("Publisher's Note") or subtitle.startswith( + "Publisher's Note" + ): material = 'editorial note' else: material = 'publication' @@ -238,7 +239,9 @@ def get_author(self, author_key): affiliations = self.get_author_affiliations(author_key) orcid = self.get_author_orcid(author_key) - return self.builder.make_author(author_name, raw_affiliations=affiliations, ids=orcid) + return self.builder.make_author( + author_name, raw_affiliations=affiliations, ids=orcid + ) @property def authors(self): diff --git a/inspire_utils/parsers/elsevier.py b/inspire_utils/parsers/elsevier.py index b6bd862..203fe0f 100644 --- a/inspire_utils/parsers/elsevier.py +++ b/inspire_utils/parsers/elsevier.py @@ -107,7 +107,8 @@ class ElsevierParser(object): subclassed to customize its behavior. Args: - elsevier_record (Union[str, scrapy.selector.Selector]): the record in Elsevier format to parse. + elsevier_record (Union[str, scrapy.selector.Selector]): the record in Elsevier + format to parse. source (Optional[str]): if provided, sets the ``source`` everywhere in the record. Otherwise, the source is extracted from the Elsevier metadata. """ @@ -141,9 +142,7 @@ def parse(self): if self.imprints_date: self.builder.add_imprint_date(self.imprints_date) elif self.publication_date: - self.builder.add_imprint_date( - self.publication_date.dumps() - ) + self.builder.add_imprint_date(self.publication_date.dumps()) for reference in self.references: self.builder.add_reference(reference) @@ -176,14 +175,17 @@ def references(self): @property def abstract(self): - abstract_nodes = self.root.xpath(".//head/abstract[not(@graphical)]/abstract-sec/simple-para") + abstract_nodes = self.root.xpath( + ".//head/abstract[not(@graphical)]/abstract-sec/simple-para" + ) if not abstract_nodes: return - abstract_paragraphs = [remove_tags( - abstract_node, **self.remove_tags_config_abstract - ).strip("/ \n") for abstract_node in abstract_nodes] + abstract_paragraphs = [ + remove_tags(abstract_node, **self.remove_tags_config_abstract).strip("/ \n") + for abstract_node in abstract_nodes + ] abstract = ' '.join(abstract_paragraphs) return abstract @@ -270,9 +272,7 @@ def dois(self): @property def document_type(self): doctype = None - if self.root.xpath( - "./*[contains(name(),'article') or self::book-review]" - ): + if self.root.xpath("./*[contains(name(),'article') or self::book-review]"): doctype = "article" elif self.root.xpath("./*[self::book or self::simple-book]"): doctype = "book" @@ -428,9 +428,9 @@ def publication_info(self): @property def publisher(self): - publisher = self.root.xpath("string(./RDF/Description/publisher[1])").extract_first( - "Elsevier B.V." 
- ) + publisher = self.root.xpath( + "string(./RDF/Description/publisher[1])" + ).extract_first("Elsevier B.V.") return publisher @@ -443,7 +443,11 @@ def subtitle(self): @property def title(self): title = self.root.xpath("./*/head/title[1]").extract_first() - return remove_tags(title, **self.remove_tags_config_title).strip("\n") if title else None + return ( + remove_tags(title, **self.remove_tags_config_title).strip("\n") + if title + else None + ) @property def year(self): @@ -544,7 +548,9 @@ def get_reference_authors(ref_node): authors = ref_node.xpath("./contribution/authors/author") authors_names = [] for author in authors: - given_names = author.xpath("string(./given-name[1])").extract_first(default="") + given_names = author.xpath("string(./given-name[1])").extract_first( + default="" + ) last_names = author.xpath("string(./surname[1])").extract_first(default="") authors_names.append(" ".join([given_names, last_names]).strip()) return authors_names @@ -562,7 +568,9 @@ def get_reference_editors(ref_node): editors = ref_node.xpath(".//editors/authors/author") editors_names = [] for editor in editors: - given_names = editor.xpath("string(./given-name[1])").extract_first(default="") + given_names = editor.xpath("string(./given-name[1])").extract_first( + default="" + ) last_names = editor.xpath("string(./surname[1])").extract_first(default="") editors_names.append(" ".join([given_names, last_names]).strip()) return editors_names @@ -580,13 +588,13 @@ def get_reference_pages(ref_node): def get_reference_iter(self, ref_node): """Extract one reference. - Args: - ref_node(scrapy.selector.Selector): a selector on a single - reference, i.e. ````. + Args: + ref_node(scrapy.selector.Selector): a selector on a single + reference, i.e. ````. - Yields: - dict: the parsed reference, as generated by - :class:`inspire_schemas.api.ReferenceBuilder` + Yields: + dict: the parsed reference, as generated by + :class:`inspire_schemas.api.ReferenceBuilder` """ # handle also unstructured refs for citation_node in ref_node.xpath("./reference|./other-ref"): @@ -599,7 +607,10 @@ def get_reference_iter(self, ref_node): ) fields = [ - (("string(.//series/title/maintitle[1])"), builder.set_journal_title,), + ( + ("string(.//series/title/maintitle[1])"), + builder.set_journal_title, + ), ( "string(.//title[parent::edited-book|parent::book]/maintitle[1])", builder.add_parent_title, @@ -646,7 +657,7 @@ def get_reference_iter(self, ref_node): "|self::label" "|self::publisher" "|self::doi" - "|self::pages" + "|self::pages", ) .strip("\"';,. \t\n\r") .replace("()", "") diff --git a/inspire_utils/parsers/jats.py b/inspire_utils/parsers/jats.py index fbe18b6..c723f0e 100644 --- a/inspire_utils/parsers/jats.py +++ b/inspire_utils/parsers/jats.py @@ -35,9 +35,7 @@ from inspire_utils.helpers import maybe_int, remove_tags from inspire_utils.utils import get_node -JOURNAL_TITLES_MAPPING = { - "Physics": "APS Physics" -} +JOURNAL_TITLES_MAPPING = {"Physics": "APS Physics"} class JatsParser(object): @@ -51,6 +49,7 @@ class JatsParser(object): source (Optional[str]): if provided, sets the ``source`` everywhere in the record. Otherwise, the source is extracted from the JATS metadata. 
""" + def __init__(self, jats_record, source=None): self.root = self.get_root_node(jats_record) if not source: @@ -104,7 +103,7 @@ def references(self): remove_tags_config_abstract = { 'allowed_tags': ['sup', 'sub'], 'allowed_trees': ['math'], - 'strip': 'self::pub-id|self::issn' + 'strip': 'self::pub-id|self::issn', } remove_tags_config_title = { @@ -118,7 +117,9 @@ def abstract(self): if not abstract_nodes: return - abstract = remove_tags(abstract_nodes[0], **self.remove_tags_config_abstract).strip() + abstract = remove_tags( + abstract_nodes[0], **self.remove_tags_config_abstract + ).strip() return abstract @property @@ -129,7 +130,9 @@ def article_type(self): @property def artid(self): - artid = self.root.xpath('./front/article-meta//elocation-id//text()').extract_first() + artid = self.root.xpath( + './front/article-meta//elocation-id//text()' + ).extract_first() return artid @@ -166,28 +169,34 @@ def copyright(self): @property def copyright_holder(self): - copyright_holder = self.root.xpath('./front//copyright-holder/text()').extract_first() + copyright_holder = self.root.xpath( + './front//copyright-holder/text()' + ).extract_first() return copyright_holder @property def copyright_statement(self): - copyright_statement = self.root.xpath('./front//copyright-statement/text()').extract_first() + copyright_statement = self.root.xpath( + './front//copyright-statement/text()' + ).extract_first() return copyright_statement @property def copyright_year(self): - copyright_year = self.root.xpath('./front//copyright-year/text()').extract_first() + copyright_year = self.root.xpath( + './front//copyright-year/text()' + ).extract_first() return maybe_int(copyright_year) @property def dois(self): - doi_values = self.root.xpath('./front/article-meta//article-id[@pub-id-type="doi"]/text()').extract() - dois = [ - {'doi': value, 'material': self.material} for value in doi_values - ] + doi_values = self.root.xpath( + './front/article-meta//article-id[@pub-id-type="doi"]/text()' + ).extract() + dois = [{'doi': value, 'material': self.material} for value in doi_values] if self.material != 'publication': doi_values = self.root.xpath( @@ -222,20 +231,26 @@ def journal_title(self): @property def journal_issue(self): - journal_issue = self.root.xpath('./front/article-meta/issue/text()').extract_first() + journal_issue = self.root.xpath( + './front/article-meta/issue/text()' + ).extract_first() return journal_issue @property def journal_volume(self): - journal_volume = self.root.xpath('./front/article-meta/volume/text()').extract_first() + journal_volume = self.root.xpath( + './front/article-meta/volume/text()' + ).extract_first() return journal_volume @property def keywords(self): keyword_groups = self.root.xpath('./front//kwd-group') - keywords = itertools.chain.from_iterable(self.get_keywords(group) for group in keyword_groups) + keywords = itertools.chain.from_iterable( + self.get_keywords(group) for group in keyword_groups + ) return keywords @@ -251,7 +266,11 @@ def license(self): @property def license_statement(self): - license_statement = self.root.xpath('string(./front/article-meta//license)').extract_first().strip() + license_statement = ( + self.root.xpath('string(./front/article-meta//license)') + .extract_first() + .strip() + ) return license_statement @@ -279,13 +298,17 @@ def material(self): @property def number_of_pages(self): - number_of_pages = maybe_int(self.root.xpath('./front/article-meta//page-count/@count').extract_first()) + number_of_pages = maybe_int( + 
self.root.xpath('./front/article-meta//page-count/@count').extract_first() + ) return number_of_pages @property def page_start(self): - page_start = self.root.xpath('./front/article-meta/fpage/text()').extract_first() + page_start = self.root.xpath( + './front/article-meta/fpage/text()' + ).extract_first() return page_start @@ -305,9 +328,7 @@ def publication_date(self): ) if date_nodes: - publication_date = min( - self.get_date(date_node) for date_node in date_nodes - ) + publication_date = min(self.get_date(date_node) for date_node in date_nodes) return publication_date @@ -382,13 +403,11 @@ def year(self): './front//pub-date[@pub-type="ppub"] |' './front//pub-date[starts-with(@date-type,"pub") and $not_online] |' './front//date[starts-with(@date-type,"pub") and $not_online]', - not_online=not_online + not_online=not_online, ) if date_nodes: - year = min( - self.get_date(date_node) for date_node in date_nodes - ).year + year = min(self.get_date(date_node) for date_node in date_nodes).year return year @@ -401,7 +420,8 @@ def get_author_affiliations(self, author_node): referred_ids.update(set(raw_referred_id.split(' '))) affiliations = [ - self.get_affiliation(rid) for rid in referred_ids + self.get_affiliation(rid) + for rid in referred_ids if self.get_affiliation(rid) ] @@ -439,11 +459,7 @@ def _get_iso_date(iso_date_string): @staticmethod def _get_date_from_parts(year, month, day): - possible_dates = [ - [year, month, day], - [year, month], - [year] - ] + possible_dates = [[year, month, day], [year, month], [year]] # we try different date combinations # cause even if date part is not None # it can raise validation error @@ -485,10 +501,15 @@ def get_date(self, date_node): def get_keywords(group_node): """Extract keywords from a keyword group.""" schema = None - if 'pacs' in group_node.xpath('@kwd-group-type').extract_first(default='').lower(): + if ( + 'pacs' + in group_node.xpath('@kwd-group-type').extract_first(default='').lower() + ): schema = 'PACS' - keywords = (kwd.xpath('string(.)').extract_first() for kwd in group_node.xpath('.//kwd')) + keywords = ( + kwd.xpath('string(.)').extract_first() for kwd in group_node.xpath('.//kwd') + ) keyword_dicts = ({'keyword': keyword, 'schema': schema} for keyword in keywords) return keyword_dicts @@ -507,10 +528,7 @@ def get_root_node(jats_record): scrapy.selector.Selector: a selector on the root ``
`` node. """ - if isinstance(jats_record, six.string_types): - root = get_node(jats_record) - else: - root = jats_record + root = get_node(jats_record) if isinstance(jats_record, six.string_types) else jats_record root.remove_namespaces() return root @@ -531,15 +549,14 @@ def get_author(self, author_node): orcid = self.get_orcid(author_node) author_ids = [("ORCID", orcid)] if orcid else [] return self.builder.make_author( - author_name, - raw_affiliations=affiliations, - emails=emails, - ids=author_ids + author_name, raw_affiliations=affiliations, emails=emails, ids=author_ids ) @staticmethod def get_orcid(author_node): - orcid = author_node.xpath('./contrib-id[@contrib-id-type="orcid"]/text()').extract_first() + orcid = author_node.xpath( + './contrib-id[@contrib-id-type="orcid"]/text()' + ).extract_first() if orcid: return normalize_orcid(orcid) @@ -555,8 +572,7 @@ def get_reference_authors(ref_node, role): List[str]: list of names """ return ref_node.xpath( - './person-group[@person-group-type=$role]/string-name/text()', - role=role + './person-group[@person-group-type=$role]/string-name/text()', role=role ).extract() def get_reference(self, ref_node): @@ -576,7 +592,7 @@ def get_reference(self, ref_node): builder.add_raw_reference( ref_node.extract().strip(), source=self.builder.source, - ref_format='JATS' + ref_format='JATS', ) fields = [ @@ -600,10 +616,13 @@ def get_reference(self, ref_node): ( 'pub-id[@pub-id-type="other"]' '[contains(preceding-sibling::text(),"Report No")]/text()', - builder.add_report_number + builder.add_report_number, ), ('./article-title/text()', builder.add_title), - ('../label/text()', lambda x, builder=builder: builder.set_label(x.strip('[].'))) + ( + '../label/text()', + lambda x, builder=builder: builder.set_label(x.strip('[].')), + ), ] for xpath, field_handler in fields: diff --git a/inspire_utils/query.py b/inspire_utils/query.py index 75c7996..a1398c6 100644 --- a/inspire_utils/query.py +++ b/inspire_utils/query.py @@ -26,15 +26,18 @@ def wrap_queries_in_bool_clauses_if_more_than_one( queries, use_must_clause, preserve_bool_semantics_if_one_clause=False ): """Helper for wrapping a list of queries into a bool.{must, should} clause. + Args: queries (list): List of queries to be wrapped in a bool.{must, should} clause. - use_must_clause (bool): Flag that signifies whether to use 'must' or 'should' clause. - preserve_bool_semantics_if_one_clause (bool): Flag that signifies whether to generate a bool query even if - there's only one clause. This happens to generate boolean query semantics. Usually not the case, but - useful for boolean queries support. + use_must_clause (bool): Flag that signifies whether to use + 'must' or 'should' clause. + preserve_bool_semantics_if_one_clause (bool): Flag that signifies whether to + generate a bool query even if there's only one clause. This happens to generate + boolean query semantics. Usually not the case, but useful for boolean queries support. Returns: - (dict): If len(queries) > 1, the bool clause, otherwise if len(queries) == 1, will return the query itself, - while finally, if len(queries) == 0, then an empty dictionary is returned. + (dict): If len(queries) > 1, the bool clause, otherwise if len(queries) == 1, + will return the query itself,while finally, if len(queries) == 0, + then an empty dictionary is returned. 
""" if not queries: return {} @@ -48,9 +51,7 @@ def wrap_queries_in_bool_clauses_if_more_than_one( def ordered(obj): - """ - Helper to order the dictionary - """ + """Helper to order the dictionary.""" # See https://stackoverflow.com/a/25851972 if isinstance(obj, dict): return sorted((k, ordered(v)) for k, v in obj.items()) diff --git a/inspire_utils/record.py b/inspire_utils/record.py index 4e25369..f7fe520 100644 --- a/inspire_utils/record.py +++ b/inspire_utils/record.py @@ -504,12 +504,11 @@ def get_values_for_schema(elements, schema): def replace_undesirable_characters(line): - """ - Replace certain bad characters in a text line. - @param line: (string) the text line in which bad characters are to - be replaced. - @return: (string) the text line after the bad characters have been - replaced. + """Replace certain bad characters in a text line. @param line: (string) the + text line in which bad characters are to. + + be replaced. @return: (string) the text line after the + bad characters have been replaced. """ # These are separate because we want a particular order for bad_string, replacement in UNDESIRABLE_STRING_REPLACEMENTS: diff --git a/inspire_utils/urls.py b/inspire_utils/urls.py index ea339d6..efe50bd 100644 --- a/inspire_utils/urls.py +++ b/inspire_utils/urls.py @@ -19,7 +19,6 @@ # In applying this license, CERN does not waive the privileges and immunities # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. - """URL-related utils.""" from __future__ import absolute_import, division, print_function @@ -45,7 +44,7 @@ def ensure_scheme(url, default_scheme='http'): netloc=parsed.path, path='', query=parsed.query, - fragment=parsed.fragment + fragment=parsed.fragment, ) return urlunsplit(parsed) diff --git a/inspire_utils/utils.py b/inspire_utils/utils.py index 6c6b67d..3a99c7c 100644 --- a/inspire_utils/utils.py +++ b/inspire_utils/utils.py @@ -12,7 +12,9 @@ r'\b(?:for|on behalf of|representing)\b', re.IGNORECASE, ) -INST_PHRASES = ['for the development', ] +INST_PHRASES = [ + 'for the development', +] def get_node(text, namespaces=None): @@ -25,7 +27,7 @@ def get_node(text, namespaces=None): def coll_cleanforthe(coll): - """ Cleanup collaboration, try to find author """ + """Cleanup collaboration, try to find author.""" author = None if any(phrase for phrase in INST_PHRASES if phrase in coll.lower()): @@ -52,10 +54,10 @@ def coll_cleanforthe(coll): def split_fullname(author, switch_name_order=False): """Split an author name to surname and given names. - It accepts author strings with and without comma separation. - As default surname is first in case of comma separation, otherwise last. - Multi-part surnames are incorrectly detected in strings without comma - separation. + It accepts author strings with and without comma separation. As + default surname is first in case of comma separation, otherwise + last. Multi-part surnames are incorrectly detected in strings + without comma separation. 
""" if not author: return "", "" @@ -112,7 +114,7 @@ def split_fullname(author, switch_name_order=False): 'talk', 'talks', 'workshop', - 'workshops' + 'workshops', ] THESIS_WORDS = [ @@ -136,5 +138,5 @@ def split_fullname(author, switch_name_order=False): 'staatsexamensarbeit', 'tesi', 'thesis', - 'travail' + 'travail', ] diff --git a/ruff.toml b/ruff.toml index 843a909..8c073e3 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,4 +1,5 @@ target-version = "py311" +line-length = 100 [lint.flake8-tidy-imports] ban-relative-imports = "all" @@ -19,7 +20,7 @@ select = [ # flake8-pytest-style "PT", ] -ignore = ["B904", "E501"] +ignore = ["B904"] [lint.pycodestyle] diff --git a/tests/test_config.py b/tests/test_config.py index 525d6e2..ae02fa9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -42,10 +42,12 @@ def test_config(tmpdir): mock_config = tmpdir.join("inspirehep.cfg") mock_config.write("SERVER_NAME = '0.0.0.0'; OTHER_VARIABLE = 42") - config = Config(defaults={ - 'SERVER_NAME': '127.0.0.1', - 'SOME_OTHER_DEFAULT': 1234, - }) + config = Config( + defaults={ + 'SERVER_NAME': '127.0.0.1', + 'SOME_OTHER_DEFAULT': 1234, + } + ) config.load_pyfile(mock_config.strpath) assert config['SERVER_NAME'] == '0.0.0.0' @@ -84,9 +86,12 @@ def test_config_invalid_file(tmpdir): with pytest.raises(MalformedConfig): config.load_pyfile(mock_config.strpath) + @pytest.mark.usefixtures(name="_restore_cwd") def test_load_config(tmpdir): - mock_inspirehep_var_cfg = tmpdir.mkdir('var').mkdir('inspirehep-instance').join("inspirehep.cfg") + mock_inspirehep_var_cfg = ( + tmpdir.mkdir('var').mkdir('inspirehep-instance').join("inspirehep.cfg") + ) mock_inspirehep_var_cfg.write("SERVER_NAME = '0.0.0.0'") mock_inspirehep_cfg = tmpdir.join("inspirehep.cfg") diff --git a/tests/test_date.py b/tests/test_date.py index 66b1eff..e66f92e 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -68,7 +68,9 @@ def test_partial_date_raises_on_day_with_no_month(): def test_partial_date_raises_on_wrong_types(): - with pytest.raises(TypeError, match='arguments to PartialDate must be of type int or None'): + with pytest.raises( + TypeError, match='arguments to PartialDate must be of type int or None' + ): PartialDate('1686', '6', '30') diff --git a/tests/test_helpers.py b/tests/test_helpers.py index c0f0672..e317d1d 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -83,7 +83,8 @@ def test_maybe_int_returns_none_otherwise(): def test_remove_tags_allowed_trees_strip(): allowed_trees = ('b',) strip = '@class="hidden"' - snippet = '
<div>Only this text remains.</div><div class="hidden"></div>'
+    snippet = ('<div>Only this text remains.</div>'
+               '<div class="hidden"></div>
') result = remove_tags(snippet, allowed_trees=allowed_trees, strip=strip) expected = u'Only this text remains.' @@ -94,7 +95,8 @@ def test_remove_tags_allowed_trees_strip(): def test_remove_tags_allowed_tags_strip(): allowed_tags = ('b',) strip = '@class="hidden"' - snippet = '
<div>Only this text remains.</div><div class="hidden"></div>'
+    snippet = ('<div>Only this text remains.</div>'
+               '<div class="hidden"></div>
') result = remove_tags(snippet, allowed_tags=allowed_tags, strip=strip) expected = u'Only this text remains.' @@ -105,7 +107,8 @@ def test_remove_tags_allowed_tags_strip(): def test_remove_tags_allowed_tags_strip_preserves_text(): allowed_tags = ('i',) strip = '@class="hidden"' - snippet = '
<div><span>Only this text remains.</span></div><div class="hidden"></div>'
+    snippet = ('<div><span>Only this text remains.</span></div>'
+               '<div class="hidden"></div>
')
 
     result = remove_tags(snippet, allowed_tags=allowed_tags, strip=strip)
     expected = u'Only this text remains.'
diff --git a/tests/test_name.py b/tests/test_name.py
index af0a3da..9e108de 100644
--- a/tests/test_name.py
+++ b/tests/test_name.py
@@ -120,8 +120,11 @@ def test_normalize_name_handles_suffixes(input_author_name, expected):
 
 @pytest.mark.parametrize(
     ("input_author_name", "expected"),
-    [("Sir John Smith", "Smith, John"),
-     ("Bao, Hon", "Bao, Hon"), ("ed witten", "Witten, Ed")],
+    [
+        ("Sir John Smith", "Smith, John"),
+        ("Bao, Hon", "Bao, Hon"),
+        ("ed witten", "Witten, Ed"),
+    ],
 )
 def test_normalize_name_handles_titles(input_author_name, expected):
     assert normalize_name(input_author_name) == expected
@@ -416,8 +419,7 @@ def test_format_author_name():
 def test_format_author_name_with_initials():
     expected = "S. M. Lieber"
 
-    assert expected == format_name(
-        "Lieber, Stanley Martin", initials_only=True)
+    assert expected == format_name("Lieber, Stanley Martin", initials_only=True)
 
 
 def test_format_author_name_with_initials_with_all_caps_name():
@@ -439,15 +441,20 @@ def test_parsed_name_initials():
 
 @pytest.mark.parametrize(
     ("input_author_name", "expected"),
-    [("Lieber, Ed", "E. Lieber"),
-     ('Lieber, Ed Viktor', "E. V. Lieber"),
-     ('Lieber, Ed Jr.', "E. Lieber, Jr."),
-     ('Lieber, Ed Victor Jr.', "E. V. Lieber, Jr."),
-     ],
+    [
+        ("Lieber, Ed", "E. Lieber"),
+        ('Lieber, Ed Viktor', "E. V. Lieber"),
+        ('Lieber, Ed Jr.', "E. Lieber, Jr."),
+        ('Lieber, Ed Victor Jr.', "E. V. Lieber, Jr."),
+    ],
 )
-def test_format_author_name_with_initials_when_first_name_is_similar_to_title(input_author_name, expected):
+def test_format_author_name_with_initials_when_first_name_is_similar_to_title(
+    input_author_name, expected
+):
 
-    assert expected == format_name(input_author_name, initials_only=True, without_titles=True)
+    assert expected == format_name(
+        input_author_name, initials_only=True, without_titles=True
+    )
 
 
 def test_parsed_wrong_names_and_not_fail():
@@ -485,9 +492,9 @@ def test_first_name_with_dash_is_printed_with_dash_and_initialized_correctly():
 
 def test_first_name_initials_without_whitespace_is_initialized_correctly():
-    assert format_name(
-        "Miguel A-M.G. Garcia", initials_only=True
-    ) == u"M. A. M. G. Garcia"
+    assert (
+        format_name("Miguel A-M.G. Garcia", initials_only=True) == u"M. A. M. G. Garcia"
+    )
 
 
 def test_last_name_recognized_correctly_regression_test():
@@ -1348,48 +1355,52 @@ def test_generate_es_query_title_name():
     name = "ed witten"
     expected_query = {
         'nested': {
-            'path': 'authors', 'query': {
+            'path': 'authors',
+            'query': {
                 'bool': {
                     'must': [
                         {
                             'match': {
                                 u'authors.last_name': {
                                     'operator': 'AND',
-                                    'query': u'Witten'
+                                    'query': u'Witten',
                                 }
                             }
-                        }, {
+                        },
+                        {
                             'bool': {
                                 'should': [
                                     {
                                         'match_phrase_prefix': {
                                             u'authors.first_name': {
                                                 'query': u'Ed',
-                                                'analyzer': 'names_analyzer'
+                                                'analyzer': 'names_analyzer',
                                             }
                                         }
-                                    }, {
+                                    },
+                                    {
                                         'match': {
                                             u'authors.first_name': {
                                                 'operator': 'AND',
                                                 'query': u'Ed',
-                                                'analyzer': 'names_initials_analyzer'
+                                                'analyzer': 'names_initials_analyzer',
                                             }
                                         }
-                                    }, {
+                                    },
+                                    {
                                         'match': {
                                             u'authors.full_name': {
                                                 'operator': 'AND',
-                                                'query': 'Ed Witten'
+                                                'query': 'Ed Witten',
                                             }
                                         }
-                                    }
+                                    },
                                 ]
                             }
-                        }
+                        },
                     ]
                 }
-            }
+            },
         }
     }
     parsed_name = ParsedName(name)
diff --git a/tests/test_parsers_arxiv.py b/tests/test_parsers_arxiv.py
index 5e17a5e..6e1938e 100644
--- a/tests/test_parsers_arxiv.py
+++ b/tests/test_parsers_arxiv.py
@@ -51,15 +51,23 @@ def test_latex_to_unicode_preserves_math():
 
 
 def test_latex_to_unicode_preserves_braces_containing_more_than_one_char():
-    expected = u"On the origin of the Type~{\\sc ii} spicules - dynamic 3D MHD simulations"
-    result = ArxivParser.latex_to_unicode(u"On the origin of the Type~{\\sc ii} spicules - dynamic 3D MHD simulations")
+    expected = (
+        u"On the origin of the Type~{\\sc ii} spicules - dynamic 3D MHD simulations"
+    )
+    result = ArxivParser.latex_to_unicode(
+        u"On the origin of the Type~{\\sc ii} spicules - dynamic 3D MHD simulations"
+    )
 
     assert result == expected
 
 
def test_latex_to_unicode_preserves_comments():
-    expected = u"A 4% measurement of $H_0$ using the cumulative distribution of strong-lensing time delays in doubly-imaged quasars"
-    result = ArxivParser.latex_to_unicode(u"A 4% measurement of $H_0$ using the cumulative distribution of strong-lensing time delays in doubly-imaged quasars")
+    expected = (u"A 4% measurement of $H_0$ using the cumulative "
+                u"distribution of strong-lensing time delays in doubly-imaged quasars")
+    result = ArxivParser.latex_to_unicode(
+        (u"A 4% measurement of $H_0$ using the cumulative "
+         u"distribution of strong-lensing time delays in doubly-imaged quasars")
+    )
 
     assert result == expected
diff --git a/tests/test_parsers_crossref.py b/tests/test_parsers_crossref.py
index 2c12cc5..8fa8303 100644
--- a/tests/test_parsers_crossref.py
+++ b/tests/test_parsers_crossref.py
@@ -54,13 +54,16 @@ def get_parser_by_file(filename):
     return CrossrefParser(aps_crossref)
 
 
-@pytest.fixture(scope='module', params=[
-    ('2018.3804742.json', '2018.3804742_expected.yml'),
-    ('tasc.2017.2776938.json', 'tasc.2017.2776938_expected.yml'),
-    ('9781316535783.011.json', '9781316535783.011_expected.yml'),
-    ('PhysRevB.33.3547.2.json', 'PhysRevB.33.3547.2_expected.yml'),
-    ('s1463-4988(99)00060-3.json', 's1463-4988(99)00060-3_expected.yml'),
-])
+@pytest.fixture(
+    scope='module',
+    params=[
+        ('2018.3804742.json', '2018.3804742_expected.yml'),
+        ('tasc.2017.2776938.json', 'tasc.2017.2776938_expected.yml'),
+        ('9781316535783.011.json', '9781316535783.011_expected.yml'),
+        ('PhysRevB.33.3547.2.json', 'PhysRevB.33.3547.2_expected.yml'),
+        ('s1463-4988(99)00060-3.json', 's1463-4988(99)00060-3_expected.yml'),
+    ],
+)
 def records(request):
     return {
         'crossref': get_parser_by_file(request.param[0]),
@@ -99,24 +102,20 @@ def test_data_completeness(records):
         assert field in all_fields
 
 
-@pytest.mark.parametrize(
-    'field_name',
-    REQUIRED_FIELDS
-)
+@pytest.mark.parametrize('field_name', REQUIRED_FIELDS)
 def test_required_fields(field_name, records):
-    '''Check every field in this list since all of them are required in a Crossref record'''
+    """Check every field in this list since all of them are required in a
+    Crossref record."""
     result = getattr(records['crossref'], field_name)
     expected = records['expected'][field_name]
 
     assert result == expected
 
 
-@pytest.mark.parametrize(
-    'field_name',
-    UNREQUIRED_FIELDS
-)
+@pytest.mark.parametrize('field_name', UNREQUIRED_FIELDS)
 def test_unrequired_fields(field_name, records):
-    '''Check if the field was parsed correctly only if the field exists in this record'''
+    """Check if the field was parsed correctly only if the field exists in this
+    record."""
     if field_name in records['expected']:
         result = getattr(records['crossref'], field_name)
         expected = records['expected'][field_name]
diff --git a/tests/test_parsers_elsevier.py b/tests/test_parsers_elsevier.py
index 234e67e..fb79d0a 100644
--- a/tests/test_parsers_elsevier.py
+++ b/tests/test_parsers_elsevier.py
@@ -55,17 +55,20 @@ def get_parser_by_file(filename):
     return ElsevierParser(aps_elsevier)
 
 
-@pytest.fixture(scope='module', params=[
-    ('j.nima.2019.162787.xml', 'j.nima.2019.162787_expected.yml'),
-    ('j.nuclphysa.2020.121991.xml', 'j.nuclphysa.2020.121991_expected.yml'),
-    ('j.nima.2019.162728.xml', 'j.nima.2019.162728_expected.yml'),
-    ('j.nimb.2019.04.063.xml', 'j.nimb.2019.04.063_expected.yml'),
-    ('j.cpc.2020.107740.xml', 'j.cpc.2020.107740_expected.yml'),
-    ('j.scib.2020.01.008.xml', 'j.scib.2020.01.008_expected.yml'),
-    ('aphy.2001.6176.xml', 'aphy.2001.6176_expected.yml'),
-    ('j.aim.2021.107831.xml', 'j.aim.2021.107831_expected.yml'),
-    ('j.nuclphysa.2020.121992.xml', 'j.nuclphysa.2020.121992_expected.yml'),
-])
+@pytest.fixture(
+    scope='module',
+    params=[
+        ('j.nima.2019.162787.xml', 'j.nima.2019.162787_expected.yml'),
+        ('j.nuclphysa.2020.121991.xml', 'j.nuclphysa.2020.121991_expected.yml'),
+        ('j.nima.2019.162728.xml', 'j.nima.2019.162728_expected.yml'),
+        ('j.nimb.2019.04.063.xml', 'j.nimb.2019.04.063_expected.yml'),
+        ('j.cpc.2020.107740.xml', 'j.cpc.2020.107740_expected.yml'),
+        ('j.scib.2020.01.008.xml', 'j.scib.2020.01.008_expected.yml'),
+        ('aphy.2001.6176.xml', 'aphy.2001.6176_expected.yml'),
+        ('j.aim.2021.107831.xml', 'j.aim.2021.107831_expected.yml'),
+        ('j.nuclphysa.2020.121992.xml', 'j.nuclphysa.2020.121992_expected.yml'),
+    ],
+)
 def records(request):
     return {
         'elsevier': get_parser_by_file(request.param[0]),
@@ -97,11 +100,7 @@ def records(request):
     'journal_issue',
     'is_conference_paper',
 ]
-FIELDS_TO_CHECK_SEPARATELY = [
-    'publication_date',
-    'documents',
-    'collaborations'
-]
+FIELDS_TO_CHECK_SEPARATELY = ['publication_date', 'documents', 'collaborations']
 
 
 def test_data_completeness(records):
@@ -110,10 +109,7 @@ def test_data_completeness(records):
         assert field in tested_fields
 
 
-@pytest.mark.parametrize(
-    'field_name',
-    FIELDS_TO_CHECK
-)
+@pytest.mark.parametrize('field_name', FIELDS_TO_CHECK)
 def test_field(field_name, records):
     result = getattr(records['elsevier'], field_name)
     expected = records['expected'][field_name]
@@ -148,8 +144,7 @@ def test_parse(records):
 def test_attach_fulltext_document(records):
     parser = records['elsevier']
     parser.attach_fulltext_document(
-        records['file_name'],
-        'http://example.org/{}'.format(records['file_name'])
+        records['file_name'], 'http://example.org/{}'.format(records['file_name'])
     )
     result = parser.parse()
     assert result['documents'] == records['expected']['documents']
diff --git a/tests/test_parsers_jats.py b/tests/test_parsers_jats.py
index d222dab..a7c420b 100644
--- a/tests/test_parsers_jats.py
+++ b/tests/test_parsers_jats.py
@@ -55,13 +55,16 @@ def get_parser_by_file(filename):
     return JatsParser(aps_jats)
 
 
-@pytest.fixture(scope='module', params=[
-    ('PhysRevD.102.014505.xml', 'PhysRevD.102.014505_expected.yml'),
-    ('PhysRevX.7.021022.xml', 'PhysRevX.7.021022_expected.yml'),
-    ('PhysRevX.4.021018.xml', 'PhysRevX.4.021018_expected.yml'),
-    ('PhysRevD.96.095036.xml', 'PhysRevD.96.095036_expected.yml'),
-    ('PhysRevX.7.021021.xml', 'PhysRevX.7.021021_expected.yml'),
-])
+@pytest.fixture(
+    scope='module',
+    params=[
+        ('PhysRevD.102.014505.xml', 'PhysRevD.102.014505_expected.yml'),
+        ('PhysRevX.7.021022.xml', 'PhysRevX.7.021022_expected.yml'),
+        ('PhysRevX.4.021018.xml', 'PhysRevX.4.021018_expected.yml'),
+        ('PhysRevD.96.095036.xml', 'PhysRevD.96.095036_expected.yml'),
+        ('PhysRevX.7.021021.xml', 'PhysRevX.7.021021_expected.yml'),
+    ],
+)
 def records(request):
     return {
         'jats': get_parser_by_file(request.param[0]),
@@ -105,10 +108,7 @@ def test_data_completeness(records):
         assert field in tested_fields
 
 
-@pytest.mark.parametrize(
-    'field_name',
-    FIELDS_TO_CHECK
-)
+@pytest.mark.parametrize('field_name', FIELDS_TO_CHECK)
 def test_field(field_name, records):
     result = getattr(records['jats'], field_name)
     expected = records['expected'][field_name]
@@ -145,8 +145,7 @@ def test_parse(records):
 def test_attach_fulltext_document(records):
     parser = records['jats']
     parser.attach_fulltext_document(
-        records['file_name'],
-        'http://example.org/{}'.format(records['file_name'])
+        records['file_name'], 'http://example.org/{}'.format(records['file_name'])
     )
     result = parser.parse()
diff --git a/tests/test_record.py b/tests/test_record.py
index 31e65c3..1cdbb68 100644
--- a/tests/test_record.py
+++ b/tests/test_record.py
@@ -74,9 +74,7 @@ def test_get_value_allows_slices_in_paths():
 
 
 def test_get_value_returns_none_if_inner_key_does_not_exist_on_string():
-    record = {
-        'foo': 'bar'
-    }
+    record = {'foo': 'bar'}
 
     result = get_value(record, 'foo.value')
 
diff --git a/tests/test_urls.py b/tests/test_urls.py
index aac1c37..eccc167 100644
--- a/tests/test_urls.py
+++ b/tests/test_urls.py
@@ -54,7 +54,7 @@
         'explicit netloc',
         'with scheme, no netloc',
         'without scheme, no netloc',
-    ]
+    ],
 )
 def test_ensure_scheme(url, scheme, expected):
     if scheme:
@@ -86,7 +86,7 @@ def test_ensure_scheme(url, scheme, expected):
         'integer recid, scheme already present',
         'string recid, no scheme',
         'unicode url',
-    ]
+    ],
 )
 def test_record_url_by_pattern(pattern, recid, expected):
     assert record_url_by_pattern(pattern, recid) == expected