From 517814bf654770695dba617a35de2adf55e98847 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 30 Jan 2024 19:38:25 +0100 Subject: [PATCH] get_physical_pages: return early if no patterns --- src/ocrd_models/ocrd_mets.py | 75 ++++++++++++++++++----------------- tests/model/test_ocrd_mets.py | 2 +- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 95c29ade87..6be47ad16d 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -611,45 +611,46 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O page_attr_patterns += generate_range(*pageId_token.split('..', 1)) else: page_attr_patterns += [pageId_token] - if page_attr_patterns: - if self._cache_flag: - # determine attr to look for before iterating + if not page_attr_patterns: + return [] + if self._cache_flag: + # determine attr to look for before iterating + try: + attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if ( + any(p in self._page_cache[a] for p in page_attr_patterns) or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \ + for p in page_attr_patterns \ + for attr_val in self._page_cache[a]] + ))) + for attr_val in self._page_cache[attr].keys(): + if attr_val in page_attr_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): + if return_divs: + ret.append(self._page_cache[attr][attr_val]) + else: + ret.append(attr_val) + except StopIteration: + log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}") + else: + # determine attr during iterating + attr = None + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): try: - attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if ( - any(p in self._page_cache[a] for p in page_attr_patterns) or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \ - for p in page_attr_patterns \ - for attr_val in self._page_cache[a]] - ))) - for attr_val in self._page_cache[attr].keys(): - if attr_val in page_attr_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): - if return_divs: - ret.append(self._page_cache[attr][attr_val]) - else: - ret.append(attr_val) + if not attr: + attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \ + page.get(a.name) in page_attr_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(page.get(a.name)) for p in page_attr_patterns])) + attr_val = page.get(attr.name) + if attr_val in page_attr_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): + if return_divs: + ret.append(page) + else: + ret.append(attr_val) except StopIteration: - log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}") - else: - # determine attr during iterating - attr = None - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - try: - if not attr: - attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \ - page.get(a.name) in page_attr_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(page.get(a.name)) for p in page_attr_patterns])) - attr_val = page.get(attr.name) - if attr_val in page_attr_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): - if return_divs: - ret.append(page) - else: - ret.append(attr_val) - except StopIteration: - log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}") + log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}") return ret assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index f33cf23960..125f2ac1ed 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -92,7 +92,7 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10' - with raises(ValueError, match='differ in their non-numeric part'): + with pytest.raises(ValueError, match='differ in their non-numeric part'): len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002')) def test_find_all_files_local_only(sbb_sample_01):