Skip to content

Commit

Permalink
get_physical_pages: return early if no patterns
Browse files (browse the repository at this point in the history)
  • Loading branch information
kba committed Jan 30, 2024
1 parent: 643d1ef; commit: 517814b
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 38 deletions.
75 changes: 38 additions & 37 deletions src/ocrd_models/ocrd_mets.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -611,45 +611,46 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O
page_attr_patterns += generate_range(*pageId_token.split('..', 1))
else:
page_attr_patterns += [pageId_token]
if page_attr_patterns:
if self._cache_flag:
# determine attr to look for before iterating
if not page_attr_patterns:
return []
if self._cache_flag:
# determine attr to look for before iterating
try:
attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if (
any(p in self._page_cache[a] for p in page_attr_patterns) or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \
for p in page_attr_patterns \
for attr_val in self._page_cache[a]]
)))
for attr_val in self._page_cache[attr].keys():
if attr_val in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]):
if return_divs:
ret.append(self._page_cache[attr][attr_val])
else:
ret.append(attr_val)
except StopIteration:
log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}")
else:
# determine attr during iterating
attr = None
for page in self._tree.getroot().xpath(
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
namespaces=NS):
try:
attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if (
any(p in self._page_cache[a] for p in page_attr_patterns) or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \
for p in page_attr_patterns \
for attr_val in self._page_cache[a]]
)))
for attr_val in self._page_cache[attr].keys():
if attr_val in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]):
if return_divs:
ret.append(self._page_cache[attr][attr_val])
else:
ret.append(attr_val)
if not attr:
attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \
page.get(a.name) in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(page.get(a.name)) for p in page_attr_patterns]))
attr_val = page.get(attr.name)
if attr_val in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]):
if return_divs:
ret.append(page)
else:
ret.append(attr_val)
except StopIteration:
log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}")
else:
# determine attr during iterating
attr = None
for page in self._tree.getroot().xpath(
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
namespaces=NS):
try:
if not attr:
attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \
page.get(a.name) in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(page.get(a.name)) for p in page_attr_patterns]))
attr_val = page.get(attr.name)
if attr_val in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]):
if return_divs:
ret.append(page)
else:
ret.append(attr_val)
except StopIteration:
log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}")
log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}")
return ret

assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright
Expand Down
2 changes: 1 addition & 1 deletion tests/model/test_ocrd_mets.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_find_all_files(sbb_sample_01):
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
assert len(sbb_sample_01.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10'
with raises(ValueError, match='differ in their non-numeric part'):
with pytest.raises(ValueError, match='differ in their non-numeric part'):
len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002'))

def test_find_all_files_local_only(sbb_sample_01):
Expand Down

0 comments on commit 517814b

Please sign in to comment.