Skip to content

Commit

Permalink
implement generic page attribute ranges
Browse files: browse the repository at this point in the history
  • Loading branch information
kba committed Jan 16, 2024
1 parent cfd1c91 commit ee8fb69
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 31 deletions.
4 changes: 2 additions & 2 deletions ocrd/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,9 +696,8 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin
@pass_workspace
def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
"""
Update the @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
update_kwargs = {k: v for k, v in attr_value_pairs}
if order:
update_kwargs['ORDER'] = order
Expand All @@ -707,6 +706,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
if contentids:
update_kwargs['CONTENTIDS'] = contentids
try:
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
workspace.save_mets()
except Exception as err:
Expand Down
70 changes: 46 additions & 24 deletions ocrd_models/ocrd_models/ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _fill_caches(self):
log.debug("DIV_ID: %s" % el_div.get('ID'))

for attr in METS_PAGE_DIV_ATTRIBUTE:
self._page_cache[attr][el_div.get(attr.name)] = el_div
self._page_cache[attr][str(el_div.get(attr.name))] = el_div

# Assign an empty dictionary that will hold the fptr of the added page (div)
self._fptr_cache[div_id] = {}
Expand Down Expand Up @@ -600,37 +600,59 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O
"""
if for_fileIds is None and for_pageIds is None:
return self.physical_pages
log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
if for_pageIds is not None:
ret = []
pageId_patterns = []
page_attr_patterns = []
for pageId_token in re.split(r',', for_pageIds):
if pageId_token.startswith(REGEX_PREFIX):
pageId_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:]))
page_attr_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:]))
elif '..' in pageId_token:
pageId_patterns += generate_range(*pageId_token.split('..', 1))
page_attr_patterns += generate_range(*pageId_token.split('..', 1))
else:
pageId_patterns += [pageId_token]
if self._cache_flag:
for page_id in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys():
if page_id in pageId_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]):
if return_divs:
ret.append(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][page_id])
else:
ret.append(page_id)
else:
for page in self._tree.getroot().xpath(
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
namespaces=NS):
page_id = page.get('ID')
if page_id in pageId_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]):
if return_divs:
ret.append(page)
else:
ret.append(page_id)
page_attr_patterns += [pageId_token]
if page_attr_patterns:
if self._cache_flag:
# determine attr to look for before iterating
try:
attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if (
any(p in self._page_cache[a] for p in page_attr_patterns) or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \
for p in page_attr_patterns \
for attr_val in self._page_cache[a]]
)))
for attr_val in self._page_cache[attr].keys():
if attr_val in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]):
if return_divs:
ret.append(self._page_cache[attr][attr_val])
else:
ret.append(attr_val)
except StopIteration:
log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}")
else:
# determine attr during iterating
attr = None
for page in self._tree.getroot().xpath(
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
namespaces=NS):
try:
if not attr:
attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \
page.get(a.name) in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(page.get(a.name)) for p in page_attr_patterns]))
attr_val = page.get(attr.name)
if attr_val in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]):
if return_divs:
ret.append(page)
else:
ret.append(attr_val)
except StopIteration:
log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}")
return ret

assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright
ret = [None] * len(for_fileIds)
if self._cache_flag:
for pageId in self._fptr_cache.keys():
Expand Down
7 changes: 2 additions & 5 deletions ocrd_utils/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
[build-system]
requires = [
"setuptools>=61",
"setuptools_scm[toml]",
"wheel",
] # PEP 508 specifications.
build-backend = "setuptools.build_meta"

[project]
name = "ocrd_utils"
version = "1.2.3"
authors = [{name = "Konstantin Baierer", email = "[email protected]"}]
license = {text = "Apache License 2.0"}
description = "OCR-D framework - shared code, helpers, constants"
requires-python = ">=3.7"
dynamic = ["version", "dependencies"]
dynamic = ["dependencies"]

[project.readme]
file = "README.md"
Expand All @@ -35,6 +35,3 @@ include-package-data = true

[tool.setuptools.packages.find]
namespaces = false

[tool.setuptools_scm]
root = ".."
2 changes: 2 additions & 0 deletions tests/model/test_ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ def test_find_all_files(sbb_sample_01):
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
assert len(sbb_sample_01.find_all_files(pageId='0..100')) == 35, '35 files in @ORDER range 1..10'

def test_find_all_files_local_only(sbb_sample_01):
assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001',
Expand Down

0 comments on commit ee8fb69

Please sign in to comment.