diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py
index f548e8e25e..7480f5962e 100644
--- a/ocrd/ocrd/cli/workspace.py
+++ b/ocrd/ocrd/cli/workspace.py
@@ -696,9 +696,8 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin
 @pass_workspace
 def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
     """
-    Update the @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
+    Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
     """
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
     update_kwargs = {k: v for k, v in attr_value_pairs}
     if order:
         update_kwargs['ORDER'] = order
@@ -707,6 +706,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
     if contentids:
         update_kwargs['CONTENTIDS'] = contentids
     try:
+        workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
         workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
         workspace.save_mets()
     except Exception as err:
diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py
index f2017df10c..ab50694d59 100644
--- a/ocrd_models/ocrd_models/ocrd_mets.py
+++ b/ocrd_models/ocrd_models/ocrd_mets.py
@@ -136,7 +136,7 @@ def _fill_caches(self):
             log.debug("DIV_ID: %s" % el_div.get('ID'))
 
             for attr in METS_PAGE_DIV_ATTRIBUTE:
-                self._page_cache[attr][el_div.get(attr.name)] = el_div
+                self._page_cache[attr][str(el_div.get(attr.name))] = el_div
 
             # Assign an empty dictionary that will hold the fptr of the added page (div)
             self._fptr_cache[div_id] = {}
@@ -600,37 +600,60 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O
         """
         if for_fileIds is None and for_pageIds is None:
             return self.physical_pages
+        log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
         if for_pageIds is not None:
             ret = []
-            pageId_patterns = []
+            page_attr_patterns = []
             for pageId_token in re.split(r',', for_pageIds):
                 if pageId_token.startswith(REGEX_PREFIX):
-                    pageId_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:]))
+                    page_attr_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:]))
                 elif '..' in pageId_token:
-                    pageId_patterns += generate_range(*pageId_token.split('..', 1))
+                    page_attr_patterns += generate_range(*pageId_token.split('..', 1))
                 else:
-                    pageId_patterns += [pageId_token]
-            if self._cache_flag:
-                for page_id in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys():
-                    if page_id in pageId_patterns or \
-                            any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]):
-                        if return_divs:
-                            ret.append(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][page_id])
-                        else:
-                            ret.append(page_id)
-            else:
-                for page in self._tree.getroot().xpath(
-                    'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
-                    namespaces=NS):
-                    page_id = page.get('ID')
-                    if page_id in pageId_patterns or \
-                            any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]):
-                        if return_divs:
-                            ret.append(page)
-                        else:
-                            ret.append(page_id)
+                    page_attr_patterns += [pageId_token]
+            if page_attr_patterns:
+                if self._cache_flag:
+                    # determine attr to look for before iterating
+                    try:
+                        attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if (
+                            any(p in self._page_cache[a] for p in page_attr_patterns) or \
+                            any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \
+                                for p in page_attr_patterns \
+                                for attr_val in self._page_cache[a]]
+                        )))
+                        for attr_val in self._page_cache[attr].keys():
+                            if attr_val in page_attr_patterns or \
+                                any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]):
+                                if return_divs:
+                                    ret.append(self._page_cache[attr][attr_val])
+                                else:
+                                    ret.append(attr_val)
+                    except StopIteration:
+                        log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}")
+                else:
+                    # determine attr during iterating
+                    attr = None
+                    for page in self._tree.getroot().xpath(
+                        'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
+                        namespaces=NS):
+                        try:
+                            # a page div may lack an attribute (page.get() returns None): never hand None to fullmatch()
+                            if not attr:
+                                attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \
+                                    page.get(a.name) in page_attr_patterns or \
+                                    any([isinstance(p, typing.Pattern) and page.get(a.name) is not None and p.fullmatch(page.get(a.name)) for p in page_attr_patterns]))
+                            attr_val = page.get(attr.name)
+                            if attr_val in page_attr_patterns or \
+                                any([isinstance(p, typing.Pattern) and attr_val is not None and p.fullmatch(attr_val) for p in page_attr_patterns]):
+                                if return_divs:
+                                    ret.append(page)
+                                else:
+                                    ret.append(attr_val)
+                        except StopIteration:
+                            log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}")
             return ret
 
+        assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright
         ret = [None] * len(for_fileIds)
         if self._cache_flag:
             for pageId in self._fptr_cache.keys():
diff --git a/ocrd_utils/pyproject.toml b/ocrd_utils/pyproject.toml
index dec419a9bf..b34a8732b4 100644
--- a/ocrd_utils/pyproject.toml
+++ b/ocrd_utils/pyproject.toml
@@ -1,18 +1,18 @@
 [build-system]
 requires = [
     "setuptools>=61",
-    "setuptools_scm[toml]",
     "wheel",
 ] # PEP 508 specifications.
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "ocrd_utils"
+version = "1.2.3"
 authors = [{name = "Konstantin Baierer", email = "unixprog@gmail.com"}]
 license = {text = "Apache License 2.0"}
 description = "OCR-D framework - shared code, helpers, constants"
 requires-python = ">=3.7"
-dynamic = ["version", "dependencies"]
+dynamic = ["dependencies"]
 
 [project.readme]
 file = "README.md"
@@ -35,6 +35,3 @@ include-package-data = true
 
 [tool.setuptools.packages.find]
 namespaces = false
-
-[tool.setuptools_scm]
-root = ".."
diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py
index f31a74b3dc..15636e67fe 100644
--- a/tests/model/test_ocrd_mets.py
+++ b/tests/model/test_ocrd_mets.py
@@ -90,6 +90,7 @@ def test_find_all_files(sbb_sample_01):
     assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
     assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
     assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
+    assert len(sbb_sample_01.find_all_files(pageId='0..100')) == 35, '35 files in @ORDER range 0..100'
 
 def test_find_all_files_local_only(sbb_sample_01):
     assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001',