diff --git a/CHANGELOG.md b/CHANGELOG.md index a9bc4d67b7..1c49b5d00e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,12 @@ Added: * `ocrd network client discovery processors` to list the processors deployed in the processing server, #1269 * `ocrd network client discovery processor` to get the `ocrd-tool.json` of a deployed processor, #1269 * `ocrd network client processing check-log` to retrieve the log data for a processing job, #1269 + * `ocrd workspace clone`/`Resolver.workspace_from_url`: with `clobber_mets=False`, raise a FileExistsError for existing mets.xml on disk, #563, #1268 + * `ocrd workspace find --download`: print the the correct, up-to-date field, not `None`, #1202, #1266 + +Fixed: + + * Sanitize `self.imageFilename` for the `pcGtsId` to ensure it is a valid `xml:id`, #1271 ## [2.67.2] - 2024-07-19 diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 4c262ec48d..0c70fd3a36 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -467,19 +467,18 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp, ): - ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field] if download and not f.local_filename: workspace.download_file(f) modified_mets = True if wait: time.sleep(wait) if undo_download and f.url and f.local_filename: - ret_entry = [f'Removed local_filename {f.local_filename}'] f.local_filename = None modified_mets = True if not keep_files: ctx.log.debug("rm %s [cwd=%s]", f.local_filename, workspace.directory) unlink(f.local_filename) + ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field] ret.append(ret_entry) if modified_mets: workspace.save_mets() diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py index fa98c82d0e..124d006927 100644 --- a/src/ocrd/resolver.py +++ b/src/ocrd/resolver.py @@ -95,12 +95,15 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url)) return str(ret) - # Respect 'if_exists' arg + # Respect 'if_exists' kwarg if dst_path.exists(): if if_exists == 'skip': + log.debug(f"File already exists but if_exists == {if_exists}, skipping.") return str(ret) - if if_exists == 'raise': - raise FileExistsError(f"File already exists and if_exists == 'raise': {dst_path}") + elif if_exists == 'raise': + raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}") + else: + log.debug(f"File already exists but if_exists == {if_exists}, overwriting.") # Create dst_path parent dir dst_path.parent.mkdir(parents=True, exist_ok=True) @@ -218,7 +221,7 @@ def workspace_from_url( log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'", mets_basename, mets_url, src_baseurl, dst_dir) - self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip') + self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise') workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url) diff --git a/src/ocrd_page_user_methods/id.py b/src/ocrd_page_user_methods/id.py index 91df07c8cd..7b83827b53 100644 --- a/src/ocrd_page_user_methods/id.py +++ b/src/ocrd_page_user_methods/id.py @@ -1,5 +1,6 @@ @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index 2055758a89..b5bbcae121 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -79,6 +79,7 @@ :py:func:`set_json_key_value_overrides`, :py:func:`assert_file_grp_cardinality`, :py:func:`make_file_id` + :py:func:`make_xml_id` :py:func:`generate_range` String and OOP utilities @@ -198,6 +199,7 @@ is_local_filename, is_string, make_file_id, + make_xml_id, nth_url_segment, partition_list, parse_json_string_or_file, diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index 51cce4bf23..dea3715bf4 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -18,6 +18,7 @@ 'partition_list', 'is_string', 'make_file_id', + 'make_xml_id', 'nth_url_segment', 'parse_json_string_or_file', 'parse_json_string_with_comments', @@ -95,12 +96,19 @@ def make_file_id(ocrd_file, output_file_grp): ret = output_file_grp + '_' + ocrd_file.pageId else: ret = output_file_grp + '_' + ocrd_file.ID + return make_xml_id(ret) + +def make_xml_id(idstr: str) -> str: + """ + Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``, removing everything non-alphanumeric, ``.`` and ``-`` and prepending `id_` if ``idstr`` starts with a number. + """ + ret = idstr if not REGEX_FILE_ID.fullmatch(ret): ret = ret.replace(':', '_') ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret) ret = re.sub(r'[^\w.-]', r'', ret) return ret - + def nth_url_segment(url, n=-1): """ Return the last /-delimited segment of a URL-like string diff --git a/tests/test_resolver.py b/tests/test_resolver.py index abcf69257b..16dfd03d56 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -117,6 +117,9 @@ def test_workspace_from_url_kant_with_resources(mock_request, tmp_path): @patch.object(Session, "get") def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path): + """ + Fail with clobber_mets=False, succeeed with clobber_mets=True + """ # arrange url_src = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml' @@ -127,12 +130,14 @@ def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp dst_mets = Path(dst_dir, 'mets.xml') shutil.copyfile(src_mets, dst_mets) - # act - Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir) + # fail + with pytest.raises(FileExistsError) as exc: + Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir) + assert mock_request.call_count == 0 - # assert - # no real request was made, since mets already present - assert mock_request.call_count == 0 + # succeed + Resolver().workspace_from_url(url_src, clobber_mets=True, dst_dir=dst_dir) + assert mock_request.call_count == 1 @patch.object(Session, "get") @@ -229,7 +234,7 @@ def test_workspace_from_nothing_noclobber(tmp_path): ws2 = Resolver().workspace_from_nothing(tmp_path) assert ws2.directory == tmp_path - with pytest.raises(Exception) as exc: + with pytest.raises(FileExistsError) as exc: Resolver().workspace_from_nothing(tmp_path) # assert