From 54913fbc1999c02f126a606f265d93e0f6200779 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 1 Aug 2024 21:16:48 +0200 Subject: [PATCH 01/10] workspace find: print correct output fields fixes #1202 and a regression that printed a removal notice instead of the requested output fields when undoing downloads --- src/ocrd/cli/workspace.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 4c262ec48d..0c70fd3a36 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -467,19 +467,18 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp, ): - ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field] if download and not f.local_filename: workspace.download_file(f) modified_mets = True if wait: time.sleep(wait) if undo_download and f.url and f.local_filename: - ret_entry = [f'Removed local_filename {f.local_filename}'] f.local_filename = None modified_mets = True if not keep_files: ctx.log.debug("rm %s [cwd=%s]", f.local_filename, workspace.directory) unlink(f.local_filename) + ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field] ret.append(ret_entry) if modified_mets: workspace.save_mets() From c2e563074868c78d0390c71ccfc697f403b99bcb Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 6 Aug 2024 13:18:15 +0200 Subject: [PATCH 02/10] resolver.download_to_directory: log all if_exists cases on file existing --- src/ocrd/resolver.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py index fa98c82d0e..132aa73df2 100644 --- a/src/ocrd/resolver.py +++ b/src/ocrd/resolver.py @@ -95,12 +95,15 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url)) return str(ret) - # Respect 'if_exists' arg + # Respect 'if_exists' kwarg if dst_path.exists(): if if_exists == 'skip': + log.debug(f"File already exists but if_exists == {if_exists}, skipping.") return str(ret) - if if_exists == 'raise': - raise FileExistsError(f"File already exists and if_exists == 'raise': {dst_path}") + elif if_exists == 'raise': + raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}") + else: + log.debug(f"File already exists but if_exists == {if_exists}, overwriting.") # Create dst_path parent dir dst_path.parent.mkdir(parents=True, exist_ok=True) From 1c8b9db05431d9a52e4763b9efcf4f123e3d65c8 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 6 Aug 2024 13:51:13 +0200 Subject: [PATCH 03/10] resolver.workspace_from_url: raise FileExistsError if mets.xml exists and not clobber_mets, fix #563 --- src/ocrd/resolver.py | 2 +- tests/test_resolver.py | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py index 132aa73df2..124d006927 100644 --- a/src/ocrd/resolver.py +++ b/src/ocrd/resolver.py @@ -221,7 +221,7 @@ def workspace_from_url( log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'", mets_basename, mets_url, src_baseurl, dst_dir) - self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip') + self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise') workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url) diff --git a/tests/test_resolver.py b/tests/test_resolver.py index abcf69257b..16dfd03d56 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -117,6 +117,9 @@ def test_workspace_from_url_kant_with_resources(mock_request, tmp_path): @patch.object(Session, "get") def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path): + """ + Fail with clobber_mets=False, succeeed with clobber_mets=True + """ # arrange url_src = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml' @@ -127,12 +130,14 @@ def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp dst_mets = Path(dst_dir, 'mets.xml') shutil.copyfile(src_mets, dst_mets) - # act - Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir) + # fail + with pytest.raises(FileExistsError) as exc: + Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir) + assert mock_request.call_count == 0 - # assert - # no real request was made, since mets already present - assert mock_request.call_count == 0 + # succeed + Resolver().workspace_from_url(url_src, clobber_mets=True, dst_dir=dst_dir) + assert mock_request.call_count == 1 @patch.object(Session, "get") @@ -229,7 +234,7 @@ def test_workspace_from_nothing_noclobber(tmp_path): ws2 = Resolver().workspace_from_nothing(tmp_path) assert ws2.directory == tmp_path - with pytest.raises(Exception) as exc: + with pytest.raises(FileExistsError) as exc: Resolver().workspace_from_nothing(tmp_path) # assert From ea2e06d5ded2ad5f99f1823ed1dd6776605c657e Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 7 Aug 2024 16:56:42 +0200 Subject: [PATCH 04/10] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14783f8376..d001568686 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Versioned according to [Semantic Versioning](http://semver.org/). Changed: * ocrd_network: Use `ocrd-all-tool.json` bundled by core instead of download from website, #1257, #1260 + * `ocrd workspace clone`/`Resolver.workspace_from_url`: with `clobber_mets=False`, raise a FileExistsError for existing mets.xml on disk, #563, #1268 ## [2.67.2] - 2024-07-19 From 6041785b36ea603da6da205b28a50e4009011b58 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 13 Aug 2024 10:25:55 +0200 Subject: [PATCH 05/10] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d001568686..dd816a3545 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Changed: * ocrd_network: Use `ocrd-all-tool.json` bundled by core instead of download from website, #1257, #1260 * `ocrd workspace clone`/`Resolver.workspace_from_url`: with `clobber_mets=False`, raise a FileExistsError for existing mets.xml on disk, #563, #1268 + * `ocrd workspace find --download`: print the the correct, up-to-date field, not `None`, #1202, #1266 ## [2.67.2] - 2024-07-19 From 1b6aa8fab0c52dcec4638cd36cb8f18126b51751 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:56:18 +0200 Subject: [PATCH 06/10] ocrd_utils: separate make_xml_id --- src/ocrd_utils/str.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index 51cce4bf23..21a79400f5 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -18,6 +18,7 @@ 'partition_list', 'is_string', 'make_file_id', + 'make_xml_id', 'nth_url_segment', 'parse_json_string_or_file', 'parse_json_string_with_comments', @@ -95,12 +96,16 @@ def make_file_id(ocrd_file, output_file_grp): ret = output_file_grp + '_' + ocrd_file.pageId else: ret = output_file_grp + '_' + ocrd_file.ID + return make_xml_id(ret) + +def make_xml_id(idstr: str) -> str: + ret = idstr if not REGEX_FILE_ID.fullmatch(ret): ret = ret.replace(':', '_') ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret) ret = re.sub(r'[^\w.-]', r'', ret) return ret - + def nth_url_segment(url, n=-1): """ Return the last /-delimited segment of a URL-like string From 4d2565df2647cc501c1698c6978ab55e7f055a61 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:57:24 +0200 Subject: [PATCH 07/10] ocrd_utils: export make_xml_id --- src/ocrd_utils/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index 2055758a89..b5bbcae121 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -79,6 +79,7 @@ :py:func:`set_json_key_value_overrides`, :py:func:`assert_file_grp_cardinality`, :py:func:`make_file_id` + :py:func:`make_xml_id` :py:func:`generate_range` String and OOP utilities @@ -198,6 +199,7 @@ is_local_filename, is_string, make_file_id, + make_xml_id, nth_url_segment, partition_list, parse_json_string_or_file, From ace2a7b27b7b27dae1e78b88b5b50239b9babe35 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:59:45 +0200 Subject: [PATCH 08/10] PcGts.id: convert filename to XML id --- src/ocrd_page_user_methods/id.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd_page_user_methods/id.py b/src/ocrd_page_user_methods/id.py index 91df07c8cd..7b83827b53 100644 --- a/src/ocrd_page_user_methods/id.py +++ b/src/ocrd_page_user_methods/id.py @@ -1,5 +1,6 @@ @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) From c2b583c755e3c63e65fc60795d2d6999eea2d33e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 16:46:29 +0200 Subject: [PATCH 09/10] document make_xml_id Co-authored-by: Konstantin Baierer --- src/ocrd_utils/str.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index 21a79400f5..dea3715bf4 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -99,6 +99,9 @@ def make_file_id(ocrd_file, output_file_grp): return make_xml_id(ret) def make_xml_id(idstr: str) -> str: + """ + Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``, removing everything non-alphanumeric, ``.`` and ``-`` and prepending `id_` if ``idstr`` starts with a number. + """ ret = idstr if not REGEX_FILE_ID.fullmatch(ret): ret = ret.replace(':', '_') From 18ecf8f06ec4f411fc97c803182fe0be6430a080 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 14:58:15 +0200 Subject: [PATCH 10/10] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd816a3545..5d5568f549 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ Changed: * `ocrd workspace clone`/`Resolver.workspace_from_url`: with `clobber_mets=False`, raise a FileExistsError for existing mets.xml on disk, #563, #1268 * `ocrd workspace find --download`: print the the correct, up-to-date field, not `None`, #1202, #1266 +Fixed: + + * Sanitize `self.imageFilename` for the `pcGtsId` to ensure it is a valid `xml:id`, #1271 + ## [2.67.2] - 2024-07-19 Fixed: