Skip to content

Commit

Permalink
Merge branch 'master' into extend-network-client
Browse files Browse the repository at this point in the history
# Conflicts:
#	CHANGELOG.md
  • Loading branch information
kba committed Aug 22, 2024
2 parents 15cea57 + 18ecf8f commit 18d743a
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 14 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ Added:
* `ocrd network client discovery processors` to list the processors deployed in the processing server, #1269
* `ocrd network client discovery processor` to get the `ocrd-tool.json` of a deployed processor, #1269
* `ocrd network client processing check-log` to retrieve the log data for a processing job, #1269
* `ocrd workspace clone`/`Resolver.workspace_from_url`: with `clobber_mets=False`, raise a FileExistsError for existing mets.xml on disk, #563, #1268
* `ocrd workspace find --download`: print the the correct, up-to-date field, not `None`, #1202, #1266

Fixed:

* Sanitize `self.imageFilename` for the `pcGtsId` to ensure it is a valid `xml:id`, #1271

## [2.67.2] - 2024-07-19

Expand Down
3 changes: 1 addition & 2 deletions src/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,19 +467,18 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl
include_fileGrp=include_fileGrp,
exclude_fileGrp=exclude_fileGrp,
):
ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
if download and not f.local_filename:
workspace.download_file(f)
modified_mets = True
if wait:
time.sleep(wait)
if undo_download and f.url and f.local_filename:
ret_entry = [f'Removed local_filename {f.local_filename}']
f.local_filename = None
modified_mets = True
if not keep_files:
ctx.log.debug("rm %s [cwd=%s]", f.local_filename, workspace.directory)
unlink(f.local_filename)
ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
ret.append(ret_entry)
if modified_mets:
workspace.save_mets()
Expand Down
11 changes: 7 additions & 4 deletions src/ocrd/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,15 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip',
log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url))
return str(ret)

# Respect 'if_exists' arg
# Respect 'if_exists' kwarg
if dst_path.exists():
if if_exists == 'skip':
log.debug(f"File already exists but if_exists == {if_exists}, skipping.")
return str(ret)
if if_exists == 'raise':
raise FileExistsError(f"File already exists and if_exists == 'raise': {dst_path}")
elif if_exists == 'raise':
raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}")
else:
log.debug(f"File already exists but if_exists == {if_exists}, overwriting.")

# Create dst_path parent dir
dst_path.parent.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -218,7 +221,7 @@ def workspace_from_url(

log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
mets_basename, mets_url, src_baseurl, dst_dir)
self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')
self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise')

workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)

Expand Down
3 changes: 2 additions & 1 deletion src/ocrd_page_user_methods/id.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
@property
def id(self):
from ocrd_utils import make_xml_id
if hasattr(self, 'pcGtsId'):
return self.pcGtsId or ''
return self.imageFilename
return make_xml_id(self.imageFilename)
2 changes: 2 additions & 0 deletions src/ocrd_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
:py:func:`set_json_key_value_overrides`,
:py:func:`assert_file_grp_cardinality`,
:py:func:`make_file_id`
:py:func:`make_xml_id`
:py:func:`generate_range`
String and OOP utilities
Expand Down Expand Up @@ -198,6 +199,7 @@
is_local_filename,
is_string,
make_file_id,
make_xml_id,
nth_url_segment,
partition_list,
parse_json_string_or_file,
Expand Down
10 changes: 9 additions & 1 deletion src/ocrd_utils/str.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
'partition_list',
'is_string',
'make_file_id',
'make_xml_id',
'nth_url_segment',
'parse_json_string_or_file',
'parse_json_string_with_comments',
Expand Down Expand Up @@ -95,12 +96,19 @@ def make_file_id(ocrd_file, output_file_grp):
ret = output_file_grp + '_' + ocrd_file.pageId
else:
ret = output_file_grp + '_' + ocrd_file.ID
return make_xml_id(ret)

def make_xml_id(idstr: str) -> str:
"""
Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``, removing everything non-alphanumeric, ``.`` and ``-`` and prepending `id_` if ``idstr`` starts with a number.
"""
ret = idstr
if not REGEX_FILE_ID.fullmatch(ret):
ret = ret.replace(':', '_')
ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret)
ret = re.sub(r'[^\w.-]', r'', ret)
return ret

def nth_url_segment(url, n=-1):
"""
Return the last /-delimited segment of a URL-like string
Expand Down
17 changes: 11 additions & 6 deletions tests/test_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ def test_workspace_from_url_kant_with_resources(mock_request, tmp_path):

@patch.object(Session, "get")
def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path):
"""
Fail with clobber_mets=False, succeeed with clobber_mets=True
"""

# arrange
url_src = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
Expand All @@ -127,12 +130,14 @@ def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp
dst_mets = Path(dst_dir, 'mets.xml')
shutil.copyfile(src_mets, dst_mets)

# act
Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir)
# fail
with pytest.raises(FileExistsError) as exc:
Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir)
assert mock_request.call_count == 0

# assert
# no real request was made, since mets already present
assert mock_request.call_count == 0
# succeed
Resolver().workspace_from_url(url_src, clobber_mets=True, dst_dir=dst_dir)
assert mock_request.call_count == 1


@patch.object(Session, "get")
Expand Down Expand Up @@ -229,7 +234,7 @@ def test_workspace_from_nothing_noclobber(tmp_path):
ws2 = Resolver().workspace_from_nothing(tmp_path)
assert ws2.directory == tmp_path

with pytest.raises(Exception) as exc:
with pytest.raises(FileExistsError) as exc:
Resolver().workspace_from_nothing(tmp_path)

# assert
Expand Down

0 comments on commit 18d743a

Please sign in to comment.