diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 2a1886959d..da01c8cb20 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -6,8 +6,9 @@ on:
workflow_dispatch: # run manually
env:
- GHCRIO_DOCKER_TAG: ghcr.io/ocr-d/core
- DOCKERIO_DOCKER_TAG: docker.io/ocrd/core
+ # FIXME: linux/arm/v7 disabled as long as scikit-build/cmake-python-distributions#503 is unresolved
+ # PLATFORMS: linux/amd64,linux/arm/v7,linux/arm64/v8,linux/ppc64le
+ PLATFORMS: linux/amd64,linux/arm64/v8,linux/ppc64le
jobs:
@@ -17,8 +18,18 @@ jobs:
permissions:
packages: write
contents: read
-
+ env:
+ DOCKER_BASE_TAG: ghcr.io/ocr-d docker.io/ocrd
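+ # TODO(kba): make the interpolation work correctly (the `env` context is not available inside a job-level `env:` block; the command would have to be composed in a step instead)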
+ # DOCKER_BUILD: docker buildx build --progress=plain --platform ${{ env.PLATFORMS }} --push
+ # TODO(kba): Investigate why ppc64le build hangs on "Installing build dependencies"
+ # TODO(kba): Investigate why arm64 fails with .buildkit_qemu_emulator: /usr/local/bin/conda: Invalid ELF image for this architecture
+ DOCKER_BUILD: docker buildx build --progress=plain --platform linux/amd64 --push
steps:
+ - name: Export variables
+ run: |
+ echo "DOCKER_BASE_TAG=${{ env.DOCKER_BASE_TAG }}" >> $GITHUB_ENV
+ echo "DOCKER_BUILD=${{ env.DOCKER_BUILD }}" >> $GITHUB_ENV
- name: Checkout
uses: actions/checkout@v4
with:
@@ -28,19 +39,6 @@ jobs:
- # Activate cache export feature to reduce build time of images
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- - name: Build the Docker image
- run: make docker
- - name: Build the Docker image with GPU support
- run: make docker-cuda
- - name: Alias Docker images
- # default tag uses docker.io, so tag post-hoc
- run: |
- docker tag ocrd/core ${{ env.GHCRIO_DOCKER_TAG }}
- docker tag ocrd/core-cuda ${{ env.GHCRIO_DOCKER_TAG }}-cuda
- - name: Smoke Test that ocrd --help works
- run: |
- docker run --rm ${{ env.GHCRIO_DOCKER_TAG }} ocrd --version
- docker run --rm ${{ env.GHCRIO_DOCKER_TAG }}-cuda ocrd --version
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
@@ -52,20 +50,9 @@ jobs:
with:
username: ${{ secrets.DOCKERIO_USERNAME }}
password: ${{ secrets.DOCKERIO_PASSWORD }}
- - name: Push images to Github Container Registry
- run: |
- docker push ${{ env.GHCRIO_DOCKER_TAG }}:latest
- docker push ${{ env.GHCRIO_DOCKER_TAG }}-cuda:latest
- - name: Push images to Docker Hub
- run: |
- docker tag ${{ env.GHCRIO_DOCKER_TAG }} ${{ env.DOCKERIO_DOCKER_TAG }}
- docker tag ${{ env.GHCRIO_DOCKER_TAG }}-cuda ${{ env.DOCKERIO_DOCKER_TAG }}-cuda
- docker push ${{ env.DOCKERIO_DOCKER_TAG }}:latest
- docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:latest
- docker push ${{ env.DOCKERIO_DOCKER_TAG }}:latest
- docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:latest
- docker tag ${{ env.DOCKERIO_DOCKER_TAG }} ${{ env.DOCKERIO_DOCKER_TAG }}:$(git describe --tags --abbrev=0)
- docker tag ${{ env.DOCKERIO_DOCKER_TAG }}-cuda ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:$(git describe --tags --abbrev=0)
- docker push ${{ env.DOCKERIO_DOCKER_TAG }}:$(git describe --tags --abbrev=0)
- docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:$(git describe --tags --abbrev=0)
-
+ - name: Build the Docker image
+ run: make docker
+ - name: Build the Docker image with GPU support
+ run: make docker-cuda
+ - name: Build the Docker images with GPU support and ML frameworks
+ run: make docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch
diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
index 2b8e3d5b82..2ab5ee46c4 100644
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -50,14 +50,18 @@ jobs:
make install deps-test
- name: Test with pytest
run: |
- make test benchmark
+ mkdir -p test-results
+ make test benchmark PYTEST_ARGS=--junitxml=test-results/test.xml
+    # Publish the JUnit summary even when the pytest step above failed;
+    # without `if: always()` this step is skipped exactly when it is most useful.
+    - uses: test-summary/action@v2
+      if: always()
+      with:
+        paths: "test-results/test.xml"
- name: test to ensure that --editable install works
run: |
make install-dev; ocrd --version
- name: Lint with flake8
run: |
- python -m pip install flake8
+ python -m pip install flake8 flake8-github
# stop the build if there are Python syntax errors or undefined names
- flake8 src --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics
+ flake8 src --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics --format=github
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
- flake8 src --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+ flake8 src --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --format=github
diff --git a/.scrutinizer.yml b/.scrutinizer.yml
index 4848dca46a..83cb1c8521 100644
--- a/.scrutinizer.yml
+++ b/.scrutinizer.yml
@@ -7,12 +7,16 @@ build:
python:
version: 3.8.2
virtualenv: true
+ variables:
+ DEBIAN_FRONTEND: noninteractive
nodes:
analysis:
dependencies:
override:
- - sudo make deps-ubuntu
- - make install
+ - echo "Skipped"
+ # - command: sudo make deps-ubuntu
+ # idle_timeout: 600
+ # - make install
tests:
override:
- py-scrutinizer-run
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 049f804b3e..dd816a3545 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,58 @@ Versioned according to [Semantic Versioning](http://semver.org/).
## Unreleased
+Changed:
+
+ * ocrd_network: Use `ocrd-all-tool.json` bundled by core instead of download from website, #1257, #1260
+ * `ocrd workspace clone`/`Resolver.workspace_from_url`: with `clobber_mets=False`, raise a FileExistsError for existing mets.xml on disk, #563, #1268
+ * `ocrd workspace find --download`: print the correct, up-to-date field, not `None`, #1202, #1266
+
+## [2.67.2] - 2024-07-19
+
+Fixed:
+
+ * Run `multiprocessing.set_start_method('fork')` only for OSX, #1261
+ * Broken PyPI release, #1262
+
+## [2.67.1] - 2024-07-17
+
+Fixed:
+
+ - Build and tests fixed, no functional changes from #1258
+
+## [2.67.0] - 2024-07-16
+
+Changed:
+
+ - Additional docker base images with preinstalled tensorflow 1 (`core-cuda-tf1`), tensorflow 2 (`core-cuda-tf2`) and torch (`core-cuda-torch`), #1239
+ - Resource Manager: Skip the download instead of raising an exception if the target file already exists (unless `--overwrite`), #1246
+ - Resource Manager: Try to use bundled `ocrd-all-tool.json` if available, #1250, OCR-D/all#444
+
+Added:
+
+ - `ocrd process` now supports `-U/--mets-server-url`, #1243
+
+Fixed:
+
+ - `ocrd process`-derived tasks are not run in a temporary directory when not called from within workspace, #1243
+ - regression from #1238 where processors failed that had required parameters, #1255, #1256
+ - METS Server: Unlink UDS socket file if it exists before startup, #1244
+ - Resource Manager: Do not create zero-size files for failing downloads, #1201, #1246
+ - Workspace.add_file: Allow multiple processors to create file group folders simultaneously, #1203, #1253
+ - Resource Manager: Do not try to run `--dump-json` for known non-processors `ocrd-{cis-data,import,make}`, #1218, #1249
+ - Resource Manager: Properly handle copying of directories, #1237, #1248
+ - bashlib: regression in parsing JSON from introducing parameter preset files, #1258
+
+Removed:
+
+ - Defaults for `-I/--input-file-grp`/`-O/--output-file-grp`, #1256, #274
+
+## [2.66.1] - 2024-06-26
+
+Fixed:
+
+ * GHA Docker: build docker.io first, then tag ghcr.io
+
## [2.66.0] - 2024-06-07
Fixed:
@@ -2092,8 +2144,12 @@ Fixed
## [0.0.1] - 2018-04-17
Initial Release
-]
+
+[2.67.2]: ../../compare/v2.67.2..v2.67.1
+[2.67.1]: ../../compare/v2.67.1..v2.67.0
+[2.67.0]: ../../compare/v2.67.0..v2.66.1
+[2.66.1]: ../../compare/v2.66.1..v2.66.0
[2.66.0]: ../../compare/v2.66.0..v2.65.0
[2.65.0]: ../../compare/v2.65.0..v2.64.1
[2.64.1]: ../../compare/v2.64.1..v2.64.0
diff --git a/Dockerfile b/Dockerfile
index fd57e5014d..144ae774dc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,6 +38,8 @@ RUN python3 -m venv /usr/local \
&& hash -r \
&& make install-dev \
&& eval $FIXUP
+# Smoke Test
+RUN ocrd --version
WORKDIR /data
diff --git a/Makefile b/Makefile
index 1b4ef47bd2..0608b0b738 100644
--- a/Makefile
+++ b/Makefile
@@ -151,7 +151,7 @@ deps-tf1:
fi
deps-tf2:
- if $(PYTHON) -c 'import sys; print("%u.%u" % (sys.version_info.major, sys.version_info.minor))' | fgrep 3.8 && \
+ if $(PYTHON) -c 'import sys; print("%u.%u" % (sys.version_info.major, sys.version_info.minor))' | fgrep 3.8; then \
$(PIP) install tensorflow; \
else \
$(PIP) install "tensorflow[and-cuda]"; \
@@ -162,7 +162,7 @@ deps-torch:
# Dependencies for deployment in an ubuntu/debian linux
deps-ubuntu:
- apt-get install -y python3 imagemagick libgeos-dev
+ apt-get install -y python3 imagemagick libgeos-dev libxml2-dev libxslt-dev libssl-dev
# Install test python deps via pip
deps-test:
@@ -361,44 +361,46 @@ pyclean:
.PHONY: docker docker-cuda
# Additional arguments to docker build. Default: '$(DOCKER_ARGS)'
-DOCKER_ARGS =
+DOCKER_ARGS ?=
+DOCKER_BASE_TAG ?= ocrd
+DOCKER_BUILD ?= docker build --progress=plain
# Build docker image
docker: DOCKER_BASE_IMAGE = ubuntu:20.04
-docker: DOCKER_TAG = ocrd/core
+docker: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core)
docker: DOCKER_FILE = Dockerfile
# Build extended sets for maximal layer sharing
-docker-cuda: DOCKER_BASE_IMAGE = ocrd/core
-docker-cuda: DOCKER_TAG = ocrd/core-cuda
+docker-cuda: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core
+docker-cuda: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda)
docker-cuda: DOCKER_FILE = Dockerfile.cuda
docker-cuda: docker
-docker-cuda-tf1: DOCKER_BASE_IMAGE = ocrd/core-cuda
-docker-cuda-tf1: DOCKER_TAG = ocrd/core-cuda-tf1
+docker-cuda-tf1: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda
+docker-cuda-tf1: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda-tf1)
docker-cuda-tf1: DOCKER_FILE = Dockerfile.cuda-tf1
docker-cuda-tf1: docker-cuda
-docker-cuda-tf2: DOCKER_BASE_IMAGE = ocrd/core-cuda
-docker-cuda-tf2: DOCKER_TAG = ocrd/core-cuda-tf2
+docker-cuda-tf2: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda
+docker-cuda-tf2: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda-tf2)
docker-cuda-tf2: DOCKER_FILE = Dockerfile.cuda-tf2
docker-cuda-tf2: docker-cuda
-docker-cuda-torch: DOCKER_BASE_IMAGE = ocrd/core-cuda
-docker-cuda-torch: DOCKER_TAG = ocrd/core-cuda-torch
+docker-cuda-torch: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda
+docker-cuda-torch: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda-torch)
docker-cuda-torch: DOCKER_FILE = Dockerfile.cuda-torch
docker-cuda-torch: docker-cuda
-docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch:
- docker build --progress=plain -f $(DOCKER_FILE) -t $(DOCKER_TAG) --target ocrd_core_base --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) $(DOCKER_ARGS) .
+docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch:
+ $(DOCKER_BUILD) -f $(DOCKER_FILE) $(DOCKER_TAG:%=-t %) --target ocrd_core_base --build-arg BASE_IMAGE=$(lastword $(DOCKER_BASE_IMAGE)) $(DOCKER_ARGS) .
# Build wheels and source dist and twine upload them
pypi: build
- twine upload dist/ocrd-$(VERSION)*{tar.gz,whl}
+ twine upload --verbose dist/ocrd-$(VERSION)*{tar.gz,whl}
pypi-workaround: build-workaround
for dist in $(BUILD_ORDER);do twine upload dist/$$dist-$(VERSION)*{tar.gz,whl};done
diff --git a/VERSION b/VERSION
index a6f4248b2f..2a94548735 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.0.0a1
+3.0.0a1
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 672f9bc66c..ed5fd56d59 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
atomicwrites >= 1.3.0
beanie~=1.7
click >=7
+cryptography < 43.0.0
Deprecated == 1.2.0
docker
fastapi>=0.78.0
@@ -27,9 +28,8 @@ pydantic==1.*
python-magic
python-multipart
pyyaml
-requests < 2.30
-requests_unixsocket
+requests
+requests_unixsocket2
shapely
uvicorn
uvicorn>=0.17.6
-
diff --git a/requirements_test.txt b/requirements_test.txt
index 0f0e5b97d4..d8cef1dae7 100644
--- a/requirements_test.txt
+++ b/requirements_test.txt
@@ -1,4 +1,5 @@
autopep8
+cryptography < 43.0.0
pytest >= 4.0.0
generateDS == 2.35.20
pytest-benchmark >= 3.2.3
diff --git a/src/ocrd/cli/process.py b/src/ocrd/cli/process.py
index b71b74d096..9dcd562644 100644
--- a/src/ocrd/cli/process.py
+++ b/src/ocrd/cli/process.py
@@ -19,14 +19,15 @@
@click.command('process')
@ocrd_loglevel
@click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME)
+@click.option('-U', '--mets-server-url', help="TCP host URI or UDS path of METS server")
@click.option('-g', '--page-id', help="ID(s) of the pages to process")
@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist")
@click.argument('tasks', nargs=-1, required=True)
-def process_cli(log_level, mets, page_id, tasks, overwrite):
+def process_cli(log_level, mets, mets_server_url, page_id, tasks, overwrite):
"""
Process a series of tasks
"""
initLogging()
log = getLogger('ocrd.cli.process')
- run_tasks(mets, log_level, page_id, tasks, overwrite)
+ run_tasks(mets, log_level, page_id, tasks, overwrite=overwrite, mets_server_url=mets_server_url)
log.info("Finished")
diff --git a/src/ocrd/cli/resmgr.py b/src/ocrd/cli/resmgr.py
index 1289e498e1..6ddc9a91bf 100644
--- a/src/ocrd/cli/resmgr.py
+++ b/src/ocrd/cli/resmgr.py
@@ -13,29 +13,20 @@
import requests
import click
-from ocrd_utils import (
- initLogging,
- directory_size,
- getLogger,
- get_ocrd_tool_json,
- get_moduledir,
- RESOURCE_LOCATIONS,
-)
+from ocrd_utils import directory_size, getLogger, get_moduledir, get_ocrd_tool_json, initLogging, RESOURCE_LOCATIONS
from ocrd.constants import RESOURCE_USER_LIST_COMMENT
from ..resource_manager import OcrdResourceManager
+
def print_resources(executable, reslist, resmgr):
- print('%s' % executable)
+ print(f"{executable}")
for resdict in reslist:
- print('- %s %s (%s)\n %s' % (
- resdict['name'],
- '@ %s' % resmgr.resource_dir_to_location(resdict['path']) if 'path' in resdict else '',
- resdict['url'],
- resdict['description']
- ))
+        # Show the '@ <location>' marker only for resources that record a local 'path';
+        # entries without one get an empty placeholder.
+        res_loc = f"@ {resmgr.resource_dir_to_location(resdict['path'])}" if 'path' in resdict else ''
+        print(f"- {resdict['name']} {res_loc} ({resdict['url']})\n {resdict['description']}")
print()
+
@click.group("resmgr")
def resmgr_cli():
"""
@@ -43,9 +34,12 @@ def resmgr_cli():
"""
initLogging()
+
@resmgr_cli.command('list-available')
-@click.option('-D', '--no-dynamic', is_flag=True, default=False, help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources")
-@click.option('-e', '--executable', help='Show only resources for executable beginning with EXEC', metavar='EXEC', default='ocrd-*')
+@click.option('-D', '--no-dynamic', is_flag=True, default=False,
+ help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources")
+@click.option('-e', '--executable', metavar='EXEC', default='ocrd-*',
+ help='Show only resources for executable beginning with EXEC', )
def list_available(executable, no_dynamic):
"""
List available resources
@@ -54,6 +48,7 @@ def list_available(executable, no_dynamic):
for executable, reslist in resmgr.list_available(executable=executable, dynamic=not no_dynamic):
print_resources(executable, reslist, resmgr)
+
@resmgr_cli.command('list-installed')
@click.option('-e', '--executable', help='Show only resources for executable EXEC', metavar='EXEC')
def list_installed(executable=None):
@@ -64,17 +59,24 @@ def list_installed(executable=None):
for executable, reslist in resmgr.list_installed(executable):
print_resources(executable, reslist, resmgr)
+
@resmgr_cli.command('download')
-@click.option('-n', '--any-url', help='URL of unregistered resource to download/copy from', default='')
-@click.option('-D', '--no-dynamic', is_flag=True, default=False, help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources")
-@click.option('-t', '--resource-type', help='Type of resource', type=click.Choice(['file', 'directory', 'archive']), default='file')
-@click.option('-P', '--path-in-archive', help='Path to extract in case of archive type', default='.')
-@click.option('-a', '--allow-uninstalled', help="Allow installing resources for uninstalled processors", is_flag=True)
+@click.option('-n', '--any-url', default='', help='URL of unregistered resource to download/copy from')
+@click.option('-D', '--no-dynamic', default=False, is_flag=True,
+ help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources")
+@click.option('-t', '--resource-type', type=click.Choice(['file', 'directory', 'archive']), default='file',
+ help='Type of resource',)
+@click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type')
+@click.option('-a', '--allow-uninstalled', is_flag=True,
+ help="Allow installing resources for uninstalled processors",)
@click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True)
-@click.option('-l', '--location', help="Where to store resources - defaults to first location in processor's 'resource_locations' list or finally 'data'", type=click.Choice(RESOURCE_LOCATIONS))
+@click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
+ help="Where to store resources - defaults to first location in processor's 'resource_locations' "
+ "list or finally 'data'")
@click.argument('executable', required=True)
@click.argument('name', required=False)
-def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstalled, overwrite, location, executable, name):
+def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstalled, overwrite, location, executable,
+ name):
"""
Download resource NAME for processor EXECUTABLE.
@@ -91,7 +93,7 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
log = getLogger('ocrd.cli.resmgr')
resmgr = OcrdResourceManager()
if executable != '*' and not name:
- log.error("Unless EXECUTABLE ('%s') is the '*' wildcard, NAME is required" % executable)
+ log.error(f"Unless EXECUTABLE ('{executable}') is the '*' wildcard, NAME is required")
sys.exit(1)
elif executable == '*':
executable = None
@@ -101,19 +103,21 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
is_filename = Path(any_url).exists() if any_url else False
if executable and not which(executable):
if not allow_uninstalled:
- log.error("Executable '%s' is not installed. " \
- "To download resources anyway, use the -a/--allow-uninstalled flag", executable)
+ log.error(f"Executable '{executable}' is not installed. "
+ f"To download resources anyway, use the -a/--allow-uninstalled flag")
sys.exit(1)
else:
- log.info("Executable %s is not installed, but " \
- "downloading resources anyway", executable)
+ log.info(f"Executable '{executable}' is not installed, but downloading resources anyway")
reslist = resmgr.list_available(executable=executable, dynamic=not no_dynamic, name=name)
if not any(r[1] for r in reslist):
log.info(f"No resources {name} found in registry for executable {executable}")
if executable and name:
- reslist = [(executable, [{'url': any_url or '???', 'name': name,
- 'type': resource_type,
- 'path_in_archive': path_in_archive}])]
+ reslist = [(executable, [{
+ 'url': any_url or '???',
+ 'name': name,
+ 'type': resource_type,
+ 'path_in_archive': path_in_archive}]
+ )]
for this_executable, this_reslist in reslist:
for resdict in this_reslist:
if 'size' in resdict:
@@ -123,15 +127,15 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
if any_url:
resdict['url'] = any_url
if resdict['url'] == '???':
- log.warning("Cannot download user resource %s", resdict['name'])
+ log.warning(f"Cannot download user resource {resdict['name']}")
continue
if resdict['url'].startswith('https://') or resdict['url'].startswith('http://'):
- log.info("Downloading %s resource '%s' (%s)", registered, resdict['name'], resdict['url'])
+ log.info(f"Downloading {registered} resource '{resdict['name']}' ({resdict['url']})")
if 'size' not in resdict:
with requests.head(resdict['url']) as r:
resdict['size'] = int(r.headers.get('content-length', 0))
else:
- log.info("Copying %s resource '%s' (%s)", registered, resdict['name'], resdict['url'])
+ log.info(f"Copying {registered} resource '{resdict['name']}' ({resdict['url']})")
urlpath = Path(resdict['url'])
resdict['url'] = str(urlpath.resolve())
if Path(urlpath).is_dir():
@@ -141,7 +145,8 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
if not location:
location = get_ocrd_tool_json(this_executable)['resource_locations'][0]
elif location not in get_ocrd_tool_json(this_executable)['resource_locations']:
- log.error("The selected --location {location} is not in the {this_executable}'s resource search path, refusing to install to invalid location")
+ log.error(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
+ f"refusing to install to invalid location")
sys.exit(1)
if location != 'module':
basedir = resmgr.location_to_resource_dir(location)
@@ -164,13 +169,16 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
progress_cb=lambda delta: bar.update(delta)
)
if registered == 'unregistered':
- log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, name, any_url, resmgr.user_list)
+                    log.info(f"{this_executable} resource '{name}' ({any_url}) not a known resource, creating stub "
+                             f"in {resmgr.user_list}")
resmgr.add_to_user_database(this_executable, fpath, url=any_url)
resmgr.save_user_list()
- log.info("Installed resource %s under %s", resdict['url'], fpath)
+ log.info(f"Installed resource {resdict['url']} under {fpath}")
except FileExistsError as exc:
log.info(str(exc))
- log.info("Use in parameters as '%s'", resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is')))
+ log.info(f"Use in parameters as "
+ f"'{resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))}'")
+
@resmgr_cli.command('migrate')
@click.argument('migration', type=click.Choice(['2.37.0']))
@@ -203,7 +211,6 @@ def migrate(migration):
v_out = 'directory'
resdict_out[k_out] = v_out
yaml_out[executable].append(resdict_out)
- resmgr.user_list.write_text(RESOURCE_USER_LIST_COMMENT +
- '\n# migrated with ocrd resmgr migrate {migration}\n' +
- safe_dump(yaml_out))
+    # The marker line must be an f-string so the applied migration id is interpolated
+    # into the header comment rather than written as the literal text '{migration}'.
+    resmgr.user_list.write_text(
+        RESOURCE_USER_LIST_COMMENT + f'\n# migrated with ocrd resmgr migrate {migration}\n' + safe_dump(yaml_out))
log.info(f'Applied migration {migration} to {resmgr.user_list}')
diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py
index 52f48b7c09..0c70fd3a36 100644
--- a/src/ocrd/cli/workspace.py
+++ b/src/ocrd/cli/workspace.py
@@ -48,7 +48,7 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met
@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]"')
@click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory')
@click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL")
-@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host of METS server")
+@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host URI or UDS path of METS server")
@click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True)
@click.pass_context
def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
@@ -467,19 +467,18 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl
include_fileGrp=include_fileGrp,
exclude_fileGrp=exclude_fileGrp,
):
- ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
if download and not f.local_filename:
workspace.download_file(f)
modified_mets = True
if wait:
time.sleep(wait)
if undo_download and f.url and f.local_filename:
- ret_entry = [f'Removed local_filename {f.local_filename}']
f.local_filename = None
modified_mets = True
if not keep_files:
ctx.log.debug("rm %s [cwd=%s]", f.local_filename, workspace.directory)
unlink(f.local_filename)
+ ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
ret.append(ret_entry)
if modified_mets:
workspace.save_mets()
diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash
index a65ca35cd6..1e3ecfc6eb 100644
--- a/src/ocrd/lib.bash
+++ b/src/ocrd/lib.bash
@@ -146,7 +146,7 @@ ocrd__parse_argv () {
-D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;;
-C|--show-resource) ocrd__show_resource "$2"; exit ;;
-L|--list-resources) ocrd__list_resources; exit ;;
- -p|--parameter) __parameters+=(-p $(ocrd__resolve_resource "$2" 2>/dev/null || echo "$2")) ; shift ;;
+ -p|--parameter) __parameters+=(-p "$(ocrd__resolve_resource "$2" 2>/dev/null || echo "$2")") ; shift ;;
-P|--parameter-override) __parameter_overrides+=(-P "$2" "$3") ; shift ; shift ;;
-g|--page-id) ocrd__argv[page_id]=$2 ; shift ;;
-O|--output-file-grp) ocrd__argv[output_file_grp]=$2 ; shift ;;
diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py
index 5131f3f05c..da6e873c06 100644
--- a/src/ocrd/mets_server.py
+++ b/src/ocrd/mets_server.py
@@ -527,6 +527,9 @@ async def add_file(
# Create socket and change to world-readable and -writable to avoid permission errors
self.log.debug(f"chmod 0o677 {self.url}")
server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ if Path(self.url).exists() and not is_socket_in_use(self.url):
+ # remove leftover unused socket which blocks startup
+ Path(self.url).unlink()
server.bind(self.url) # creates the socket file
atexit.register(self.shutdown)
server.close()
@@ -540,3 +543,14 @@ async def add_file(
self.log.debug("Starting uvicorn")
uvicorn.run(app, **uvicorn_kwargs)
+
+
+def is_socket_in_use(socket_path):
+    """Return True if a server accepts connections on the Unix domain socket at ``socket_path``, else False."""
+    # The context manager closes the probe socket on every path, including a failed connect.
+    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as client:
+        try:
+            client.connect(socket_path)
+        except OSError:
+            # Missing path, stale socket file or non-socket file: nobody is listening.
+            return False
+    return True
diff --git a/src/ocrd/ocrd-all-tool.json b/src/ocrd/ocrd-all-tool.json
new file mode 100644
index 0000000000..fee8e7ef62
--- /dev/null
+++ b/src/ocrd/ocrd-all-tool.json
@@ -0,0 +1,21 @@
+{
+ "ocrd-dummy": {
+ "executable": "ocrd-dummy",
+ "description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group",
+ "steps": [
+ "preprocessing/optimization"
+ ],
+ "categories": [
+ "Image preprocessing"
+ ],
+ "input_file_grp": "DUMMY_INPUT",
+ "output_file_grp": "DUMMY_OUTPUT",
+ "parameters": {
+ "copy_files": {
+ "type": "boolean",
+ "default": false,
+ "description": "Whether to actually copy files (true) or just create PAGE-XML as a side effect (false)"
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py
index 5cde4d9fe2..43aec4ace0 100644
--- a/src/ocrd/processor/base.py
+++ b/src/ocrd/processor/base.py
@@ -168,6 +168,7 @@ def __init__(
if parameter is None:
parameter = {}
parameterValidator = ParameterValidator(self.ocrd_tool)
+
report = parameterValidator.validate(parameter)
if not report.is_valid:
raise ValueError("Invalid parameters %s" % report.errors)
@@ -427,6 +428,7 @@ def show_resource(self, val):
Args:
val (string): resource value to show
"""
+
res_fname = self.resolve_resource(val)
fpath = Path(res_fname)
if fpath.is_dir():
diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py
index 4b8fe6b217..124d006927 100644
--- a/src/ocrd/resolver.py
+++ b/src/ocrd/resolver.py
@@ -95,12 +95,15 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip',
log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url))
return str(ret)
- # Respect 'if_exists' arg
+ # Respect 'if_exists' kwarg
if dst_path.exists():
if if_exists == 'skip':
+ log.debug(f"File already exists but if_exists == {if_exists}, skipping.")
return str(ret)
- if if_exists == 'raise':
- raise FileExistsError(f"File already exists and if_exists == 'raise': {dst_path}")
+ elif if_exists == 'raise':
+ raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}")
+ else:
+ log.debug(f"File already exists but if_exists == {if_exists}, overwriting.")
# Create dst_path parent dir
dst_path.parent.mkdir(parents=True, exist_ok=True)
@@ -174,6 +177,9 @@ def workspace_from_url(
By default existing ``mets.xml`` will raise an exception.
download (boolean, False): Whether to also download all the files referenced by the METS
src_baseurl (string, None): Base URL for resolving relative file locations
+ mets_server_url (string, None): URI of TCP or local path of UDS for METS server handling
+ the `OcrdMets` of the workspace. By default the METS will be read from and written to
+ the filesystem directly.
**kwargs (): Passed on to ``OcrdMets.find_files`` if download == True
Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
@@ -215,7 +221,7 @@ def workspace_from_url(
log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
mets_basename, mets_url, src_baseurl, dst_dir)
- self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')
+ self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise')
workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)
diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py
index c668028e9c..44bbd081bc 100644
--- a/src/ocrd/resource_manager.py
+++ b/src/ocrd/resource_manager.py
@@ -1,6 +1,6 @@
from pathlib import Path
from os.path import join
-from os import environ, listdir, getcwd, path, unlink
+from os import environ, listdir, makedirs, getcwd, path, unlink
from shutil import copytree, rmtree, copy
from fnmatch import filter as apply_glob
from datetime import datetime
@@ -24,7 +24,8 @@
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
-class OcrdResourceManager():
+
+class OcrdResourceManager:
"""
Managing processor resources
@@ -81,7 +82,7 @@ def load_resource_list(self, list_filename, database=None):
report = OcrdResourceListValidator.validate(list_loaded)
if not report.is_valid:
self.log.error('\n'.join(report.errors))
- raise ValueError("Resource list %s is invalid!" % (list_filename))
+ raise ValueError(f"Resource list {list_filename} is invalid!")
for executable, resource_list in list_loaded.items():
if executable not in database:
database[executable] = []
@@ -98,8 +99,14 @@ def list_available(self, executable=None, dynamic=True, name=None, database=None
if not executable:
return database.items()
if dynamic:
+ skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"]
for exec_dir in environ['PATH'].split(':'):
for exec_path in Path(exec_dir).glob(f'{executable}'):
+ if not exec_path.name.startswith('ocrd-'):
+ self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
+ if exec_path.name in skip_executables:
+ self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
+ continue
self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
ocrd_tool = get_ocrd_tool_json(exec_path)
for resdict in ocrd_tool.get('resources', ()):
@@ -176,7 +183,8 @@ def add_to_user_database(self, executable, res_filename, url=None, resource_type
Add a stub entry to the user resource.yml
"""
res_name = Path(res_filename).name
- self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", executable, res_name, str(res_filename), self.user_list)
+ self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
+ f"creating stub in {self.user_list}'")
if Path(res_filename).is_dir():
res_size = directory_size(res_filename)
else:
@@ -190,7 +198,7 @@ def add_to_user_database(self, executable, res_filename, url=None, resource_type
resdict = {
'name': res_name,
'url': url if url else '???',
- 'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
+ 'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}',
'version_range': '???',
'type': resource_type,
'size': res_size
@@ -218,74 +226,82 @@ def resource_dir_to_location(self, resource_path):
'cwd' if resource_path.startswith(getcwd()) else \
resource_path
- def parameter_usage(self, name, usage='as-is'):
+ @staticmethod
+ def parameter_usage(name, usage='as-is'):
if usage == 'as-is':
return name
elif usage == 'without-extension':
return Path(name).stem
- raise ValueError("No such usage '%s'" % usage)
+ raise ValueError(f"No such usage '{usage}'")
- def _download_impl(self, url, filename, progress_cb=None, size=None):
+ @staticmethod
+ def _download_impl(url, filename, progress_cb=None, size=None):
log = getLogger('ocrd.resource_manager._download_impl')
- log.info("Downloading %s to %s" % (url, filename))
- with open(filename, 'wb') as f:
+ log.info(f"Downloading {url} to {filename}")
+ try:
gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
if gdrive_file_id:
if not is_gdrive_download_link:
- url = "https://drive.google.com/uc?id={id}".format(id=gdrive_file_id)
+ url = f"https://drive.google.com/uc?id={gdrive_file_id}"
try:
with requests.get(url, stream=True) as r:
if "Content-Disposition" not in r.headers:
url = get_url_from_gdrive_confirmation(r.text)
except RuntimeError as e:
log.warning("Cannot unwrap Google Drive URL: ", e)
- with requests.get(url, stream=True) as r:
- r.raise_for_status()
- for data in r.iter_content(chunk_size=4096):
+ with open(filename, 'wb') as f:
+ with requests.get(url, stream=True) as r:
+ r.raise_for_status()
+ for data in r.iter_content(chunk_size=4096):
+ if progress_cb:
+ progress_cb(len(data))
+ f.write(data)
+ except Exception as e:
+ rmtree(filename, ignore_errors=True)
+ Path(filename).unlink(missing_ok=True)
+ raise e
+
+ @staticmethod
+ def _copy_file(src, dst, progress_cb=None):
+ log = getLogger('ocrd.resource_manager._copy_file')
+ log.info(f"Copying file {src} to {dst}")
+ with open(dst, 'wb') as f_out, open(src, 'rb') as f_in:
+ while True:
+ chunk = f_in.read(4096)
+ if chunk:
+ f_out.write(chunk)
if progress_cb:
- progress_cb(len(data))
- f.write(data)
+ progress_cb(len(chunk))
+ else:
+ break
- def _copy_impl(self, src_filename, filename, progress_cb=None):
+    @staticmethod
+    def _copy_dir(src, dst, progress_cb=None):
+        """
+        Recursively copy the directory ``src`` to ``dst``.
+
+        ``dst`` (including missing parents) is created if necessary.
+        Sub-directories are handled by recursion, every other entry is
+        copied with ``_copy_file``.
+
+        Args:
+            src: path of the directory to copy
+            dst: path of the destination directory
+            progress_cb: optional callable, passed on unchanged to ``_copy_file``
+
+        Raises:
+            ValueError: if ``src`` is not a directory
+        """
+        log = getLogger('ocrd.resource_manager._copy_dir')
+        log.info(f"Copying dir recursively from {src} to {dst}")
+        if not Path(src).is_dir():
+            raise ValueError(f"The source is not a directory: {src}")
+        Path(dst).mkdir(parents=True, exist_ok=True)
+        # Only look at the direct children here: the recursion below already
+        # descends into sub-directories, so a recursive glob would copy nested
+        # entries (and report their progress) more than once.
+        for child in Path(src).iterdir():
+            child_dst = Path(dst) / child.name
+            if child.is_dir():
+                OcrdResourceManager._copy_dir(child, child_dst, progress_cb)
+            else:
+                OcrdResourceManager._copy_file(child, child_dst, progress_cb)
+
+ @staticmethod
+ def _copy_impl(src_filename, filename, progress_cb=None):
log = getLogger('ocrd.resource_manager._copy_impl')
- log.info("Copying %s to %s", src_filename, filename)
+ log.info(f"Copying {src_filename} to {filename}")
if Path(src_filename).is_dir():
- log.info(f"Copying recursively from {src_filename} to {filename}")
- for child in Path(src_filename).rglob('*'):
- child_dst = Path(filename) / child.relative_to(src_filename)
- child_dst.parent.mkdir(parents=True, exist_ok=True)
- with open(child_dst, 'wb') as f_out, open(child, 'rb') as f_in:
- while True:
- chunk = f_in.read(4096)
- if chunk:
- f_out.write(chunk)
- if progress_cb:
- progress_cb(len(chunk))
- else:
- break
+ OcrdResourceManager._copy_dir(src_filename, filename, progress_cb)
else:
- with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
- while True:
- chunk = f_in.read(4096)
- if chunk:
- f_out.write(chunk)
- if progress_cb:
- progress_cb(len(chunk))
- else:
- break
+ OcrdResourceManager._copy_file(src_filename, filename, progress_cb)
# TODO Proper caching (make head request for size, If-Modified etc)
def download(
- self,
- executable,
- url,
- basedir,
- overwrite=False,
- no_subdir=False,
- name=None,
- resource_type='file',
- path_in_archive='.',
- progress_cb=None,
+ self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file',
+ path_in_archive='.', progress_cb=None,
):
"""
Download a resource by URL
@@ -299,12 +315,15 @@ def download(
is_url = url.startswith('https://') or url.startswith('http://')
if fpath.exists():
if not overwrite:
- raise FileExistsError("%s %s already exists but --overwrite is not set" % ('Directory' if fpath.is_dir() else 'File', fpath))
+ fpath_type = 'Directory' if fpath.is_dir() else 'File'
+ log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download")
+ # raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set")
+ return fpath
if fpath.is_dir():
- log.info("Removing existing target directory {fpath}")
+ log.info(f"Removing existing target directory {fpath}")
rmtree(str(fpath))
else:
- log.info("Removing existing target file {fpath}")
+ log.info(f"Removing existing target file {fpath}")
unlink(str(fpath))
destdir.mkdir(parents=True, exist_ok=True)
if resource_type in ('file', 'directory'):
@@ -322,7 +341,7 @@ def download(
Path('out').mkdir()
with pushd_popd('out'):
mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
- log.info("Extracting %s archive to %s/out" % (mimetype, tempdir))
+ log.info(f"Extracting {mimetype} archive to {tempdir}/out")
if mimetype == 'application/zip':
with ZipFile(f'../{archive_fname}', 'r') as zipf:
zipf.extractall()
@@ -330,8 +349,8 @@ def download(
with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
tar.extractall()
else:
- raise RuntimeError("Unable to handle extraction of %s archive %s" % (mimetype, url))
- log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath))
+ raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
+ log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
if Path(path_in_archive).is_dir():
copytree(path_in_archive, str(fpath))
else:
diff --git a/src/ocrd/task_sequence.py b/src/ocrd/task_sequence.py
index da691fbc1d..85e30b177c 100644
--- a/src/ocrd/task_sequence.py
+++ b/src/ocrd/task_sequence.py
@@ -115,9 +115,11 @@ def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
return report
-def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):
+def run_tasks(mets, log_level, page_id, task_strs, overwrite=False, mets_server_url=None):
resolver = Resolver()
- workspace = resolver.workspace_from_url(mets)
+ workdir, mets, basename, _ = resolver.resolve_mets_arguments(None, mets, None)
+ workspace = resolver.workspace_from_url(mets, workdir, mets_basename=basename,
+ mets_server_url=mets_server_url)
log = getLogger('ocrd.task_sequence.run_tasks')
tasks = [ProcessorTask.parse(task_str) for task_str in task_strs]
@@ -139,7 +141,8 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):
overwrite=overwrite,
input_file_grp=','.join(task.input_file_grps),
output_file_grp=','.join(task.output_file_grps),
- parameter=json.dumps(task.parameters)
+ parameter=json.dumps(task.parameters),
+ mets_server_url=mets_server_url
)
# check return code
@@ -149,7 +152,8 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):
log.info("Finished processing task '%s'", task)
# reload mets
- workspace.reload_mets()
+ if mets_server_url is None:
+ workspace.reload_mets()
# check output file groups are in mets
for output_file_grp in task.output_file_grps:
diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py
index bd9e4c5025..fc619b7d0b 100644
--- a/src/ocrd/workspace.py
+++ b/src/ocrd/workspace.py
@@ -64,13 +64,17 @@ class Workspace():
:py:class:`ocrd.resolver.Resolver`.
Args:
-
- directory (string) : Filesystem folder to work in
+ resolver (:py:class:`ocrd.Resolver`) : `Resolver` instance
+ directory (string) : Filesystem path to work in
mets (:py:class:`ocrd_models.ocrd_mets.OcrdMets`) : `OcrdMets` representing this workspace.
- Loaded from `'mets.xml'` if `None`.
- mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url.
- overwrite_mode (boolean) : Whether to force add operations on this workspace globally
- baseurl (string) : Base URL to prefix to relative URL.
+ If `None`, then loaded from ``directory``/``mets_basename``
+ or delegated to ``mets_server_url``.
+ mets_basename (string, mets.xml) : Basename of the METS XML file in the workspace directory.
+ mets_server_url (string, None) : URI of TCP or local path of UDS for METS server handling the
+ `OcrdMets` of this workspace. If `None`, then the METS will be read from and written to
+ the filesystem directly.
+ baseurl (string, None) : Base URL to prefix to relative URL.
+ overwrite_mode (boolean, False) : Whether to force add operations on this workspace globally
"""
def __init__(
@@ -422,7 +426,7 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi
# If the local filename has folder components, create those folders
local_filename_dir = str(kwargs['local_filename']).rsplit('/', 1)[0]
if local_filename_dir != str(kwargs['local_filename']) and not Path(local_filename_dir).is_dir():
- makedirs(local_filename_dir)
+ makedirs(local_filename_dir, exist_ok=True)
# print(kwargs)
kwargs["pageId"] = kwargs.pop("page_id")
diff --git a/src/ocrd_network/constants.py b/src/ocrd_network/constants.py
index 53dbd9b11b..bfa137d9e0 100644
--- a/src/ocrd_network/constants.py
+++ b/src/ocrd_network/constants.py
@@ -6,7 +6,7 @@
DOCKER_RABBIT_MQ_FEATURES = "quorum_queue,implicit_default_bindings,classic_mirrored_queue_version"
NETWORK_PROTOCOLS = ["http://", "https://"]
-OCRD_ALL_JSON_TOOLS_URL = "https://ocr-d.de/js/ocrd-all-tool.json"
+OCRD_ALL_TOOL_JSON = "ocrd-all-tool.json"
# Used as a placeholder to lock all pages when no page_id is specified
SERVER_ALL_PAGES_PLACEHOLDER = "all_pages"
diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py
index dbbdea6475..e142802268 100644
--- a/src/ocrd_network/processing_server.py
+++ b/src/ocrd_network/processing_server.py
@@ -10,7 +10,7 @@
from ocrd.task_sequence import ProcessorTask
from ocrd_utils import initLogging, getLogger
-from .constants import AgentType, JobState, OCRD_ALL_JSON_TOOLS_URL, ServerApiTags
+from .constants import AgentType, JobState, ServerApiTags
from .database import (
initiate_database,
db_get_processing_job,
@@ -58,7 +58,7 @@
)
from .tcp_to_uds_mets_proxy import MetsServerProxy
from .utils import (
- download_ocrd_all_tool_json,
+ load_ocrd_all_tool_json,
expand_page_ids,
generate_id,
generate_workflow_content,
@@ -90,8 +90,8 @@ def __init__(self, config_path: str, host: str, port: int) -> None:
log_file = get_processing_server_logging_file_path(pid=getpid())
configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a")
- self.log.info(f"Downloading ocrd all tool json")
- self.ocrd_all_tool_json = download_ocrd_all_tool_json(ocrd_all_url=OCRD_ALL_JSON_TOOLS_URL)
+ self.log.info(f"Loading ocrd all tool json")
+ self.ocrd_all_tool_json = load_ocrd_all_tool_json()
self.hostname = host
self.port = port
diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py
index e7a07fa9d9..a2f563de43 100644
--- a/src/ocrd_network/utils.py
+++ b/src/ocrd_network/utils.py
@@ -3,6 +3,7 @@
from fastapi import UploadFile
from functools import wraps
from hashlib import md5
+from json import loads
from pathlib import Path
from re import compile as re_compile, split as re_split
from requests import get as requests_get, Session as Session_TCP
@@ -14,7 +15,8 @@
from ocrd.resolver import Resolver
from ocrd.workspace import Workspace
from ocrd.mets_server import MpxReq
-from ocrd_utils import config, generate_range, REGEX_PREFIX, safe_filename, getLogger
+from ocrd_utils import config, generate_range, REGEX_PREFIX, safe_filename, getLogger, resource_string
+from .constants import OCRD_ALL_TOOL_JSON
from .rabbitmq_utils import OcrdResultMessage
@@ -92,14 +94,12 @@ def is_url_responsive(url: str, tries: int = 1, wait_time: int = 3) -> bool:
return False
-def download_ocrd_all_tool_json(ocrd_all_url: str):
- if not ocrd_all_url:
- raise ValueError(f"The URL of ocrd all tool json is empty")
- headers = {"Accept": "application/json"}
- response = Session_TCP().get(ocrd_all_url, headers=headers)
- if not response.status_code == 200:
- raise ValueError(f"Failed to download ocrd all tool json from: '{ocrd_all_url}'")
- return response.json()
+def load_ocrd_all_tool_json():
+    """
+    Load and parse the bundled ocrd-all-tool JSON.
+
+    The file named by ``OCRD_ALL_TOOL_JSON`` is read via
+    ``resource_string('ocrd', ...)`` and decoded with ``json.loads``.
+
+    Returns:
+        the decoded JSON content
+
+    Raises:
+        ValueError: if reading or decoding fails for any reason
+    """
+    try:
+        return loads(resource_string('ocrd', OCRD_ALL_TOOL_JSON))
+    except Exception as error:
+        raise ValueError(f"Failed to load ocrd all tool json from: '{OCRD_ALL_TOOL_JSON}', {error}")
def post_to_callback_url(logger, callback_url: str, result_message: OcrdResultMessage):
diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py
index d03c2a920c..2055758a89 100644
--- a/src/ocrd_utils/__init__.py
+++ b/src/ocrd_utils/__init__.py
@@ -169,6 +169,7 @@
disableLogging,
getLevelName,
getLogger,
+ get_logging_config_files,
initLogging,
setOverrideLogLevel,
)
diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py
index 5c99800cc8..b3a3e9537d 100644
--- a/src/ocrd_utils/config.py
+++ b/src/ocrd_utils/config.py
@@ -178,12 +178,12 @@ def _ocrd_download_timeout_parser(val):
default=(True, lambda: Path.home()))
config.add("XDG_DATA_HOME",
- description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)",
+ description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)",
parser=lambda val: Path(val),
default=(True, lambda: Path(config.HOME, '.local/share')))
config.add("XDG_CONFIG_HOME",
- description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)",
+ description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)",
parser=lambda val: Path(val),
default=(True, lambda: Path(config.HOME, '.config')))
diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py
index 6245f99b76..bb771fc0ce 100644
--- a/src/ocrd_utils/logging.py
+++ b/src/ocrd_utils/logging.py
@@ -128,6 +128,19 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG):
print(f'[LOGGING] Overriding ocrd log level to {lvl}', file=sys.stderr)
ocrd_logger.setLevel(lvl)
+def get_logging_config_files():
+    """
+    Return a list of all ``ocrd_logging.conf`` files found in CWD, HOME or /etc.
+
+    The directories are probed in that order and only existing files
+    are included in the result.
+    """
+    search_dirs = (Path.cwd(), Path.home(), Path('/etc'))
+    found = []
+    for directory in search_dirs:
+        candidate = directory / 'ocrd_logging.conf'
+        if candidate.exists():
+            found.append(candidate)
+    return found
+
def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_LOGGING_DEBUG):
"""
Reset ``ocrd`` logger, read logging configuration if exists, otherwise use basicConfig
@@ -164,14 +177,7 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L
config_file = None
if not builtin_only:
- CONFIG_PATHS = [
- Path.cwd(),
- Path.home(),
- Path('/etc'),
- ]
- config_file = [f for f \
- in [p / 'ocrd_logging.conf' for p in CONFIG_PATHS] \
- if f.exists()]
+ config_file = get_logging_config_files()
if config_file:
if len(config_file) > 1 and not silent:
print(f"[LOGGING] Multiple logging configuration files found at {config_file}, using first one", file=sys.stderr)
diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf
index 93e311a882..5cf161398e 100644
--- a/src/ocrd_utils/ocrd_logging.conf
+++ b/src/ocrd_utils/ocrd_logging.conf
@@ -11,7 +11,7 @@
# each logger requires a corresponding configuration section below
#
[loggers]
-keys=root,ocrd,ocrd_network,ocrd_tensorflow,ocrd_shapely_geos,ocrd_PIL,uvicorn,uvicorn_access,uvicorn_error,multipart
+keys=root,ocrd,ocrd_network,ocrd_tensorflow,ocrd_shapely_geos,ocrd_PIL,uvicorn,uvicorn_access,uvicorn_error,multipart,paramiko,paramiko_transport
#
# mandatory handlers section
@@ -91,6 +91,21 @@ level=INFO
handlers=consoleHandler
qualname=PIL
+#
+# paramiko loggers
+#
+[logger_paramiko]
+level=INFO
+handlers=consoleHandler
+qualname=paramiko
+propagate=0
+
+[logger_paramiko_transport]
+level=INFO
+handlers=consoleHandler
+qualname=paramiko.transport
+propagate=0
+
#
# uvicorn loggers
#
diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py
index 1b3ab4e73d..18463de0c0 100644
--- a/src/ocrd_utils/os.py
+++ b/src/ocrd_utils/os.py
@@ -34,6 +34,7 @@
from .constants import EXT_TO_MIME
from .config import config
from .logging import getLogger
+from .introspect import resource_string
def abspath(url):
"""
@@ -79,12 +80,16 @@ def get_ocrd_tool_json(executable):
"""
Get the ``ocrd-tool`` description of ``executable``.
"""
+ ocrd_tool = {}
executable_name = Path(executable).name
try:
- ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout)
- except (JSONDecodeError, OSError) as e:
- getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}')
- ocrd_tool = {}
+ ocrd_all_tool = loads(resource_string('ocrd', 'ocrd-all-tool.json'))
+ ocrd_tool = ocrd_all_tool[executable]
+ except (JSONDecodeError, OSError, KeyError):
+ try:
+ ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout)
+ except (JSONDecodeError, OSError) as e:
+ getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}')
if 'resource_locations' not in ocrd_tool:
ocrd_tool['resource_locations'] = ['data', 'cwd', 'system', 'module']
return ocrd_tool
@@ -93,9 +98,13 @@ def get_ocrd_tool_json(executable):
def get_moduledir(executable):
moduledir = None
try:
- moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n')
- except (JSONDecodeError, OSError) as e:
- getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}')
+ ocrd_all_moduledir = loads(resource_string('ocrd', 'ocrd-all-module-dir.json'))
+ moduledir = ocrd_all_moduledir[executable]
+ except (JSONDecodeError, OSError, KeyError):
+ try:
+ moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n')
+ except (JSONDecodeError, OSError) as e:
+ getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}')
return moduledir
def list_resource_candidates(executable, fname, cwd=getcwd(), moduled=None, xdg_data_home=None):
diff --git a/tests/cli/test_process.py b/tests/cli/test_process.py
index d0e5dc2129..d123e857bd 100644
--- a/tests/cli/test_process.py
+++ b/tests/cli/test_process.py
@@ -1,16 +1,61 @@
+from os.path import exists
+from os import remove, getcwd
+from time import sleep
+from contextlib import ExitStack
+from multiprocessing import Process, set_start_method
+# necessary for macos
+from sys import platform
+if platform == "darwin":
+ set_start_method("fork")
+
+from ocrd import Resolver, Workspace, OcrdMetsServer
from ocrd.cli import process_cli
-from ocrd_utils import pushd_popd, disableLogging
+from ocrd_utils import pushd_popd
from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory
-class TestLogCli(TestCase):
+class TestCli(TestCase):
+
+ def setUp(self):
+ super().setUp()
+ # make sure we get an isolated temporary copy of the testdata each time
+ # as long as we are not using pytest but unittest, we need to manage contexts
+ # (enterContext is only supported starting with py311)
+ with ExitStack() as stack:
+ self.workdir = stack.enter_context(copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')))
+ stack.enter_context(pushd_popd(self.workdir))
+ self.addCleanup(stack.pop_all().close)
def test_cli_process_smoke(self):
- disableLogging()
- with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir:
- with pushd_popd(wsdir):
- with self.assertRaisesRegex(Exception, "Executable not found in PATH: ocrd-foo"):
- self.invoke_cli(process_cli, ['foo'])
+ with self.assertRaisesRegex(Exception, "Executable not found in PATH: ocrd-foo"):
+ self.invoke_cli(process_cli, ['foo'])
+
+ def test_cli_process_dummy(self):
+ code, out, err = self.invoke_cli(process_cli, ['dummy -I OCR-D-GT-PAGE -O OCR-D-DUMMY'])
+ print(code, out, err)
+ self.assertFalse(code)
+ self.assertTrue(exists('OCR-D-DUMMY'))
+
+    def test_cli_process_mets_server(self):
+        """
+        Run ``process_cli`` with ``-U mets.sock`` while an ``OcrdMetsServer``
+        for the current directory is running in a child process, then check
+        that the output file group exists on disk and in the reloaded METS.
+        """
+        # stolen from test_mets_server.fixture_start_mets_server ...
+        def _start_mets_server(*args, **kwargs):
+            mets_server = OcrdMetsServer(*args, **kwargs)
+            mets_server.startup()
+        # remove a stale socket left behind by an earlier run
+        if exists('mets.sock'):
+            remove('mets.sock')
+        ws = Workspace(Resolver(), getcwd())
+        p = Process(target=_start_mets_server, kwargs={'workspace': ws, 'url': 'mets.sock'})
+        p.daemon = True
+        p.start()
+        try:
+            sleep(1) # sleep to start up server
+            self.assertTrue(exists('mets.sock'))
+            code, out, err = self.invoke_cli(process_cli, ['-U', 'mets.sock', 'dummy -I OCR-D-GT-PAGE -O OCR-D-DUMMY'])
+            print(code, out, err)
+            self.assertFalse(code)
+            self.assertTrue(exists('OCR-D-DUMMY'))
+        finally:
+            # always stop the server process, even if an assertion above failed,
+            # so it does not linger for the rest of the test session
+            p.terminate()
+        ws.reload_mets()
+        self.assertIn('OCR-D-DUMMY', ws.mets.file_groups)
if __name__ == '__main__':
main(__file__)
diff --git a/tests/cli/test_resmgr.py b/tests/cli/test_resmgr.py
index 6cec6225b8..9c52100cba 100644
--- a/tests/cli/test_resmgr.py
+++ b/tests/cli/test_resmgr.py
@@ -21,7 +21,7 @@ def test_url_tool_name_unregistered(mgr_with_tmp_path):
"""
We should add a test for the -n URL TOOL NAME use-case as well (both as an unregistered resource and as URL-override).
"""
- tmp_path, mgr, env = mgr_with_tmp_path
+ _, mgr, env = mgr_with_tmp_path
print(mgr.list_installed(executable)[0][1])
rsrcs_before = len(mgr.list_installed(executable)[0][1])
@@ -33,23 +33,28 @@ def test_url_tool_name_unregistered(mgr_with_tmp_path):
rsrcs = mgr.list_installed(executable)[0][1]
assert len(rsrcs) == rsrcs_before + 1
- assert rsrcs[0]['name'] == name
- assert rsrcs[0]['url'] == url
+ assert rsrcs[-1]['name'] == name
+ assert rsrcs[-1]['url'] == url
# add resource with different URL but same name
url2 = url.replace('dzo', 'bos')
- r = runner.invoke(resmgr_cli, ['download', '--allow-uninstalled', '--any-url', url2, executable, name], env=env)
- assert 'already exists but --overwrite is not set' in r.output
+ #
+ # TODO(kba): Silently skipped since https://github.com/OCR-D/core/commit/d5173ada7d052c107c04da8732ccd30f61c4d9a1
+ # so we'd need to check the log output which is not captured by
+ # CliRunner, even though `mix_stderr == True`
+ #
+ # r = runner.invoke(resmgr_cli, ['download', '--allow-uninstalled', '--any-url', url2, executable, name], env=env)
+ # assert 'already exists but --overwrite is not set' in r.output
r = runner.invoke(resmgr_cli, ['download', '--overwrite', '--allow-uninstalled', '--any-url', url2, executable, name], env=env)
- assert 'already exists but --overwrite is not set' not in r.output
+ # assert 'already exists but --overwrite is not set' not in r.output
mgr.load_resource_list(mgr.user_list)
rsrcs = mgr.list_installed(executable)[0][1]
print(rsrcs)
assert len(rsrcs) == rsrcs_before + 1
- assert rsrcs[0]['name'] == name
- assert rsrcs[0]['url'] == url2
+ assert rsrcs[-1]['name'] == name
+ assert rsrcs[-1]['url'] == url2
def test_directory_copy(mgr_with_tmp_path):
"""
@@ -76,13 +81,18 @@ def test_directory_copy(mgr_with_tmp_path):
assert Path(mgr_path / 'ocrd-resources' / proc).exists()
assert directory_size(mgr_path / 'ocrd-resources' / proc / res_name) == 30
- r = runner.invoke(
- resmgr_cli,
- ['download', '--allow-uninstalled', '--any-url', tmp_path, proc, res_name],
- env=env,
- catch_exceptions=False
- )
- assert 'already exists but --overwrite is not set' in r.output
+ #
+ # TODO(kba): Silently skipped since https://github.com/OCR-D/core/commit/d5173ada7d052c107c04da8732ccd30f61c4d9a1
+ # so we'd need to check the log output which is not captured by
+ # CliRunner, even though `mix_stderr == True`
+ #
+ # r = runner.invoke(
+ # resmgr_cli,
+ # ['download', '--allow-uninstalled', '--any-url', tmp_path, proc, res_name],
+ # env=env,
+ # catch_exceptions=False
+ # )
+ # assert 'already exists but --overwrite is not set' in r.output
r = runner.invoke(
resmgr_cli,
['download', '--overwrite', '--allow-uninstalled', '--any-url', tmp_path, proc, res_name],
diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py
index 3bf342b8ef..739db7625a 100644
--- a/tests/model/test_ocrd_mets.py
+++ b/tests/model/test_ocrd_mets.py
@@ -61,10 +61,11 @@ def test_unique_identifier_from_nothing():
def test_str():
- mets = OcrdMets(content='', cache_flag=False)
- assert str(mets) == 'OcrdMets[cached=False,fileGrps=[],files=[]]'
- mets_cached = OcrdMets(content='', cache_flag=True)
- assert str(mets_cached) == 'OcrdMets[cached=True,fileGrps=[],files=[]]'
+ with temp_env_var('OCRD_METS_CACHING', None):
+ mets = OcrdMets(content='', cache_flag=False)
+ assert str(mets) == 'OcrdMets[cached=False,fileGrps=[],files=[]]'
+ mets_cached = OcrdMets(content='', cache_flag=True)
+ assert str(mets_cached) == 'OcrdMets[cached=True,fileGrps=[],files=[]]'
def test_file_groups(sbb_sample_01):
@@ -383,16 +384,20 @@ def test_invalid_filegrp():
@contextmanager
def temp_env_var(k, v):
v_before = environ.get(k, None)
- environ[k] = v
+ if v == None:
+ environ.pop(k, None)
+ else:
+ environ[k] = v
yield
if v_before is not None:
environ[k] = v_before
else:
- del environ[k]
+ environ.pop(k, None)
def test_envvar():
- assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag
- assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag
+ with temp_env_var('OCRD_METS_CACHING', None):
+ assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag
+ assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag
with temp_env_var('OCRD_METS_CACHING', 'true'):
assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag
assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag
diff --git a/tests/network/docker-compose.yml b/tests/network/docker-compose.yml
index a5cef49e23..ec45bc7e8a 100644
--- a/tests/network/docker-compose.yml
+++ b/tests/network/docker-compose.yml
@@ -22,7 +22,7 @@ services:
test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
interval: 1s
timeout: 3s
- retries: 30
+ retries: 90
ocrd_network_rabbit_mq:
image: "rabbitmq:3.12-management"
@@ -42,7 +42,7 @@ services:
test: rabbitmq-diagnostics check_port_connectivity
interval: 1s
timeout: 3s
- retries: 30
+ retries: 90
ocrd_network_processing_server:
image: "ocrd_core_test"
diff --git a/tests/test_decorators.py b/tests/test_decorators.py
index 3c38f20789..5ab2880053 100644
--- a/tests/test_decorators.py
+++ b/tests/test_decorators.py
@@ -15,7 +15,7 @@
ocrd_loglevel,
ocrd_cli_wrap_processor,
) # pylint: disable=protected-access
-from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging
+from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files
@click.command()
@ocrd_cli_options
@@ -60,6 +60,8 @@ def test_loglevel_invalid(self):
assert "'foo' is not one of" in err
def test_loglevel_override(self):
+ if get_logging_config_files():
+ pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test")
import logging
disableLogging()
assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING
diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py
index b1350ed663..1487617a71 100644
--- a/tests/test_mets_server.py
+++ b/tests/test_mets_server.py
@@ -6,8 +6,11 @@
from itertools import repeat
from multiprocessing import Process, Pool, Pipe, set_start_method
-# necessary for macos
-set_start_method("fork")
+try:
+ # necessary for macos
+ set_start_method("fork")
+except RuntimeError:
+ pass
from shutil import rmtree, copytree
from os import remove, stat as os_stat
from os.path import exists
diff --git a/tests/test_resolver.py b/tests/test_resolver.py
index 7e102612e1..c2575b6086 100644
--- a/tests/test_resolver.py
+++ b/tests/test_resolver.py
@@ -117,6 +117,9 @@ def test_workspace_from_url_kant_with_resources(mock_request, tmp_path):
@patch.object(Session, "get")
def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path):
+ """
+ Fail with clobber_mets=False, succeed with clobber_mets=True
+ """
# arrange
url_src = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
@@ -127,12 +130,14 @@ def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp
dst_mets = Path(dst_dir, 'mets.xml')
shutil.copyfile(src_mets, dst_mets)
- # act
- Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir)
+ # fail
+ with pytest.raises(FileExistsError) as exc:
+ Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir)
+ assert mock_request.call_count == 0
- # assert
- # no real request was made, since mets already present
- assert mock_request.call_count == 0
+ # succeed
+ Resolver().workspace_from_url(url_src, clobber_mets=True, dst_dir=dst_dir)
+ assert mock_request.call_count == 1
@patch.object(Session, "get")
@@ -229,7 +234,7 @@ def test_workspace_from_nothing_noclobber(tmp_path):
ws2 = Resolver().workspace_from_nothing(tmp_path)
assert ws2.directory == tmp_path
- with pytest.raises(Exception) as exc:
+ with pytest.raises(FileExistsError) as exc:
Resolver().workspace_from_nothing(tmp_path)
# assert
diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py
index 221b0a3af1..653167e10a 100644
--- a/tests/test_resource_manager.py
+++ b/tests/test_resource_manager.py
@@ -1,9 +1,13 @@
import os
from pathlib import Path
+from shutil import rmtree
# import pdb
-from ocrd.resource_manager import OcrdResourceManager
-from ocrd_utils import config
+# avoid importing early (before mocking environ)
+# because ocrd_utils.config module-level init is crucial:
+#from ocrd.resource_manager import OcrdResourceManager
+#from ocrd_utils import config
+
from ocrd_utils.os import get_ocrd_tool_json
from pytest import raises, fixture
@@ -12,25 +16,31 @@
CONST_RESOURCE_YML = 'resources.yml'
CONST_RESOURCE_URL_LAYOUT = 'https://github.com/tesseract-ocr/tessdata_best/raw/main/bos.traineddata'
+
@fixture(autouse=True)
def drop_get_ocrd_tool_json_cache():
get_ocrd_tool_json.cache_clear()
yield
+
def test_resources_manager_config_default(monkeypatch, tmp_path):
# arrange
monkeypatch.setenv('HOME', str(tmp_path))
if 'XDG_CONFIG_HOME' in os.environ:
monkeypatch.delenv('XDG_CONFIG_HOME', raising=False)
+ if 'XDG_DATA_HOME' in os.environ:
+ monkeypatch.delenv('XDG_DATA_HOME', raising=False)
# act
+ from ocrd.resource_manager import OcrdResourceManager
mgr = OcrdResourceManager()
# assert
default_config_dir = os.path.join(os.environ['HOME'], '.config', 'ocrd')
f = Path(default_config_dir) / CONST_RESOURCE_YML
assert os.environ['HOME'] == str(tmp_path)
+ from ocrd_utils import config
assert config.HOME == tmp_path
assert Path.home() == tmp_path
assert f == mgr.user_list
@@ -54,6 +64,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch):
monkeypatch.setenv('HOME', str(tmp_path))
# act
+ from ocrd.resource_manager import OcrdResourceManager
mgr = OcrdResourceManager()
# assert
@@ -72,6 +83,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch):
def test_resources_manager_config_explicite(tmp_path):
# act
+ from ocrd.resource_manager import OcrdResourceManager
mgr = OcrdResourceManager(xdg_config_home=str(tmp_path / 'config'), xdg_data_home=str(tmp_path / 'data'))
# assert
@@ -85,9 +97,11 @@ def test_resources_manager_config_explicite(tmp_path):
assert fpath.exists()
assert mgr.add_to_user_database(proc, fpath)
+
def test_resources_manager_config_explicit_invalid(tmp_path):
# act
+ from ocrd.resource_manager import OcrdResourceManager
(tmp_path / 'ocrd').mkdir()
(tmp_path / 'ocrd' / CONST_RESOURCE_YML).write_text('::INVALID::')
@@ -95,9 +109,11 @@ def test_resources_manager_config_explicit_invalid(tmp_path):
with raises(ValueError, match='is invalid'):
OcrdResourceManager(xdg_config_home=tmp_path)
+
def test_find_resources(tmp_path):
# act
+ from ocrd.resource_manager import OcrdResourceManager
f = tmp_path / 'ocrd-foo' / 'foo.bar'
f.parent.mkdir()
f.write_text('foobar')
@@ -109,29 +125,39 @@ def test_find_resources(tmp_path):
assert 'ocrd-foo' in [x for x, _ in mgr.list_available()]
assert 'ocrd-foo' in [x for x, _ in mgr.list_available(url='http://foo/bar')]
+
def test_parameter_usage(tmp_path):
+ from ocrd.resource_manager import OcrdResourceManager
mgr = OcrdResourceManager(xdg_config_home=tmp_path)
assert mgr.parameter_usage('foo.bar') == 'foo.bar'
assert mgr.parameter_usage('foo.bar', 'without-extension') == 'foo'
with raises(ValueError, match='No such usage'):
mgr.parameter_usage('foo.bar', 'baz')
+
def test_default_resource_dir(tmp_path):
+ from ocrd.resource_manager import OcrdResourceManager
mgr = OcrdResourceManager(xdg_data_home=tmp_path)
assert mgr.xdg_config_home != mgr.xdg_data_home
assert mgr.default_resource_dir == str(mgr.xdg_data_home / 'ocrd-resources')
+
def test_list_available0(tmp_path):
+ from ocrd.resource_manager import OcrdResourceManager
mgr = OcrdResourceManager(xdg_data_home=tmp_path)
res = mgr.list_available()
assert len(res) > 0
+
def test_list_available_with_unknown_executable(tmp_path):
+ from ocrd.resource_manager import OcrdResourceManager
mgr = OcrdResourceManager(xdg_data_home=tmp_path)
res = mgr.list_available(executable="ocrd-non-existing-processor")
assert len(res[0][1]) == 0
+
def test_date_as_string(tmp_path):
+ from ocrd.resource_manager import OcrdResourceManager
mgr = OcrdResourceManager(xdg_data_home=tmp_path)
test_list = tmp_path / 'test-list.yml'
with open(test_list, 'w', encoding='utf-8') as fout:
@@ -147,7 +173,9 @@ def test_date_as_string(tmp_path):
mgr.load_resource_list(test_list)
mgr.list_available(executable='ocrd-eynollah-segment')
+
def test_download_archive(tmp_path):
+ from ocrd.resource_manager import OcrdResourceManager
mgr = OcrdResourceManager(xdg_data_home=tmp_path)
for archive_type in ('.zip', '.tar.gz', '.tar.xz'):
mgr.download(
@@ -163,5 +191,33 @@ def test_download_archive(tmp_path):
assert filecontent_path.read_text() == '1\n'
+def test_copy_impl(tmp_path):
+ from ocrd.resource_manager import OcrdResourceManager
+ root_dir = f"{tmp_path}/mgr_copy_impl_test"
+ root_dir_copied = f"{tmp_path}/mgr_copy_impl_test_copied"
+ rmtree(path=root_dir, ignore_errors=True)
+ rmtree(path=root_dir_copied, ignore_errors=True)
+
+ def _create_test_folder(test_dir: str, letter: str) -> str:
+ Path(f"{test_dir}/{letter}").mkdir(parents=True, exist_ok=True)
+ file_path = f"{test_dir}/{letter}/{letter}.txt"
+ with open(f"{file_path}", "w") as file:
+ file.write(f"{letter}")
+ return file_path
+
+ _create_test_folder(test_dir=root_dir, letter="a")
+ _create_test_folder(test_dir=root_dir, letter="b")
+ _create_test_folder(test_dir=root_dir, letter="c")
+
+ mgr = OcrdResourceManager(xdg_data_home=tmp_path)
+ mgr._copy_impl(src_filename=root_dir, filename=root_dir_copied)
+
+ assert Path(f"{root_dir_copied}/a/a.txt").exists()
+ assert Path(f"{root_dir_copied}/b/b.txt").exists()
+ assert Path(f"{root_dir_copied}/c/c.txt").exists()
+ rmtree(path=root_dir, ignore_errors=True)
+ rmtree(path=root_dir_copied, ignore_errors=True)
+
+
if __name__ == "__main__":
main(__file__)
diff --git a/tests/utils/test_os.py b/tests/utils/test_os.py
index a651ef608f..f6dbde3549 100644
--- a/tests/utils/test_os.py
+++ b/tests/utils/test_os.py
@@ -11,6 +11,7 @@
redirect_stderr_and_stdout_to_file,
guess_media_type,
)
+from ocrd_utils import config
class TestOsUtils(TestCase):
@@ -26,6 +27,8 @@ def tearDown(self):
def test_resolve_basic(self):
def dehomify(s):
+ if ENV['HOME'] == '/' or expanduser('~') == '/':
+ return s
return s.replace(ENV['HOME'], '$HOME').replace(expanduser('~'), '$HOME')
fname = 'foo.bar'
cands = list_resource_candidates('ocrd-dummy', fname)
@@ -34,7 +37,7 @@ def dehomify(s):
self.assertEqual(cands, [join(x, fname) for x in [
dehomify(join(getcwd())),
dehomify(self.tempdir_path),
- '$HOME/.local/share/ocrd-resources/ocrd-dummy',
+ dehomify(join(config.XDG_DATA_HOME, 'ocrd-resources', 'ocrd-dummy')),
'/usr/local/share/ocrd-resources/ocrd-dummy',
]])