diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 2a1886959d..da01c8cb20 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -6,8 +6,9 @@ on: workflow_dispatch: # run manually env: - GHCRIO_DOCKER_TAG: ghcr.io/ocr-d/core - DOCKERIO_DOCKER_TAG: docker.io/ocrd/core + # FIXME: linux/arm/v7 disabled as long as scikit-build/cmake-python-distributions#503 is unresolved + # PLATFORMS: linux/amd64,linux/arm/v7,linux/arm64/v8,linux/ppc64le + PLATFORMS: linux/amd64,linux/arm64/v8,linux/ppc64le jobs: @@ -17,8 +18,18 @@ jobs: permissions: packages: write contents: read - + env: + DOCKER_BASE_TAG: ghcr.io/ocr-d docker.io/ocrd + # TODO(kba): make the interpolation work correctly + # DOCKER_BUILD: docker buildx build --progress=plain --platform ${{ env.PLATFORMS }} --push + # TODO(kba): Investigate why ppc64le build hangs on "Installing build dependencies" + # TODO(kba): Investigate why arm64 fails with .buildkit_qemu_emulator: /usr/local/bin/conda: Invalid ELF image for this architecture + DOCKER_BUILD: docker buildx build --progress=plain --platform linux/amd64 --push steps: + - name: Export variables + run: | + echo "DOCKER_BASE_TAG=${{ env.DOCKER_BASE_TAG }}" >> $GITHUB_ENV + echo "DOCKER_BUILD=${{ env.DOCKER_BUILD }}" >> $GITHUB_ENV - name: Checkout uses: actions/checkout@v4 with: @@ -28,19 +39,6 @@ jobs: - # Activate cache export feature to reduce build time of images name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Build the Docker image - run: make docker - - name: Build the Docker image with GPU support - run: make docker-cuda - - name: Alias Docker images - # default tag uses docker.io, so tag post-hoc - run: | - docker tag ocrd/core ${{ env.GHCRIO_DOCKER_TAG }} - docker tag ocrd/core-cuda ${{ env.GHCRIO_DOCKER_TAG }}-cuda - - name: Smoke Test that ocrd --help works - run: | - docker run --rm ${{ env.GHCRIO_DOCKER_TAG }} ocrd --version - docker run --rm ${{ 
env.GHCRIO_DOCKER_TAG }}-cuda ocrd --version - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: @@ -52,20 +50,9 @@ jobs: with: username: ${{ secrets.DOCKERIO_USERNAME }} password: ${{ secrets.DOCKERIO_PASSWORD }} - - name: Push images to Github Container Registry - run: | - docker push ${{ env.GHCRIO_DOCKER_TAG }}:latest - docker push ${{ env.GHCRIO_DOCKER_TAG }}-cuda:latest - - name: Push images to Docker Hub - run: | - docker tag ${{ env.GHCRIO_DOCKER_TAG }} ${{ env.DOCKERIO_DOCKER_TAG }} - docker tag ${{ env.GHCRIO_DOCKER_TAG }}-cuda ${{ env.DOCKERIO_DOCKER_TAG }}-cuda - docker push ${{ env.DOCKERIO_DOCKER_TAG }}:latest - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:latest - docker push ${{ env.DOCKERIO_DOCKER_TAG }}:latest - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:latest - docker tag ${{ env.DOCKERIO_DOCKER_TAG }} ${{ env.DOCKERIO_DOCKER_TAG }}:$(git describe --tags --abbrev=0) - docker tag ${{ env.DOCKERIO_DOCKER_TAG }}-cuda ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:$(git describe --tags --abbrev=0) - docker push ${{ env.DOCKERIO_DOCKER_TAG }}:$(git describe --tags --abbrev=0) - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:$(git describe --tags --abbrev=0) - + - name: Build the Docker image + run: make docker + - name: Build the Docker image with GPU support + run: make docker-cuda + - name: Build the Docker images with GPU support and ML frameworks + run: make docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 2b8e3d5b82..2ab5ee46c4 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -50,14 +50,18 @@ jobs: make install deps-test - name: Test with pytest run: | - make test benchmark + mkdir -p test-results + make test benchmark PYTEST_ARGS=--junitxml=test-results/test.xml + - uses: test-summary/action@v2 + with: + paths: "test-results/test.xml" - name: test to ensure that --editable install works run: | make 
install-dev; ocrd --version - name: Lint with flake8 run: | - python -m pip install flake8 + python -m pip install flake8 flake8-github # stop the build if there are Python syntax errors or undefined names - flake8 src --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics + flake8 src --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics --format=github # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 src --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + flake8 src --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --format=github diff --git a/.scrutinizer.yml b/.scrutinizer.yml index 4848dca46a..83cb1c8521 100644 --- a/.scrutinizer.yml +++ b/.scrutinizer.yml @@ -7,12 +7,16 @@ build: python: version: 3.8.2 virtualenv: true + variables: + DEBIAN_FRONTEND: noninteractive nodes: analysis: dependencies: override: - - sudo make deps-ubuntu - - make install + - echo "Skipped" + # - command: sudo make deps-ubuntu + # idle_timeout: 600 + # - make install tests: override: - py-scrutinizer-run diff --git a/CHANGELOG.md b/CHANGELOG.md index 049f804b3e..dd816a3545 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,58 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +Changed: + + * ocrd_network: Use `ocrd-all-tool.json` bundled by core instead of downloading from website, #1257, #1260 + * `ocrd workspace clone`/`Resolver.workspace_from_url`: with `clobber_mets=False`, raise a FileExistsError for existing mets.xml on disk, #563, #1268 + * `ocrd workspace find --download`: print the correct, up-to-date field, not `None`, #1202, #1266 + +## [2.67.2] - 2024-07-19 + +Fixed: + + * Run `multiprocessing.set_start_method('fork')` only for OSX, #1261 + * Broken PyPI release, #1262 + +## [2.67.1] - 2024-07-17 + +Fixed: + + - Build and tests fixed, no functional changes from #1258 + +## [2.67.0] - 2024-07-16 + +Changed: + + - Additional docker base images with preinstalled tensorflow 1 (`core-cuda-tf1`), tensorflow 2 (`core-cuda-tf2`) and torch (`core-cuda-torch`), #1239 + - Resource Manager: Skip download instead of raising an exception if target file already exists (unless `--overwrite`), #1246 + - Resource Manager: Try to use bundled `ocrd-all-tool.json` if available, #1250, OCR-D/all#444 + +Added: + + - `ocrd process` does support `-U/--mets-server-url`, #1243 + +Fixed: + + - `ocrd process`-derived tasks are not run in a temporary directory when not called from within workspace, #1243 + - regression from #1238 where processors failed that had required parameters, #1255, #1256 + - METS Server: Unlink UDS socket file if it exists before startup, #1244 + - Resource Manager: Do not create zero-size files for failing downloads, #1201, #1246 + - Workspace.add_file: Allow multiple processors to create file group folders simultaneously, #1203, #1253 + - Resource Manager: Do not try to run `--dump-json` for known non-processors `ocrd-{cis-data,import,make}`, #1218, #1249 + - Resource Manager: Properly handle copying of directories, #1237, #1248 + - bashlib: regression in parsing JSON from introducing parameter preset files, #1258 + +Removed: + + - Defaults for `-I/--input-file-grp`/`-O/--output-file-grp`, #1256, #274 + +## [2.66.1] - 
2024-06-26 + +Fixed: + + * GHA Docker: build docker.io first, then tag ghcr.io + ## [2.66.0] - 2024-06-07 Fixed: @@ -2092,8 +2144,12 @@ Fixed ## [0.0.1] - 2018-04-17 Initial Release -] + +[2.67.2]: ../../compare/v2.67.2..v2.67.1 +[2.67.1]: ../../compare/v2.67.1..v2.67.0 +[2.67.0]: ../../compare/v2.67.0..v2.66.1 +[2.66.1]: ../../compare/v2.66.1..v2.66.0 [2.66.0]: ../../compare/v2.66.0..v2.65.0 [2.65.0]: ../../compare/v2.65.0..v2.64.1 [2.64.1]: ../../compare/v2.64.1..v2.64.0 diff --git a/Dockerfile b/Dockerfile index fd57e5014d..144ae774dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,6 +38,8 @@ RUN python3 -m venv /usr/local \ && hash -r \ && make install-dev \ && eval $FIXUP +# Smoke Test +RUN ocrd --version WORKDIR /data diff --git a/Makefile b/Makefile index 1b4ef47bd2..0608b0b738 100644 --- a/Makefile +++ b/Makefile @@ -151,7 +151,7 @@ deps-tf1: fi deps-tf2: - if $(PYTHON) -c 'import sys; print("%u.%u" % (sys.version_info.major, sys.version_info.minor))' | fgrep 3.8 && \ + if $(PYTHON) -c 'import sys; print("%u.%u" % (sys.version_info.major, sys.version_info.minor))' | fgrep 3.8; then \ $(PIP) install tensorflow; \ else \ $(PIP) install "tensorflow[and-cuda]"; \ @@ -162,7 +162,7 @@ deps-torch: # Dependencies for deployment in an ubuntu/debian linux deps-ubuntu: - apt-get install -y python3 imagemagick libgeos-dev + apt-get install -y python3 imagemagick libgeos-dev libxml2-dev libxslt-dev libssl-dev # Install test python deps via pip deps-test: @@ -361,44 +361,46 @@ pyclean: .PHONY: docker docker-cuda # Additional arguments to docker build. 
Default: '$(DOCKER_ARGS)' -DOCKER_ARGS = +DOCKER_ARGS ?= +DOCKER_BASE_TAG ?= ocrd +DOCKER_BUILD ?= docker build --progress=plain # Build docker image docker: DOCKER_BASE_IMAGE = ubuntu:20.04 -docker: DOCKER_TAG = ocrd/core +docker: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core) docker: DOCKER_FILE = Dockerfile # Build extended sets for maximal layer sharing -docker-cuda: DOCKER_BASE_IMAGE = ocrd/core -docker-cuda: DOCKER_TAG = ocrd/core-cuda +docker-cuda: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core +docker-cuda: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda) docker-cuda: DOCKER_FILE = Dockerfile.cuda docker-cuda: docker -docker-cuda-tf1: DOCKER_BASE_IMAGE = ocrd/core-cuda -docker-cuda-tf1: DOCKER_TAG = ocrd/core-cuda-tf1 +docker-cuda-tf1: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda +docker-cuda-tf1: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda-tf1) docker-cuda-tf1: DOCKER_FILE = Dockerfile.cuda-tf1 docker-cuda-tf1: docker-cuda -docker-cuda-tf2: DOCKER_BASE_IMAGE = ocrd/core-cuda -docker-cuda-tf2: DOCKER_TAG = ocrd/core-cuda-tf2 +docker-cuda-tf2: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda +docker-cuda-tf2: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda-tf2) docker-cuda-tf2: DOCKER_FILE = Dockerfile.cuda-tf2 docker-cuda-tf2: docker-cuda -docker-cuda-torch: DOCKER_BASE_IMAGE = ocrd/core-cuda -docker-cuda-torch: DOCKER_TAG = ocrd/core-cuda-torch +docker-cuda-torch: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda +docker-cuda-torch: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda-torch) docker-cuda-torch: DOCKER_FILE = Dockerfile.cuda-torch docker-cuda-torch: docker-cuda -docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: - docker build --progress=plain -f $(DOCKER_FILE) -t $(DOCKER_TAG) --target ocrd_core_base --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) $(DOCKER_ARGS) . 
+docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: + $(DOCKER_BUILD) -f $(DOCKER_FILE) $(DOCKER_TAG:%=-t %) --target ocrd_core_base --build-arg BASE_IMAGE=$(lastword $(DOCKER_BASE_IMAGE)) $(DOCKER_ARGS) . # Build wheels and source dist and twine upload them pypi: build - twine upload dist/ocrd-$(VERSION)*{tar.gz,whl} + twine upload --verbose dist/ocrd-$(VERSION)*{tar.gz,whl} pypi-workaround: build-workaround for dist in $(BUILD_ORDER);do twine upload dist/$$dist-$(VERSION)*{tar.gz,whl};done diff --git a/VERSION b/VERSION index a6f4248b2f..2a94548735 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0a1 +3.0.0a1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 672f9bc66c..ed5fd56d59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ atomicwrites >= 1.3.0 beanie~=1.7 click >=7 +cryptography < 43.0.0 Deprecated == 1.2.0 docker fastapi>=0.78.0 @@ -27,9 +28,8 @@ pydantic==1.* python-magic python-multipart pyyaml -requests < 2.30 -requests_unixsocket +requests +requests_unixsocket2 shapely uvicorn uvicorn>=0.17.6 - diff --git a/requirements_test.txt b/requirements_test.txt index 0f0e5b97d4..d8cef1dae7 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -1,4 +1,5 @@ autopep8 +cryptography < 43.0.0 pytest >= 4.0.0 generateDS == 2.35.20 pytest-benchmark >= 3.2.3 diff --git a/src/ocrd/cli/process.py b/src/ocrd/cli/process.py index b71b74d096..9dcd562644 100644 --- a/src/ocrd/cli/process.py +++ b/src/ocrd/cli/process.py @@ -19,14 +19,15 @@ @click.command('process') @ocrd_loglevel @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME) +@click.option('-U', '--mets-server-url', help="TCP host URI or UDS path of METS server") @click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist") @click.argument('tasks', nargs=-1, required=True) -def 
process_cli(log_level, mets, page_id, tasks, overwrite): +def process_cli(log_level, mets, mets_server_url, page_id, tasks, overwrite): """ Process a series of tasks """ initLogging() log = getLogger('ocrd.cli.process') - run_tasks(mets, log_level, page_id, tasks, overwrite) + run_tasks(mets, log_level, page_id, tasks, overwrite=overwrite, mets_server_url=mets_server_url) log.info("Finished") diff --git a/src/ocrd/cli/resmgr.py b/src/ocrd/cli/resmgr.py index 1289e498e1..6ddc9a91bf 100644 --- a/src/ocrd/cli/resmgr.py +++ b/src/ocrd/cli/resmgr.py @@ -13,29 +13,20 @@ import requests import click -from ocrd_utils import ( - initLogging, - directory_size, - getLogger, - get_ocrd_tool_json, - get_moduledir, - RESOURCE_LOCATIONS, -) +from ocrd_utils import directory_size, getLogger, get_moduledir, get_ocrd_tool_json, initLogging, RESOURCE_LOCATIONS from ocrd.constants import RESOURCE_USER_LIST_COMMENT from ..resource_manager import OcrdResourceManager + def print_resources(executable, reslist, resmgr): - print('%s' % executable) + print(f"{executable}") for resdict in reslist: - print('- %s %s (%s)\n %s' % ( - resdict['name'], - '@ %s' % resmgr.resource_dir_to_location(resdict['path']) if 'path' in resdict else '', - resdict['url'], - resdict['description'] - )) + res_loc = resmgr.resource_dir_to_location(resdict['path']) if 'path' in resdict else '' + print(f"- {resdict['name']} @ {res_loc} ({resdict['url']})\n {resdict['description']}") print() + @click.group("resmgr") def resmgr_cli(): """ @@ -43,9 +34,12 @@ def resmgr_cli(): """ initLogging() + @resmgr_cli.command('list-available') -@click.option('-D', '--no-dynamic', is_flag=True, default=False, help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources") -@click.option('-e', '--executable', help='Show only resources for executable beginning with EXEC', metavar='EXEC', default='ocrd-*') +@click.option('-D', '--no-dynamic', is_flag=True, default=False, + help="Whether to 
skip looking into each processor's --dump-{json,module-dir} for module-level resources") +@click.option('-e', '--executable', metavar='EXEC', default='ocrd-*', + help='Show only resources for executable beginning with EXEC', ) def list_available(executable, no_dynamic): """ List available resources @@ -54,6 +48,7 @@ def list_available(executable, no_dynamic): for executable, reslist in resmgr.list_available(executable=executable, dynamic=not no_dynamic): print_resources(executable, reslist, resmgr) + @resmgr_cli.command('list-installed') @click.option('-e', '--executable', help='Show only resources for executable EXEC', metavar='EXEC') def list_installed(executable=None): @@ -64,17 +59,24 @@ def list_installed(executable=None): for executable, reslist in resmgr.list_installed(executable): print_resources(executable, reslist, resmgr) + @resmgr_cli.command('download') -@click.option('-n', '--any-url', help='URL of unregistered resource to download/copy from', default='') -@click.option('-D', '--no-dynamic', is_flag=True, default=False, help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources") -@click.option('-t', '--resource-type', help='Type of resource', type=click.Choice(['file', 'directory', 'archive']), default='file') -@click.option('-P', '--path-in-archive', help='Path to extract in case of archive type', default='.') -@click.option('-a', '--allow-uninstalled', help="Allow installing resources for uninstalled processors", is_flag=True) +@click.option('-n', '--any-url', default='', help='URL of unregistered resource to download/copy from') +@click.option('-D', '--no-dynamic', default=False, is_flag=True, + help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources") +@click.option('-t', '--resource-type', type=click.Choice(['file', 'directory', 'archive']), default='file', + help='Type of resource',) +@click.option('-P', '--path-in-archive', default='.', help='Path to 
extract in case of archive type') +@click.option('-a', '--allow-uninstalled', is_flag=True, + help="Allow installing resources for uninstalled processors",) @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True) -@click.option('-l', '--location', help="Where to store resources - defaults to first location in processor's 'resource_locations' list or finally 'data'", type=click.Choice(RESOURCE_LOCATIONS)) +@click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS), + help="Where to store resources - defaults to first location in processor's 'resource_locations' " + "list or finally 'data'") @click.argument('executable', required=True) @click.argument('name', required=False) -def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstalled, overwrite, location, executable, name): +def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstalled, overwrite, location, executable, + name): """ Download resource NAME for processor EXECUTABLE. @@ -91,7 +93,7 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal log = getLogger('ocrd.cli.resmgr') resmgr = OcrdResourceManager() if executable != '*' and not name: - log.error("Unless EXECUTABLE ('%s') is the '*' wildcard, NAME is required" % executable) + log.error(f"Unless EXECUTABLE ('{executable}') is the '*' wildcard, NAME is required") sys.exit(1) elif executable == '*': executable = None @@ -101,19 +103,21 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal is_filename = Path(any_url).exists() if any_url else False if executable and not which(executable): if not allow_uninstalled: - log.error("Executable '%s' is not installed. " \ - "To download resources anyway, use the -a/--allow-uninstalled flag", executable) + log.error(f"Executable '{executable}' is not installed. 
" + f"To download resources anyway, use the -a/--allow-uninstalled flag") sys.exit(1) else: - log.info("Executable %s is not installed, but " \ - "downloading resources anyway", executable) + log.info(f"Executable '{executable}' is not installed, but downloading resources anyway") reslist = resmgr.list_available(executable=executable, dynamic=not no_dynamic, name=name) if not any(r[1] for r in reslist): log.info(f"No resources {name} found in registry for executable {executable}") if executable and name: - reslist = [(executable, [{'url': any_url or '???', 'name': name, - 'type': resource_type, - 'path_in_archive': path_in_archive}])] + reslist = [(executable, [{ + 'url': any_url or '???', + 'name': name, + 'type': resource_type, + 'path_in_archive': path_in_archive}] + )] for this_executable, this_reslist in reslist: for resdict in this_reslist: if 'size' in resdict: @@ -123,15 +127,15 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal if any_url: resdict['url'] = any_url if resdict['url'] == '???': - log.warning("Cannot download user resource %s", resdict['name']) + log.warning(f"Cannot download user resource {resdict['name']}") continue if resdict['url'].startswith('https://') or resdict['url'].startswith('http://'): - log.info("Downloading %s resource '%s' (%s)", registered, resdict['name'], resdict['url']) + log.info(f"Downloading {registered} resource '{resdict['name']}' ({resdict['url']})") if 'size' not in resdict: with requests.head(resdict['url']) as r: resdict['size'] = int(r.headers.get('content-length', 0)) else: - log.info("Copying %s resource '%s' (%s)", registered, resdict['name'], resdict['url']) + log.info(f"Copying {registered} resource '{resdict['name']}' ({resdict['url']})") urlpath = Path(resdict['url']) resdict['url'] = str(urlpath.resolve()) if Path(urlpath).is_dir(): @@ -141,7 +145,8 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal if not location: location = 
get_ocrd_tool_json(this_executable)['resource_locations'][0] elif location not in get_ocrd_tool_json(this_executable)['resource_locations']: - log.error("The selected --location {location} is not in the {this_executable}'s resource search path, refusing to install to invalid location") + log.error(f"The selected --location {location} is not in the {this_executable}'s resource search path, " + f"refusing to install to invalid location") sys.exit(1) if location != 'module': basedir = resmgr.location_to_resource_dir(location) @@ -164,13 +169,16 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal progress_cb=lambda delta: bar.update(delta) ) if registered == 'unregistered': - log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, name, any_url, resmgr.user_list) + log.info(f"{this_executable} resource '{name}' ({any_url}) not a known resource, creating stub " + f"in {resmgr.user_list}'") resmgr.add_to_user_database(this_executable, fpath, url=any_url) resmgr.save_user_list() - log.info("Installed resource %s under %s", resdict['url'], fpath) + log.info(f"Installed resource {resdict['url']} under {fpath}") except FileExistsError as exc: log.info(str(exc)) - log.info("Use in parameters as '%s'", resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))) + log.info(f"Use in parameters as " + f"'{resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))}'") + @resmgr_cli.command('migrate') @click.argument('migration', type=click.Choice(['2.37.0'])) @@ -203,7 +211,6 @@ def migrate(migration): v_out = 'directory' resdict_out[k_out] = v_out yaml_out[executable].append(resdict_out) - resmgr.user_list.write_text(RESOURCE_USER_LIST_COMMENT + - '\n# migrated with ocrd resmgr migrate {migration}\n' + - safe_dump(yaml_out)) + resmgr.user_list.write_text( + RESOURCE_USER_LIST_COMMENT + f'\n# migrated with ocrd resmgr migrate {migration}\n' + 
safe_dump(yaml_out)) log.info(f'Applied migration {migration} to {resmgr.user_list}') diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 52f48b7c09..0c70fd3a36 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -48,7 +48,7 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met @click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]"') @click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory') @click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL") -@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host of METS server") +@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host URI or UDS path of METS server") @click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True) @click.pass_context def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup): @@ -467,19 +467,18 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp, ): - ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field] if download and not f.local_filename: workspace.download_file(f) modified_mets = True if wait: time.sleep(wait) if undo_download and f.url and f.local_filename: - ret_entry = [f'Removed local_filename {f.local_filename}'] f.local_filename = None modified_mets = True if not keep_files: ctx.log.debug("rm %s [cwd=%s]", f.local_filename, workspace.directory) unlink(f.local_filename) + ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field] ret.append(ret_entry) if 
modified_mets: workspace.save_mets() diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index a65ca35cd6..1e3ecfc6eb 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -146,7 +146,7 @@ ocrd__parse_argv () { -D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;; -C|--show-resource) ocrd__show_resource "$2"; exit ;; -L|--list-resources) ocrd__list_resources; exit ;; - -p|--parameter) __parameters+=(-p $(ocrd__resolve_resource "$2" 2>/dev/null || echo "$2")) ; shift ;; + -p|--parameter) __parameters+=(-p "$(ocrd__resolve_resource "$2" 2>/dev/null || echo "$2")") ; shift ;; -P|--parameter-override) __parameter_overrides+=(-P "$2" "$3") ; shift ; shift ;; -g|--page-id) ocrd__argv[page_id]=$2 ; shift ;; -O|--output-file-grp) ocrd__argv[output_file_grp]=$2 ; shift ;; diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 5131f3f05c..da6e873c06 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -527,6 +527,9 @@ async def add_file( # Create socket and change to world-readable and -writable to avoid permission errors self.log.debug(f"chmod 0o677 {self.url}") server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + if Path(self.url).exists() and not is_socket_in_use(self.url): + # remove leftover unused socket which blocks startup + Path(self.url).unlink() server.bind(self.url) # creates the socket file atexit.register(self.shutdown) server.close() @@ -540,3 +543,14 @@ async def add_file( self.log.debug("Starting uvicorn") uvicorn.run(app, **uvicorn_kwargs) + + +def is_socket_in_use(socket_path): + if Path(socket_path).exists(): + client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + try: + client.connect(socket_path) + except OSError: + return False + client.close() + return True diff --git a/src/ocrd/ocrd-all-tool.json b/src/ocrd/ocrd-all-tool.json new file mode 100644 index 0000000000..fee8e7ef62 --- /dev/null +++ b/src/ocrd/ocrd-all-tool.json @@ -0,0 +1,21 @@ +{ + "ocrd-dummy": { + "executable": "ocrd-dummy", + 
"description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group", + "steps": [ + "preprocessing/optimization" + ], + "categories": [ + "Image preprocessing" + ], + "input_file_grp": "DUMMY_INPUT", + "output_file_grp": "DUMMY_OUTPUT", + "parameters": { + "copy_files": { + "type": "boolean", + "default": false, + "description": "Whether to actually copy files (true) or just create PAGE-XML as a side effect (false)" + } + } + } +} \ No newline at end of file diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5cde4d9fe2..43aec4ace0 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -168,6 +168,7 @@ def __init__( if parameter is None: parameter = {} parameterValidator = ParameterValidator(self.ocrd_tool) + report = parameterValidator.validate(parameter) if not report.is_valid: raise ValueError("Invalid parameters %s" % report.errors) @@ -427,6 +428,7 @@ def show_resource(self, val): Args: val (string): resource value to show """ + res_fname = self.resolve_resource(val) fpath = Path(res_fname) if fpath.is_dir(): diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py index 4b8fe6b217..124d006927 100644 --- a/src/ocrd/resolver.py +++ b/src/ocrd/resolver.py @@ -95,12 +95,15 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url)) return str(ret) - # Respect 'if_exists' arg + # Respect 'if_exists' kwarg if dst_path.exists(): if if_exists == 'skip': + log.debug(f"File already exists but if_exists == {if_exists}, skipping.") return str(ret) - if if_exists == 'raise': - raise FileExistsError(f"File already exists and if_exists == 'raise': {dst_path}") + elif if_exists == 'raise': + raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}") + else: + log.debug(f"File already exists but if_exists == {if_exists}, overwriting.") # 
Create dst_path parent dir dst_path.parent.mkdir(parents=True, exist_ok=True) @@ -174,6 +177,9 @@ def workspace_from_url( By default existing ``mets.xml`` will raise an exception. download (boolean, False): Whether to also download all the files referenced by the METS src_baseurl (string, None): Base URL for resolving relative file locations + mets_server_url (string, None): URI of TCP or local path of UDS for METS server handling + the `OcrdMets` of the workspace. By default the METS will be read from and written to + the filesystem directly. **kwargs (): Passed on to ``OcrdMets.find_files`` if download == True Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless @@ -215,7 +221,7 @@ def workspace_from_url( log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'", mets_basename, mets_url, src_baseurl, dst_dir) - self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip') + self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise') workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index c668028e9c..44bbd081bc 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -1,6 +1,6 @@ from pathlib import Path from os.path import join -from os import environ, listdir, getcwd, path, unlink +from os import environ, listdir, makedirs, getcwd, path, unlink from shutil import copytree, rmtree, copy from fnmatch import filter as apply_glob from datetime import datetime @@ -24,7 +24,8 @@ from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT -class OcrdResourceManager(): + +class OcrdResourceManager: 
""" Managing processor resources @@ -81,7 +82,7 @@ def load_resource_list(self, list_filename, database=None): report = OcrdResourceListValidator.validate(list_loaded) if not report.is_valid: self.log.error('\n'.join(report.errors)) - raise ValueError("Resource list %s is invalid!" % (list_filename)) + raise ValueError(f"Resource list {list_filename} is invalid!") for executable, resource_list in list_loaded.items(): if executable not in database: database[executable] = [] @@ -98,8 +99,14 @@ def list_available(self, executable=None, dynamic=True, name=None, database=None if not executable: return database.items() if dynamic: + skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"] for exec_dir in environ['PATH'].split(':'): for exec_path in Path(exec_dir).glob(f'{executable}'): + if not exec_path.name.startswith('ocrd-'): + self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix") + if exec_path.name in skip_executables: + self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'") + continue self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources") ocrd_tool = get_ocrd_tool_json(exec_path) for resdict in ocrd_tool.get('resources', ()): @@ -176,7 +183,8 @@ def add_to_user_database(self, executable, res_filename, url=None, resource_type Add a stub entry to the user resource.yml """ res_name = Path(res_filename).name - self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", executable, res_name, str(res_filename), self.user_list) + self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, " + f"creating stub in {self.user_list}'") if Path(res_filename).is_dir(): res_size = directory_size(res_filename) else: @@ -190,7 +198,7 @@ def add_to_user_database(self, executable, res_filename, url=None, resource_type resdict = { 'name': res_name, 'url': url if url else '???', - 'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), 
datetime.now()), + 'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}', 'version_range': '???', 'type': resource_type, 'size': res_size @@ -218,74 +226,82 @@ def resource_dir_to_location(self, resource_path): 'cwd' if resource_path.startswith(getcwd()) else \ resource_path - def parameter_usage(self, name, usage='as-is'): + @staticmethod + def parameter_usage(name, usage='as-is'): if usage == 'as-is': return name elif usage == 'without-extension': return Path(name).stem - raise ValueError("No such usage '%s'" % usage) + raise ValueError(f"No such usage '{usage}'") - def _download_impl(self, url, filename, progress_cb=None, size=None): + @staticmethod + def _download_impl(url, filename, progress_cb=None, size=None): log = getLogger('ocrd.resource_manager._download_impl') - log.info("Downloading %s to %s" % (url, filename)) - with open(filename, 'wb') as f: + log.info(f"Downloading {url} to {filename}") + try: gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False) if gdrive_file_id: if not is_gdrive_download_link: - url = "https://drive.google.com/uc?id={id}".format(id=gdrive_file_id) + url = f"https://drive.google.com/uc?id={gdrive_file_id}" try: with requests.get(url, stream=True) as r: if "Content-Disposition" not in r.headers: url = get_url_from_gdrive_confirmation(r.text) except RuntimeError as e: log.warning("Cannot unwrap Google Drive URL: ", e) - with requests.get(url, stream=True) as r: - r.raise_for_status() - for data in r.iter_content(chunk_size=4096): + with open(filename, 'wb') as f: + with requests.get(url, stream=True) as r: + r.raise_for_status() + for data in r.iter_content(chunk_size=4096): + if progress_cb: + progress_cb(len(data)) + f.write(data) + except Exception as e: + rmtree(filename, ignore_errors=True) + Path(filename).unlink(missing_ok=True) + raise e + + @staticmethod + def _copy_file(src, dst, progress_cb=None): + log = getLogger('ocrd.resource_manager._copy_file') + 
log.info(f"Copying file {src} to {dst}") + with open(dst, 'wb') as f_out, open(src, 'rb') as f_in: + while True: + chunk = f_in.read(4096) + if chunk: + f_out.write(chunk) if progress_cb: - progress_cb(len(data)) - f.write(data) + progress_cb(len(chunk)) + else: + break - def _copy_impl(self, src_filename, filename, progress_cb=None): + @staticmethod + def _copy_dir(src, dst, progress_cb=None): + log = getLogger('ocrd.resource_manager._copy_dir') + log.info(f"Copying dir recursively from {src} to {dst}") + if not Path(src).is_dir(): + raise ValueError(f"The source is not a directory: {src}") + Path(dst).mkdir(parents=True, exist_ok=True) + for child in Path(src).rglob('*'): + child_dst = Path(dst) / child.relative_to(src) + if Path(child).is_dir(): + OcrdResourceManager._copy_dir(child, child_dst, progress_cb) + else: + OcrdResourceManager._copy_file(child, child_dst, progress_cb) + + @staticmethod + def _copy_impl(src_filename, filename, progress_cb=None): log = getLogger('ocrd.resource_manager._copy_impl') - log.info("Copying %s to %s", src_filename, filename) + log.info(f"Copying {src_filename} to {filename}") if Path(src_filename).is_dir(): - log.info(f"Copying recursively from {src_filename} to {filename}") - for child in Path(src_filename).rglob('*'): - child_dst = Path(filename) / child.relative_to(src_filename) - child_dst.parent.mkdir(parents=True, exist_ok=True) - with open(child_dst, 'wb') as f_out, open(child, 'rb') as f_in: - while True: - chunk = f_in.read(4096) - if chunk: - f_out.write(chunk) - if progress_cb: - progress_cb(len(chunk)) - else: - break + OcrdResourceManager._copy_dir(src_filename, filename, progress_cb) else: - with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in: - while True: - chunk = f_in.read(4096) - if chunk: - f_out.write(chunk) - if progress_cb: - progress_cb(len(chunk)) - else: - break + OcrdResourceManager._copy_file(src_filename, filename, progress_cb) # TODO Proper caching (make head request for size, 
If-Modified etc) def download( - self, - executable, - url, - basedir, - overwrite=False, - no_subdir=False, - name=None, - resource_type='file', - path_in_archive='.', - progress_cb=None, + self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file', + path_in_archive='.', progress_cb=None, ): """ Download a resource by URL @@ -299,12 +315,15 @@ def download( is_url = url.startswith('https://') or url.startswith('http://') if fpath.exists(): if not overwrite: - raise FileExistsError("%s %s already exists but --overwrite is not set" % ('Directory' if fpath.is_dir() else 'File', fpath)) + fpath_type = 'Directory' if fpath.is_dir() else 'File' + log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download") + # raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set") + return fpath if fpath.is_dir(): - log.info("Removing existing target directory {fpath}") + log.info(f"Removing existing target directory {fpath}") rmtree(str(fpath)) else: - log.info("Removing existing target file {fpath}") + log.info(f"Removing existing target file {fpath}") unlink(str(fpath)) destdir.mkdir(parents=True, exist_ok=True) if resource_type in ('file', 'directory'): @@ -322,7 +341,7 @@ def download( Path('out').mkdir() with pushd_popd('out'): mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream') - log.info("Extracting %s archive to %s/out" % (mimetype, tempdir)) + log.info(f"Extracting {mimetype} archive to {tempdir}/out") if mimetype == 'application/zip': with ZipFile(f'../{archive_fname}', 'r') as zipf: zipf.extractall() @@ -330,8 +349,8 @@ def download( with open_tarfile(f'../{archive_fname}', 'r:*') as tar: tar.extractall() else: - raise RuntimeError("Unable to handle extraction of %s archive %s" % (mimetype, url)) - log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath)) + raise RuntimeError(f"Unable to handle extraction of 
{mimetype} archive {url}") + log.info(f"Copying '{path_in_archive}' from archive to {fpath}") if Path(path_in_archive).is_dir(): copytree(path_in_archive, str(fpath)) else: diff --git a/src/ocrd/task_sequence.py b/src/ocrd/task_sequence.py index da691fbc1d..85e30b177c 100644 --- a/src/ocrd/task_sequence.py +++ b/src/ocrd/task_sequence.py @@ -115,9 +115,11 @@ def validate_tasks(tasks, workspace, page_id=None, overwrite=False): return report -def run_tasks(mets, log_level, page_id, task_strs, overwrite=False): +def run_tasks(mets, log_level, page_id, task_strs, overwrite=False, mets_server_url=None): resolver = Resolver() - workspace = resolver.workspace_from_url(mets) + workdir, mets, basename, _ = resolver.resolve_mets_arguments(None, mets, None) + workspace = resolver.workspace_from_url(mets, workdir, mets_basename=basename, + mets_server_url=mets_server_url) log = getLogger('ocrd.task_sequence.run_tasks') tasks = [ProcessorTask.parse(task_str) for task_str in task_strs] @@ -139,7 +141,8 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False): overwrite=overwrite, input_file_grp=','.join(task.input_file_grps), output_file_grp=','.join(task.output_file_grps), - parameter=json.dumps(task.parameters) + parameter=json.dumps(task.parameters), + mets_server_url=mets_server_url ) # check return code @@ -149,7 +152,8 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False): log.info("Finished processing task '%s'", task) # reload mets - workspace.reload_mets() + if mets_server_url is None: + workspace.reload_mets() # check output file groups are in mets for output_file_grp in task.output_file_grps: diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index bd9e4c5025..fc619b7d0b 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -64,13 +64,17 @@ class Workspace(): :py:class:`ocrd.resolver.Resolver`. 
Args: - - directory (string) : Filesystem folder to work in + resolver (:py:class:`ocrd.Resolver`) : `Resolver` instance + directory (string) : Filesystem path to work in mets (:py:class:`ocrd_models.ocrd_mets.OcrdMets`) : `OcrdMets` representing this workspace. - Loaded from `'mets.xml'` if `None`. - mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url. - overwrite_mode (boolean) : Whether to force add operations on this workspace globally - baseurl (string) : Base URL to prefix to relative URL. + If `None`, then loaded from ``directory``/``mets_basename`` + or delegated to ``mets_server_url``. + mets_basename (string, mets.xml) : Basename of the METS XML file in the workspace directory. + mets_server_url (string, None) : URI of TCP or local path of UDS for METS server handling the + `OcrdMets` of this workspace. If `None`, then the METS will be read from and written to + the filesystem directly. + baseurl (string, None) : Base URL to prefix to relative URL. 
+ overwrite_mode (boolean, False) : Whether to force add operations on this workspace globally """ def __init__( @@ -422,7 +426,7 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi # If the local filename has folder components, create those folders local_filename_dir = str(kwargs['local_filename']).rsplit('/', 1)[0] if local_filename_dir != str(kwargs['local_filename']) and not Path(local_filename_dir).is_dir(): - makedirs(local_filename_dir) + makedirs(local_filename_dir, exist_ok=True) # print(kwargs) kwargs["pageId"] = kwargs.pop("page_id") diff --git a/src/ocrd_network/constants.py b/src/ocrd_network/constants.py index 53dbd9b11b..bfa137d9e0 100644 --- a/src/ocrd_network/constants.py +++ b/src/ocrd_network/constants.py @@ -6,7 +6,7 @@ DOCKER_RABBIT_MQ_FEATURES = "quorum_queue,implicit_default_bindings,classic_mirrored_queue_version" NETWORK_PROTOCOLS = ["http://", "https://"] -OCRD_ALL_JSON_TOOLS_URL = "https://ocr-d.de/js/ocrd-all-tool.json" +OCRD_ALL_TOOL_JSON = "ocrd-all-tool.json" # Used as a placeholder to lock all pages when no page_id is specified SERVER_ALL_PAGES_PLACEHOLDER = "all_pages" diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index dbbdea6475..e142802268 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -10,7 +10,7 @@ from ocrd.task_sequence import ProcessorTask from ocrd_utils import initLogging, getLogger -from .constants import AgentType, JobState, OCRD_ALL_JSON_TOOLS_URL, ServerApiTags +from .constants import AgentType, JobState, ServerApiTags from .database import ( initiate_database, db_get_processing_job, @@ -58,7 +58,7 @@ ) from .tcp_to_uds_mets_proxy import MetsServerProxy from .utils import ( - download_ocrd_all_tool_json, + load_ocrd_all_tool_json, expand_page_ids, generate_id, generate_workflow_content, @@ -90,8 +90,8 @@ def __init__(self, config_path: str, host: str, port: int) -> None: log_file = 
get_processing_server_logging_file_path(pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") - self.log.info(f"Downloading ocrd all tool json") - self.ocrd_all_tool_json = download_ocrd_all_tool_json(ocrd_all_url=OCRD_ALL_JSON_TOOLS_URL) + self.log.info(f"Loading ocrd all tool json") + self.ocrd_all_tool_json = load_ocrd_all_tool_json() self.hostname = host self.port = port diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index e7a07fa9d9..a2f563de43 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -3,6 +3,7 @@ from fastapi import UploadFile from functools import wraps from hashlib import md5 +from json import loads from pathlib import Path from re import compile as re_compile, split as re_split from requests import get as requests_get, Session as Session_TCP @@ -14,7 +15,8 @@ from ocrd.resolver import Resolver from ocrd.workspace import Workspace from ocrd.mets_server import MpxReq -from ocrd_utils import config, generate_range, REGEX_PREFIX, safe_filename, getLogger +from ocrd_utils import config, generate_range, REGEX_PREFIX, safe_filename, getLogger, resource_string +from .constants import OCRD_ALL_TOOL_JSON from .rabbitmq_utils import OcrdResultMessage @@ -92,14 +94,12 @@ def is_url_responsive(url: str, tries: int = 1, wait_time: int = 3) -> bool: return False -def download_ocrd_all_tool_json(ocrd_all_url: str): - if not ocrd_all_url: - raise ValueError(f"The URL of ocrd all tool json is empty") - headers = {"Accept": "application/json"} - response = Session_TCP().get(ocrd_all_url, headers=headers) - if not response.status_code == 200: - raise ValueError(f"Failed to download ocrd all tool json from: '{ocrd_all_url}'") - return response.json() +def load_ocrd_all_tool_json(): + try: + ocrd_all_tool_json = loads(resource_string('ocrd', OCRD_ALL_TOOL_JSON)) + except Exception as error: + raise ValueError(f"Failed to load ocrd all tool json from: '{OCRD_ALL_TOOL_JSON}', {error}") + 
return ocrd_all_tool_json def post_to_callback_url(logger, callback_url: str, result_message: OcrdResultMessage): diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index d03c2a920c..2055758a89 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -169,6 +169,7 @@ disableLogging, getLevelName, getLogger, + get_logging_config_files, initLogging, setOverrideLogLevel, ) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 5c99800cc8..b3a3e9537d 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -178,12 +178,12 @@ def _ocrd_download_timeout_parser(val): default=(True, lambda: Path.home())) config.add("XDG_DATA_HOME", - description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)", + description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)", parser=lambda val: Path(val), default=(True, lambda: Path(config.HOME, '.local/share'))) config.add("XDG_CONFIG_HOME", - description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)", + description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)", parser=lambda val: Path(val), default=(True, lambda: Path(config.HOME, '.config'))) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 6245f99b76..bb771fc0ce 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -128,6 +128,19 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG): print(f'[LOGGING] Overriding ocrd log level to {lvl}', file=sys.stderr) ocrd_logger.setLevel(lvl) +def get_logging_config_files(): + """ + Return a list of all ``ocrd_logging.conf`` files found in CWD, HOME or /etc. 
+ """ + CONFIG_PATHS = [ + Path.cwd(), + Path.home(), + Path('/etc'), + ] + return [f for f \ + in [p / 'ocrd_logging.conf' for p in CONFIG_PATHS] \ + if f.exists()] + def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_LOGGING_DEBUG): """ Reset ``ocrd`` logger, read logging configuration if exists, otherwise use basicConfig @@ -164,14 +177,7 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L config_file = None if not builtin_only: - CONFIG_PATHS = [ - Path.cwd(), - Path.home(), - Path('/etc'), - ] - config_file = [f for f \ - in [p / 'ocrd_logging.conf' for p in CONFIG_PATHS] \ - if f.exists()] + config_file = get_logging_config_files() if config_file: if len(config_file) > 1 and not silent: print(f"[LOGGING] Multiple logging configuration files found at {config_file}, using first one", file=sys.stderr) diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 93e311a882..5cf161398e 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -11,7 +11,7 @@ # each logger requires a corresponding configuration section below # [loggers] -keys=root,ocrd,ocrd_network,ocrd_tensorflow,ocrd_shapely_geos,ocrd_PIL,uvicorn,uvicorn_access,uvicorn_error,multipart +keys=root,ocrd,ocrd_network,ocrd_tensorflow,ocrd_shapely_geos,ocrd_PIL,uvicorn,uvicorn_access,uvicorn_error,multipart,paramiko,paramiko_transport # # mandatory handlers section @@ -91,6 +91,21 @@ level=INFO handlers=consoleHandler qualname=PIL +# +# paramiko loggers +# +[logger_paramiko] +level=INFO +handlers=consoleHandler +qualname=paramiko +propagate=0 + +[logger_paramiko_transport] +level=INFO +handlers=consoleHandler +qualname=paramiko.transport +propagate=0 + # # uvicorn loggers # diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py index 1b3ab4e73d..18463de0c0 100644 --- a/src/ocrd_utils/os.py +++ b/src/ocrd_utils/os.py @@ -34,6 +34,7 @@ from .constants import EXT_TO_MIME from .config import 
config from .logging import getLogger +from .introspect import resource_string def abspath(url): """ @@ -79,12 +80,16 @@ def get_ocrd_tool_json(executable): """ Get the ``ocrd-tool`` description of ``executable``. """ + ocrd_tool = {} executable_name = Path(executable).name try: - ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) - except (JSONDecodeError, OSError) as e: - getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') - ocrd_tool = {} + ocrd_all_tool = loads(resource_string('ocrd', 'ocrd-all-tool.json')) + ocrd_tool = ocrd_all_tool[executable] + except (JSONDecodeError, OSError, KeyError): + try: + ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) + except (JSONDecodeError, OSError) as e: + getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') if 'resource_locations' not in ocrd_tool: ocrd_tool['resource_locations'] = ['data', 'cwd', 'system', 'module'] return ocrd_tool @@ -93,9 +98,13 @@ def get_ocrd_tool_json(executable): def get_moduledir(executable): moduledir = None try: - moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n') - except (JSONDecodeError, OSError) as e: - getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') + ocrd_all_moduledir = loads(resource_string('ocrd', 'ocrd-all-module-dir.json')) + moduledir = ocrd_all_moduledir[executable] + except (JSONDecodeError, OSError, KeyError): + try: + moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n') + except (JSONDecodeError, OSError) as e: + getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') return moduledir def list_resource_candidates(executable, fname, cwd=getcwd(), moduled=None, xdg_data_home=None): diff --git a/tests/cli/test_process.py b/tests/cli/test_process.py index 
d0e5dc2129..d123e857bd 100644 --- a/tests/cli/test_process.py +++ b/tests/cli/test_process.py @@ -1,16 +1,61 @@ +from os.path import exists +from os import remove, getcwd +from time import sleep +from contextlib import ExitStack +from multiprocessing import Process, set_start_method +# necessary for macos +from sys import platform +if platform == "darwin": + set_start_method("fork") + +from ocrd import Resolver, Workspace, OcrdMetsServer from ocrd.cli import process_cli -from ocrd_utils import pushd_popd, disableLogging +from ocrd_utils import pushd_popd from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory -class TestLogCli(TestCase): +class TestCli(TestCase): + + def setUp(self): + super().setUp() + # make sure we get an isolated temporary copy of the testdata each time + # as long as we are not using pytest but unittest, we need to manage contexts + # (enterContext is only supported starting with py311) + with ExitStack() as stack: + self.workdir = stack.enter_context(copy_of_directory(assets.path_to('kant_aufklaerung_1784/data'))) + stack.enter_context(pushd_popd(self.workdir)) + self.addCleanup(stack.pop_all().close) def test_cli_process_smoke(self): - disableLogging() - with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: - with pushd_popd(wsdir): - with self.assertRaisesRegex(Exception, "Executable not found in PATH: ocrd-foo"): - self.invoke_cli(process_cli, ['foo']) + with self.assertRaisesRegex(Exception, "Executable not found in PATH: ocrd-foo"): + self.invoke_cli(process_cli, ['foo']) + + def test_cli_process_dummy(self): + code, out, err = self.invoke_cli(process_cli, ['dummy -I OCR-D-GT-PAGE -O OCR-D-DUMMY']) + print(code, out, err) + self.assertFalse(code) + self.assertTrue(exists('OCR-D-DUMMY')) + + def test_cli_process_mets_server(self): + # stolen from test_mets_server.fixture_start_mets_server ... 
+ def _start_mets_server(*args, **kwargs): + mets_server = OcrdMetsServer(*args, **kwargs) + mets_server.startup() + if exists('mets.sock'): + remove('mets.sock') + ws = Workspace(Resolver(), getcwd()) + p = Process(target=_start_mets_server, kwargs={'workspace': ws, 'url': 'mets.sock'}) + p.daemon = True + p.start() + sleep(1) # sleep to start up server + self.assertTrue(exists('mets.sock')) + code, out, err = self.invoke_cli(process_cli, ['-U', 'mets.sock', 'dummy -I OCR-D-GT-PAGE -O OCR-D-DUMMY']) + print(code, out, err) + self.assertFalse(code) + self.assertTrue(exists('OCR-D-DUMMY')) + p.terminate() + ws.reload_mets() + self.assertIn('OCR-D-DUMMY', ws.mets.file_groups) if __name__ == '__main__': main(__file__) diff --git a/tests/cli/test_resmgr.py b/tests/cli/test_resmgr.py index 6cec6225b8..9c52100cba 100644 --- a/tests/cli/test_resmgr.py +++ b/tests/cli/test_resmgr.py @@ -21,7 +21,7 @@ def test_url_tool_name_unregistered(mgr_with_tmp_path): """ We should add a test for the -n URL TOOL NAME use-case as well (both as an unregistered resource and as URL-override). 
""" - tmp_path, mgr, env = mgr_with_tmp_path + _, mgr, env = mgr_with_tmp_path print(mgr.list_installed(executable)[0][1]) rsrcs_before = len(mgr.list_installed(executable)[0][1]) @@ -33,23 +33,28 @@ def test_url_tool_name_unregistered(mgr_with_tmp_path): rsrcs = mgr.list_installed(executable)[0][1] assert len(rsrcs) == rsrcs_before + 1 - assert rsrcs[0]['name'] == name - assert rsrcs[0]['url'] == url + assert rsrcs[-1]['name'] == name + assert rsrcs[-1]['url'] == url # add resource with different URL but same name url2 = url.replace('dzo', 'bos') - r = runner.invoke(resmgr_cli, ['download', '--allow-uninstalled', '--any-url', url2, executable, name], env=env) - assert 'already exists but --overwrite is not set' in r.output + # + # TODO(kba): Silently skipped since https://github.com/OCR-D/core/commit/d5173ada7d052c107c04da8732ccd30f61c4d9a1 + # so we'd need to check the log output which is not captured by + # CliRunner, even though `mix_stderr == True` + # + # r = runner.invoke(resmgr_cli, ['download', '--allow-uninstalled', '--any-url', url2, executable, name], env=env) + # assert 'already exists but --overwrite is not set' in r.output r = runner.invoke(resmgr_cli, ['download', '--overwrite', '--allow-uninstalled', '--any-url', url2, executable, name], env=env) - assert 'already exists but --overwrite is not set' not in r.output + # assert 'already exists but --overwrite is not set' not in r.output mgr.load_resource_list(mgr.user_list) rsrcs = mgr.list_installed(executable)[0][1] print(rsrcs) assert len(rsrcs) == rsrcs_before + 1 - assert rsrcs[0]['name'] == name - assert rsrcs[0]['url'] == url2 + assert rsrcs[-1]['name'] == name + assert rsrcs[-1]['url'] == url2 def test_directory_copy(mgr_with_tmp_path): """ @@ -76,13 +81,18 @@ def test_directory_copy(mgr_with_tmp_path): assert Path(mgr_path / 'ocrd-resources' / proc).exists() assert directory_size(mgr_path / 'ocrd-resources' / proc / res_name) == 30 - r = runner.invoke( - resmgr_cli, - ['download', 
'--allow-uninstalled', '--any-url', tmp_path, proc, res_name], - env=env, - catch_exceptions=False - ) - assert 'already exists but --overwrite is not set' in r.output + # + # TODO(kba): Silently skipped since https://github.com/OCR-D/core/commit/d5173ada7d052c107c04da8732ccd30f61c4d9a1 + # so we'd need to check the log output which is not captured by + # CliRunner, even though `mix_stderr == True` + # + # r = runner.invoke( + # resmgr_cli, + # ['download', '--allow-uninstalled', '--any-url', tmp_path, proc, res_name], + # env=env, + # catch_exceptions=False + # ) + # assert 'already exists but --overwrite is not set' in r.output r = runner.invoke( resmgr_cli, ['download', '--overwrite', '--allow-uninstalled', '--any-url', tmp_path, proc, res_name], diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 3bf342b8ef..739db7625a 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -61,10 +61,11 @@ def test_unique_identifier_from_nothing(): def test_str(): - mets = OcrdMets(content='', cache_flag=False) - assert str(mets) == 'OcrdMets[cached=False,fileGrps=[],files=[]]' - mets_cached = OcrdMets(content='', cache_flag=True) - assert str(mets_cached) == 'OcrdMets[cached=True,fileGrps=[],files=[]]' + with temp_env_var('OCRD_METS_CACHING', None): + mets = OcrdMets(content='', cache_flag=False) + assert str(mets) == 'OcrdMets[cached=False,fileGrps=[],files=[]]' + mets_cached = OcrdMets(content='', cache_flag=True) + assert str(mets_cached) == 'OcrdMets[cached=True,fileGrps=[],files=[]]' def test_file_groups(sbb_sample_01): @@ -383,16 +384,20 @@ def test_invalid_filegrp(): @contextmanager def temp_env_var(k, v): v_before = environ.get(k, None) - environ[k] = v + if v == None: + environ.pop(k, None) + else: + environ[k] = v yield if v_before is not None: environ[k] = v_before else: - del environ[k] + environ.pop(k, None) def test_envvar(): - assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), 
cache_flag=True)._cache_flag - assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag + with temp_env_var('OCRD_METS_CACHING', None): + assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag + assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag with temp_env_var('OCRD_METS_CACHING', 'true'): assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag diff --git a/tests/network/docker-compose.yml b/tests/network/docker-compose.yml index a5cef49e23..ec45bc7e8a 100644 --- a/tests/network/docker-compose.yml +++ b/tests/network/docker-compose.yml @@ -22,7 +22,7 @@ services: test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet interval: 1s timeout: 3s - retries: 30 + retries: 90 ocrd_network_rabbit_mq: image: "rabbitmq:3.12-management" @@ -42,7 +42,7 @@ services: test: rabbitmq-diagnostics check_port_connectivity interval: 1s timeout: 3s - retries: 30 + retries: 90 ocrd_network_processing_server: image: "ocrd_core_test" diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 3c38f20789..5ab2880053 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -15,7 +15,7 @@ ocrd_loglevel, ocrd_cli_wrap_processor, ) # pylint: disable=protected-access -from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging +from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files @click.command() @ocrd_cli_options @@ -60,6 +60,8 @@ def test_loglevel_invalid(self): assert "'foo' is not one of" in err def test_loglevel_override(self): + if get_logging_config_files(): + pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, 
skipping logging test") import logging disableLogging() assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index b1350ed663..1487617a71 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -6,8 +6,11 @@ from itertools import repeat from multiprocessing import Process, Pool, Pipe, set_start_method -# necessary for macos -set_start_method("fork") +try: + # necessary for macos + set_start_method("fork") +except RuntimeError: + pass from shutil import rmtree, copytree from os import remove, stat as os_stat from os.path import exists diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 7e102612e1..c2575b6086 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -117,6 +117,9 @@ def test_workspace_from_url_kant_with_resources(mock_request, tmp_path): @patch.object(Session, "get") def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path): + """ + Fail with clobber_mets=False, succeeed with clobber_mets=True + """ # arrange url_src = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml' @@ -127,12 +130,14 @@ def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp dst_mets = Path(dst_dir, 'mets.xml') shutil.copyfile(src_mets, dst_mets) - # act - Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir) + # fail + with pytest.raises(FileExistsError) as exc: + Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir) + assert mock_request.call_count == 0 - # assert - # no real request was made, since mets already present - assert mock_request.call_count == 0 + # succeed + Resolver().workspace_from_url(url_src, clobber_mets=True, dst_dir=dst_dir) + assert mock_request.call_count == 1 @patch.object(Session, "get") @@ -229,7 +234,7 @@ def test_workspace_from_nothing_noclobber(tmp_path): ws2 = 
Resolver().workspace_from_nothing(tmp_path) assert ws2.directory == tmp_path - with pytest.raises(Exception) as exc: + with pytest.raises(FileExistsError) as exc: Resolver().workspace_from_nothing(tmp_path) # assert diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py index 221b0a3af1..653167e10a 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -1,9 +1,13 @@ import os from pathlib import Path +from shutil import rmtree # import pdb -from ocrd.resource_manager import OcrdResourceManager -from ocrd_utils import config +# avoid importing early (before mocking environ) +# because ocrd_utils.config module-level init is crucial: +#from ocrd.resource_manager import OcrdResourceManager +#from ocrd_utils import config + from ocrd_utils.os import get_ocrd_tool_json from pytest import raises, fixture @@ -12,25 +16,31 @@ CONST_RESOURCE_YML = 'resources.yml' CONST_RESOURCE_URL_LAYOUT = 'https://github.com/tesseract-ocr/tessdata_best/raw/main/bos.traineddata' + @fixture(autouse=True) def drop_get_ocrd_tool_json_cache(): get_ocrd_tool_json.cache_clear() yield + def test_resources_manager_config_default(monkeypatch, tmp_path): # arrange monkeypatch.setenv('HOME', str(tmp_path)) if 'XDG_CONFIG_HOME' in os.environ: monkeypatch.delenv('XDG_CONFIG_HOME', raising=False) + if 'XDG_DATA_HOME' in os.environ: + monkeypatch.delenv('XDG_DATA_HOME', raising=False) # act + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager() # assert default_config_dir = os.path.join(os.environ['HOME'], '.config', 'ocrd') f = Path(default_config_dir) / CONST_RESOURCE_YML assert os.environ['HOME'] == str(tmp_path) + from ocrd_utils import config assert config.HOME == tmp_path assert Path.home() == tmp_path assert f == mgr.user_list @@ -54,6 +64,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch): monkeypatch.setenv('HOME', str(tmp_path)) # act + from ocrd.resource_manager import OcrdResourceManager mgr = 
OcrdResourceManager() # assert @@ -72,6 +83,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch): def test_resources_manager_config_explicite(tmp_path): # act + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_config_home=str(tmp_path / 'config'), xdg_data_home=str(tmp_path / 'data')) # assert @@ -85,9 +97,11 @@ def test_resources_manager_config_explicite(tmp_path): assert fpath.exists() assert mgr.add_to_user_database(proc, fpath) + def test_resources_manager_config_explicit_invalid(tmp_path): # act + from ocrd.resource_manager import OcrdResourceManager (tmp_path / 'ocrd').mkdir() (tmp_path / 'ocrd' / CONST_RESOURCE_YML).write_text('::INVALID::') @@ -95,9 +109,11 @@ def test_resources_manager_config_explicit_invalid(tmp_path): with raises(ValueError, match='is invalid'): OcrdResourceManager(xdg_config_home=tmp_path) + def test_find_resources(tmp_path): # act + from ocrd.resource_manager import OcrdResourceManager f = tmp_path / 'ocrd-foo' / 'foo.bar' f.parent.mkdir() f.write_text('foobar') @@ -109,29 +125,39 @@ def test_find_resources(tmp_path): assert 'ocrd-foo' in [x for x, _ in mgr.list_available()] assert 'ocrd-foo' in [x for x, _ in mgr.list_available(url='http://foo/bar')] + def test_parameter_usage(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_config_home=tmp_path) assert mgr.parameter_usage('foo.bar') == 'foo.bar' assert mgr.parameter_usage('foo.bar', 'without-extension') == 'foo' with raises(ValueError, match='No such usage'): mgr.parameter_usage('foo.bar', 'baz') + def test_default_resource_dir(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_data_home=tmp_path) assert mgr.xdg_config_home != mgr.xdg_data_home assert mgr.default_resource_dir == str(mgr.xdg_data_home / 'ocrd-resources') + def test_list_available0(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = 
OcrdResourceManager(xdg_data_home=tmp_path) res = mgr.list_available() assert len(res) > 0 + def test_list_available_with_unknown_executable(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_data_home=tmp_path) res = mgr.list_available(executable="ocrd-non-existing-processor") assert len(res[0][1]) == 0 + def test_date_as_string(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_data_home=tmp_path) test_list = tmp_path / 'test-list.yml' with open(test_list, 'w', encoding='utf-8') as fout: @@ -147,7 +173,9 @@ def test_date_as_string(tmp_path): mgr.load_resource_list(test_list) mgr.list_available(executable='ocrd-eynollah-segment') + def test_download_archive(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_data_home=tmp_path) for archive_type in ('.zip', '.tar.gz', '.tar.xz'): mgr.download( @@ -163,5 +191,33 @@ def test_download_archive(tmp_path): assert filecontent_path.read_text() == '1\n' +def test_copy_impl(tmp_path): + from ocrd.resource_manager import OcrdResourceManager + root_dir = f"{tmp_path}/mgr_copy_impl_test" + root_dir_copied = f"{tmp_path}/mgr_copy_impl_test_copied" + rmtree(path=root_dir, ignore_errors=True) + rmtree(path=root_dir_copied, ignore_errors=True) + + def _create_test_folder(test_dir: str, letter: str) -> str: + Path(f"{test_dir}/{letter}").mkdir(parents=True, exist_ok=True) + file_path = f"{test_dir}/{letter}/{letter}.txt" + with open(f"{file_path}", "w") as file: + file.write(f"{letter}") + return file_path + + _create_test_folder(test_dir=root_dir, letter="a") + _create_test_folder(test_dir=root_dir, letter="b") + _create_test_folder(test_dir=root_dir, letter="c") + + mgr = OcrdResourceManager(xdg_data_home=tmp_path) + mgr._copy_impl(src_filename=root_dir, filename=root_dir_copied) + + assert Path(f"{root_dir_copied}/a/a.txt").exists() + assert Path(f"{root_dir_copied}/b/b.txt").exists() + assert 
Path(f"{root_dir_copied}/c/c.txt").exists() + rmtree(path=root_dir, ignore_errors=True) + rmtree(path=root_dir_copied, ignore_errors=True) + + if __name__ == "__main__": main(__file__) diff --git a/tests/utils/test_os.py b/tests/utils/test_os.py index a651ef608f..f6dbde3549 100644 --- a/tests/utils/test_os.py +++ b/tests/utils/test_os.py @@ -11,6 +11,7 @@ redirect_stderr_and_stdout_to_file, guess_media_type, ) +from ocrd_utils import config class TestOsUtils(TestCase): @@ -26,6 +27,8 @@ def tearDown(self): def test_resolve_basic(self): def dehomify(s): + if ENV['HOME'] == '/' or expanduser('~') == '/': + return s return s.replace(ENV['HOME'], '$HOME').replace(expanduser('~'), '$HOME') fname = 'foo.bar' cands = list_resource_candidates('ocrd-dummy', fname) @@ -34,7 +37,7 @@ def dehomify(s): self.assertEqual(cands, [join(x, fname) for x in [ dehomify(join(getcwd())), dehomify(self.tempdir_path), - '$HOME/.local/share/ocrd-resources/ocrd-dummy', + dehomify(join(config.XDG_DATA_HOME, 'ocrd-resources', 'ocrd-dummy')), '/usr/local/share/ocrd-resources/ocrd-dummy', ]])