diff --git a/.pylintrc b/.pylintrc
index b2125d824c..2e3af4288b 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,19 +1,22 @@
 [MASTER]
-extension-pkg-whitelist=lxml
-ignored-modules=cv2,tesserocr,ocrd.model
+extension-pkg-whitelist=lxml,pydantic
+ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds
+ignore-paths=ocrd_page_generateds.py
+ignore-patterns=.*generateds.*
 
 [MESSAGES CONTROL]
-ignore-patterns='.*generateds.*'
 disable =
     fixme,
-    E501,
+    line-too-long,
+    consider-using-f-string,
+    logging-fstring-interpolation,
     trailing-whitespace,
     logging-not-lazy,
     inconsistent-return-statements,
+    disallowed-name,
     invalid-name,
     line-too-long,
     missing-docstring,
-    no-self-use,
     wrong-import-order,
     too-many-nested-blocks,
     superfluous-parens,
@@ -25,13 +28,9 @@ disable =
     ungrouped-imports,
     useless-object-inheritance,
     useless-import-alias,
-    bad-continuation,
     no-else-return,
     logging-not-lazy
 
-[FORMAT]
-no-space-check=empty-line
-
 [DESIGN]
 # Maximum number of arguments for function / method
 max-args=12
@@ -40,7 +39,7 @@ max-locals=30
 # Maximum number of return / yield for function / method body
 max-returns=12
 # Maximum number of branch for function / method body
-max-branchs=30
+max-branches=30
 # Maximum number of statements in function / method body
 max-statements=60
 # Maximum number of parents for a class (see R0901).
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 351f5a56aa..04ea2d42a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,164 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+## [3.0.0b7] - 2024-11-12
+
+Fixed:
+ - `initLogging`: only add root handler instead of multiple redundant handlers with `propagate=false`
+ - `setOverrideLogLevel`: override all currently active loggers' level
+
+Changed:
+ - :fire: logging: increase default root (not `ocrd`) level from `INFO` to `WARNING`
+ - :fire: `initLogging`: do not remove any previous handlers/levels, unless `force_reinit`
+ - :fire: `disableLogging`: remove all handlers, reset all levels - instead of being selective
+ - :fire: Processor: replace `weakref` with `__del__` to trigger `shutdown`
+ - :fire: `OCRD_MAX_PARALLEL_PAGES>1`: log via `QueueHandler` in subprocess, `QueueListener` in main
+
+## [3.0.0b6] - 2024-10-30
+
+Fixed:
+ - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds`
+
+Changed:
+ - :fire: `ocrd_utils.initLogging`: also add handler to root logger (as in file config),
+   but disable message propagation to avoid duplication
+ - only import `ocrd_network` in `src/ocrd/decorators/__init__.py` once needed
+ - `Processor.process_page_file`: skip computing `process_page_pcgts` if output already exists,
+   but `OCRD_EXISTING_OUTPUT!=OVERWRITE`
+ - :fire: `OCRD_MAX_PARALLEL_PAGES>1`: switch from multithreading to multiprocessing, depend on
+   `loky` instead of stdlib `concurrent.futures`
+ - `OCRD_PROCESSING_PAGE_TIMEOUT>0`: actually enforce timeout within worker
+ - `OCRD_MAX_MISSING_OUTPUTS>0`: abort early if too many failures already, prospectively
+ - `Processor.process_workspace`: split up into overridable sub-methods:
+   - `process_workspace_submit_tasks` (iterate input file group and schedule page tasks)
+   - `process_workspace_submit_page_task` (download input files and submit single page task)
+   - `process_workspace_handle_tasks` (monitor page tasks and aggregate results)
+   - `process_workspace_handle_page_task` (await single page task and handle errors)
+
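The net effect of the 3.0.0b6/3.0.0b7 logging changes can be summarised in a few calls — a minimal sketch using only functions from `ocrd_utils`, with the levels as documented in the entries above:

```python
from ocrd_utils import initLogging, setOverrideLogLevel, disableLogging, getLogger

initLogging()                 # one handler on the root logger; root at WARNING, 'ocrd.*' at INFO
log = getLogger('ocrd.example')
log.info("shown: 'ocrd' loggers default to INFO")
setOverrideLogLevel('DEBUG')  # overrides the level of all currently active loggers
disableLogging()              # removes all handlers and resets all levels
```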
+## [3.0.0b5] - 2024-09-16
+
+Fixed:
+ - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally
+ - `OcrdMetsServer.add_file`: pass on `force` kwarg
+ - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup`
+ - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url`
+ - `ocrd.cli.bashlib "input-files"`: pass on `--mets-server-url`
+ - `lib.bash input-files`: pass on `--mets-server-url`, `--overwrite`, and parameters
+ - `lib.bash`: fix `errexit` handling
+ - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result
+
+Changed:
+ - :fire: `Processor` / `Workspace.add_file`: always `force` if `OCRD_EXISTING_OUTPUT==OVERWRITE`
+ - :fire: `Processor.verify`: revert 3.0.0b1 enforcing cardinality checks (stay backwards compatible)
+ - :fire: `Processor.verify`: check output fileGrps, too
+   (must not exist unless `OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP` or disjoint `--page-id` range)
+ - lib.bash `input-files`: do not try to validate tasks here (now covered by `Processor.verify()`)
+ - `run_processor`: be robust if `ocrd_tool` is missing `steps`
+ - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_`
+
+Added:
+ - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict
+ - ocrd.cli.workspace `server`: add subcommands `reload` and `save`
+ - METS Server: export and delegate `physical_pages`
+ - processor CLI: delegate `--resolve-resource`, too
+ - `Processor.process_page_file` / `OcrdPageResultImage`: allow `None` besides `AlternativeImageType`
+
+## [3.0.0b4] - 2024-09-02
+
+Fixed:
+
+ * `Processor.metadata_location`: `src` workaround respects namespace packages, qurator-spk/eynollah#134
+ * `Workspace.reload_mets`: handle ClientSideOcrdMets as well
+
+## [3.0.0b3] - 2024-08-30
+
+Added:
+
+ * `OcrdConfig.reset_defaults` to reset config variables to their defaults
+
+## [3.0.0b2] - 2024-08-30
+
+Added:
+ - `Processor.max_workers`: class attribute to control per-page parallelism of this implementation
+ - `Processor.max_page_seconds`: class attribute to control per-page timeout of this implementation
+ - `OCRD_MAX_PARALLEL_PAGES` for whether and how many workers should process pages in parallel
+ - `OCRD_PROCESSING_PAGE_TIMEOUT` for whether and how long processors should wait for single pages
+ - `OCRD_MAX_MISSING_OUTPUTS` for maximum rate (fraction) of pages before making `OCRD_MISSING_OUTPUT=abort`
+
+Fixed:
+ - `disableLogging`: also re-instate root logger to Python defaults
+
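These limits have both a global, operator-facing side (the `OCRD_*` variables) and a per-implementation side (the new class attributes). A minimal sketch of the latter, with a hypothetical processor and the attribute semantics as described above:

```python
from ocrd import Processor

class MyProcessor(Processor):  # hypothetical implementation
    max_workers = 2          # never run more than 2 page workers, however high OCRD_MAX_PARALLEL_PAGES is
    max_page_seconds = 300   # per-page timeout of this implementation, cf. OCRD_PROCESSING_PAGE_TIMEOUT
    max_instances = 1        # cap instance caching in workers, cf. OCRD_MAX_PROCESSOR_CACHE (from 3.0.0b1)
```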
+## [3.0.0b1] - 2024-08-26
+
+Fixed:
+ - actually apply CLI `--log-filename`, and show in `--help`
+ - adapt to Pillow changes
+ - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering)
+
+Changed:
+ - :fire: `ocrd_utils`, `ocrd_models`, `ocrd_modelfactory`, `ocrd_validators` and `ocrd_network` are not published
+   as separate packages anymore, everything is contained in `ocrd` - you should adapt your `requirements.txt` accordingly
+ - :fire: `Processor.parameter` now a property (attribute always exists, but `None` for non-processing contexts)
+ - :fire: `Processor.parameter` is now a `frozendict` (contents immutable)
+ - :fire: `Processor.parameter` validate when(ever) set instead of (just) the constructor
+ - setting `Processor.parameter` will also trigger (`Processor.shutdown()` and) `Processor.setup()`
+ - `get_processor(... instance_caching=True)`: use `min(max_instances, OCRD_MAX_PROCESSOR_CACHE)`
+ - :fire: `Processor.verify` always validates fileGrp cardinalities (because we have `ocrd-tool.json` defaults now)
+ - :fire: `OcrdMets.add_agent` without positional arguments
+ - `ocrd bashlib input-files` now uses normal Processor decorator, and gets passed actual `ocrd-tool.json` and tool name
+   from bashlib's `ocrd__wrap`
+
+Added:
+ - `Processor.metadata_filename`: expose to make local path of `ocrd-tool.json` in Python distribution reusable+overridable
+ - `Processor.metadata_location`: expose to make absolute path of `ocrd-tool.json` reusable+overridable
+ - `Processor.metadata_rawdict`: expose to make in-memory contents of `ocrd-tool.json` reusable+overridable
+ - `Processor.metadata`: expose to make validated and default-expanded contents of `ocrd-tool.json` reusable+overridable
+ - `Processor.shutdown`: to shut down processor after processing, optional
+ - `Processor.max_instances`: class attribute to control instance caching of this implementation
+
+## [3.0.0a2] - 2024-08-22
+
+Changed:
+ - :fire: `OcrdPage` as proxy of `PcGtsType` instead of alias; also contains `etree` and `mapping` now
+ - :fire: `page_from_file`: removed kwarg `with_tree` - use `OcrdPage.etree` and `OcrdPage.mapping` instead
+ - :fire: `Processor.zip_input_files` now can throw `ocrd.NonUniqueInputFile` and `ocrd.MissingInputFile`
+   (the latter only if `OCRD_MISSING_INPUT=ABORT`)
+ - :fire: `Processor.zip_input_files` does not by default use `require_first` anymore
+   (so the first file in any input file tuple per page can be `None` as well)
+ - :fire: no more `Workspace.overwrite_mode`, merely delegate to `OCRD_EXISTING_OUTPUT=OVERWRITE`
+ - :art: improve on docs result for `ocrd_utils.config`
+
+Added:
+ - :point_right: `OCRD_DOWNLOAD_INPUT` for whether input files should be downloaded before processing
+ - :point_right: `OCRD_MISSING_INPUT` for how to handle missing input files (**`SKIP`** or `ABORT`)
+ - :point_right: `OCRD_MISSING_OUTPUT` for how to handle processing failures (**`SKIP`** or `ABORT` or `COPY`)
+   the latter behaves like ocrd-dummy for the failed page(s)
+ - :point_right: `OCRD_EXISTING_OUTPUT` for how to handle existing output files (**`SKIP`** or `ABORT` or `OVERWRITE`)
+ - new CLI option `--debug` as short-hand for `ABORT` choices above
+ - `Processor.logger` set up by constructor already (for re-use by processor implementors)
+ - `default`-expand and validate `ocrd_tool.json` in `Processor` constructor, log invalidities
+ - handle JSON `deprecation` in `ocrd_tool.json` by reporting warnings
+
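The `OcrdPage` proxy change means the parsed XML and the object model now travel together. A sketch, assuming `input_file` is a PAGE-XML `OcrdFile`; the exact `mapping` semantics (generateds object to XML node, keyed by `id()`) are inferred from the former `with_tree` return values:

```python
from ocrd_modelfactory import page_from_file

pcgts = page_from_file(input_file)  # returns the OcrdPage proxy (no with_tree kwarg anymore)
page = pcgts.get_Page()             # the PcGtsType API is still available through the proxy
tree = pcgts.etree                  # the parsed XML tree behind the object model
node = pcgts.mapping[id(page)]      # assumed: look up the XML node for a generateds object
```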
+## [3.0.0a1] - 2024-08-15
+
+Changed:
+ - :fire: Deprecate `Processor.process`
+ - update spec to v3.25.0, which requires annotating fileGrp cardinality in `ocrd-tool.json`
+ - :fire: Remove passing non-processing kwargs to `Processor` constructor, add as members
+   (i.e. `show_help`, `dump_json`, `dump_module_dir`, `list_resources`, `show_resource`, `resolve_resource`)
+ - :fire: Deprecate passing processing arg / kwargs to `Processor` constructor
+   (i.e. `workspace`, `page_id`, `input_file_grp`, `output_file_grp`; now all set by `run_processor`)
+ - :fire: Deprecate passing `ocrd-tool.json` metadata to `Processor` constructor
+ - `ocrd.processor`: Handle loading of bundled `ocrd-tool.json` generically
+
+Added:
+ - `Processor.process_workspace`: process a complete workspace, with default implementation
+ - `Processor.process_page_file`: process an OcrdFile, with default implementation
+ - `Processor.process_page_pcgts`: process a single OcrdPage, produce a single OcrdPage, required to implement
+ - `Processor.verify`: handle fileGrp cardinality verification, with default implementation
+ - `Processor.setup`: to set up processor before processing, optional
+
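Taken together, these entries define the shape of a v3 processor: only `process_page_pcgts` must be implemented, everything else has defaults. A minimal sketch with a hypothetical tool name and model-loading helper, the signature following the entries above:

```python
from typing import Optional
from ocrd import Processor, OcrdPage, OcrdPageResult

class MyOcrProcessor(Processor):  # hypothetical example
    @property
    def executable(self):
        return 'ocrd-my-ocr'  # used to look up the tool in the bundled ocrd-tool.json

    def setup(self):
        # optional: runs once before processing, e.g. to load models
        self.model = load_model(self.parameter['model'])  # hypothetical helper

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage],
                           page_id: Optional[str] = None) -> OcrdPageResult:
        pcgts = input_pcgts[0]
        # ... annotate recognition results into pcgts here ...
        return OcrdPageResult(pcgts)
```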
 ## [2.68.0] - 2024-08-23
 
 Changed:
@@ -2164,6 +2322,14 @@ Fixed
 
 Initial Release
 
+[3.0.0b6]: ../../compare/v3.0.0b6..v3.0.0b5
+[3.0.0b5]: ../../compare/v3.0.0b5..v3.0.0b4
+[3.0.0b4]: ../../compare/v3.0.0b4..v3.0.0b3
+[3.0.0b3]: ../../compare/v3.0.0b3..v3.0.0b2
+[3.0.0b2]: ../../compare/v3.0.0b2..v3.0.0b1
+[3.0.0b1]: ../../compare/v3.0.0b1..v3.0.0a2
+[3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1
+[3.0.0a1]: ../../compare/v3.0.0a1..v2.67.2
 [2.68.0]: ../../compare/v2.68.0..v2.67.2
 [2.67.2]: ../../compare/v2.67.2..v2.67.1
 [2.67.1]: ../../compare/v2.67.1..v2.67.0
diff --git a/Dockerfile b/Dockerfile
index 144ae774dc..77c24bf77e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -50,9 +50,9 @@ FROM ocrd_core_base as ocrd_core_test
 ARG SKIP_ASSETS
 WORKDIR /build/core
 COPY Makefile .
+COPY .gitmodules .
 RUN if test -z "$SKIP_ASSETS" || test $SKIP_ASSETS -eq 0 ; then make assets ; fi
 COPY tests ./tests
-COPY .gitmodules .
 COPY requirements_test.txt .
 RUN pip install -r requirements_test.txt
 RUN mkdir /ocrd-data && chmod 777 /ocrd-data
diff --git a/Makefile b/Makefile
index 4997066d1b..1a4a6bbdb8 100644
--- a/Makefile
+++ b/Makefile
@@ -238,9 +238,9 @@ repo/assets repo/spec: always-update
 
 .PHONY: spec
 # Copy JSON Schema, OpenAPI from OCR-D/spec
-spec: repo/spec
-	cp repo/spec/ocrd_tool.schema.yml ocrd_validators/ocrd_validators/ocrd_tool.schema.yml
-	cp repo/spec/bagit-profile.yml ocrd_validators/ocrd_validators/bagit-profile.yml
+spec: # repo/spec
+	cp repo/spec/ocrd_tool.schema.yml src/ocrd_validators/ocrd_tool.schema.yml
+	cp repo/spec/bagit-profile.yml src/ocrd_validators/bagit-profile.yml
 
 #
 # Assets
@@ -273,7 +273,7 @@ test-logging: assets
 	cp src/ocrd_utils/ocrd_logging.conf $$tempdir; \
 	cd $$tempdir; \
 	$(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging -k TestDecorators $(TESTDIR); \
-	rm -r $$tempdir/ocrd_logging.conf $$tempdir/.benchmarks; \
+	rm -r $$tempdir/ocrd_logging.conf $$tempdir/ocrd.log $$tempdir/.benchmarks; \
 	rm -rf $$tempdir/.coverage; \
 	rmdir $$tempdir
 
@@ -401,41 +401,3 @@ docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch:
 # Build wheels and source dist and twine upload them
 pypi: build
 	twine upload --verbose dist/ocrd-$(VERSION)*{tar.gz,whl}
-
-pypi-workaround: build-workaround
-	for dist in $(BUILD_ORDER);do twine upload dist/$$dist-$(VERSION)*{tar.gz,whl};done
-
-# Only in place until v3 so we don't break existing installations
-build-workaround: pyclean
-	cp pyproject.toml pyproject.toml.BAK
-	cp src/ocrd_utils/constants.py src/ocrd_utils/constants.py.BAK
-	cp src/ocrd/cli/__init__.py src/ocrd/cli/__init__.py.BAK
-	for dist in $(BUILD_ORDER);do \
-		cat pyproject.toml.BAK | sed "s,^name =.*,name = \"$$dist\"," > pyproject.toml; \
-		cat src/ocrd_utils/constants.py.BAK | sed "s,dist_version('ocrd'),dist_version('$$dist')," > src/ocrd_utils/constants.py; \
-		cat src/ocrd/cli/__init__.py.BAK | sed "s,package_name='ocrd',package_name='$$dist'," > src/ocrd/cli/__init__.py; \
-		$(MAKE) build; \
-	done
-	rm pyproject.toml.BAK
-	rm src/ocrd_utils/constants.py.BAK
-	rm src/ocrd/cli/__init__.py.BAK
-
-# test that the aliased packages work in isolation and combined
-test-workaround: build-workaround
-	$(MAKE) uninstall-workaround
-	for dist in $(BUILD_ORDER);do \
-		pip install dist/$$dist-*.whl ;\
-		ocrd --version ;\
-		make test ;\
-		pip uninstall --yes $$dist ;\
-	done
-	for dist in $(BUILD_ORDER);do \
-		pip install dist/$$dist-*.whl ;\
-	done
-	ocrd --version ;\
-	make test ;\
-	for dist in $(BUILD_ORDER);do pip uninstall --yes $$dist;done
-
-uninstall-workaround:
-	for dist in $(BUILD_ORDER);do $(PIP) uninstall --yes $$dist;done
-
diff --git a/README.md b/README.md
index b401428ee0..d41a2dddb6 100644
--- a/README.md
+++ b/README.md
@@ -47,17 +47,12 @@ complete stack of OCR-D-related software.
 
 The easiest way to install is via `pip`:
 
-```sh
-pip install ocrd
+    pip install ocrd
 
-# or just the functionality you need, e.g.
-
-pip install ocrd_modelfactory
-```
 
 All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher.
 
-**NOTE** Some OCR-D-Tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
+> **NOTE** Some OCR-D tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
 
 * using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes
 * custom Python logging configurations in your personal account
@@ -82,7 +77,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an
 
 Some parts of the software are configured via environment variables:
 
-* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification.
 * `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens:
   * `CPU`: Enable CPU profiling of processor runs
   * `RSS`: Enable RSS memory profiling
@@ -95,18 +89,46 @@ Some parts of the software are configured via environment variables:
 
 * `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`.
 * `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`.
 
-* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files.
+* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files.
 * `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading.
 
+* `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing:
+  * `SKIP`: ignore and proceed with next page's input
+  * `ABORT`: throw `MissingInputFile` exception
+
+* `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing:
+  * `SKIP`: ignore and proceed processing next page
+  * `COPY`: fall back to copying input PAGE to output fileGrp for page
+  * `ABORT`: re-throw whatever caused processing to fail
+
+* `OCRD_MAX_MISSING_OUTPUTS`: Maximum rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).
+
+* `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing:
+  * `SKIP`: ignore and proceed processing next page
+  * `OVERWRITE`: force writing result to output fileGrp for page
+  * `ABORT`: re-throw `FileExistsError` exception
+
+* `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations.
 
 * `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.
 
+* `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation.
+
+* `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set `>0`, when exceeded, the same as `OCRD_MISSING_OUTPUT` applies.
+
 * `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`).
 * `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`).
 * `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`).
 
 * `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started.
 
+* `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again.
+* `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds).
+
+* `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created.
+* `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored.
+
+
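From Python, the same variables are read (and documented) through `ocrd_utils.config`, which is also what renders the descriptions for `ocrd --help`. A small sketch — assuming `reset_defaults`, added in 3.0.0b3, is available on this `config` instance:

```python
from ocrd_utils import config

print(config.describe('OCRD_MISSING_OUTPUT'))  # the same self-documentation used by `ocrd --help`
if config.OCRD_MAX_PARALLEL_PAGES > 1:
    print("page-parallel processing: a METS Server is required for synchronisation")
config.reset_defaults()  # reset all variables to their documented defaults
```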
 
 ## Packages
 
diff --git a/README_bashlib.md b/README_bashlib.md
index 09199468cc..20379c3c92 100644
--- a/README_bashlib.md
+++ b/README_bashlib.md
@@ -21,6 +21,9 @@ For example:
 * [`ocrd__log`](#ocrd__log)
 * [`ocrd__minversion`](#ocrd__minversion)
 * [`ocrd__dumpjson`](#ocrd__dumpjson)
+* [`ocrd__resolve_resource`](#ocrd__resolve_resource)
+* [`ocrd__show_resource`](#ocrd__show_resource)
+* [`ocrd__list_resources`](#ocrd__list_resources)
 * [`ocrd__usage`](#ocrd__usage)
 * [`ocrd__parse_argv`](#ocrd__parse_argv)
 
@@ -56,6 +59,10 @@ export OCRD_TOOL_NAME=ocrd-foo-bar
 
 (Which you automatically get from [`ocrd__wrap`](#ocrd__wrap).)
 
+### `ocrd__resolve_resource`
+
+Output given resource file's path.
+
 ### `ocrd__show_resource`
 
 Output given resource file's content.
@@ -88,6 +95,7 @@ This will be filled by the parser along the following keys:
 - `profile`: whether `--profile` is enabled
 - `profile_file`: the argument of `--profile-file`
 - `log_level`: the argument of `--log-level`
+- `mets_server_url`: the argument of `--mets-server-url`
 - `mets_file`: absolute path of the `--mets` argument
 - `working_dir`: absolute path of the `--working-dir` argument or the parent of `mets_file`
 - `page_id`: the argument of `--page-id`
@@ -95,7 +103,7 @@ This will be filled by the parser along the following keys:
 - `output_file_grp`: the argument of `--output-file-grp`
 
 Moreover, there will be an associative array **`params`**
-with the fully expanded runtime values of the ocrd-tool.json parameters.
+with the fully validated and default-expanded runtime values of the `ocrd-tool.json` parameters.
 
 ### `ocrd__wrap`
 
diff --git a/VERSION b/VERSION
index 0f1ddc8105..1129dfd443 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.68.0
+3.0.0b7
diff --git a/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst b/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst
new file mode 100644
index 0000000000..e13d50e155
--- /dev/null
+++ b/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst
@@ -0,0 +1,7 @@
+ocrd.processor.ocrd\_page\_result module
+========================================
+
+.. automodule:: ocrd.processor.ocrd_page_result
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/api/ocrd/ocrd.processor.rst b/docs/api/ocrd/ocrd.processor.rst
index 801114d2a3..7507d8439b 100644
--- a/docs/api/ocrd/ocrd.processor.rst
+++ b/docs/api/ocrd/ocrd.processor.rst
@@ -22,3 +22,4 @@ Submodules
 
    ocrd.processor.base
    ocrd.processor.helpers
+   ocrd.processor.ocrd_page_result
diff --git a/docs/api/ocrd_network/ocrd_network.deployer.rst b/docs/api/ocrd_network/ocrd_network.deployer.rst
deleted file mode 100644
index 205a331ba2..0000000000
--- a/docs/api/ocrd_network/ocrd_network.deployer.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-ocrd\_network.deployer module
-=============================
-
-.. automodule:: ocrd_network.deployer
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.deployment_utils.rst b/docs/api/ocrd_network/ocrd_network.deployment_utils.rst
deleted file mode 100644
index cc1f315ac5..0000000000
--- a/docs/api/ocrd_network/ocrd_network.deployment_utils.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-ocrd\_network.deployment\_utils module
-======================================
-
-.. automodule:: ocrd_network.deployment_utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.logging.rst b/docs/api/ocrd_network/ocrd_network.logging.rst
deleted file mode 100644
index d2ac721d14..0000000000
--- a/docs/api/ocrd_network/ocrd_network.logging.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-ocrd\_network.logging module
-============================
-
-.. automodule:: ocrd_network.logging
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.logging_utils.rst b/docs/api/ocrd_network/ocrd_network.logging_utils.rst
new file mode 100644
index 0000000000..561ce00193
--- /dev/null
+++ b/docs/api/ocrd_network/ocrd_network.logging_utils.rst
@@ -0,0 +1,7 @@
+ocrd\_network.logging\_utils module
+===================================
+
+.. automodule:: ocrd_network.logging_utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst
new file mode 100644
index 0000000000..e13ff897a9
--- /dev/null
+++ b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst
@@ -0,0 +1,7 @@
+ocrd\_network.rabbitmq\_utils.helpers module
+============================================
+
+.. automodule:: ocrd_network.rabbitmq_utils.helpers
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst
index 36b581a337..63fd6f89aa 100644
--- a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst
+++ b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst
@@ -15,5 +15,6 @@ Submodules
    ocrd_network.rabbitmq_utils.connector
    ocrd_network.rabbitmq_utils.constants
    ocrd_network.rabbitmq_utils.consumer
+   ocrd_network.rabbitmq_utils.helpers
    ocrd_network.rabbitmq_utils.ocrd_messages
    ocrd_network.rabbitmq_utils.publisher
diff --git a/docs/api/ocrd_network/ocrd_network.rst b/docs/api/ocrd_network/ocrd_network.rst
index ae12ae1f5d..4497702751 100644
--- a/docs/api/ocrd_network/ocrd_network.rst
+++ b/docs/api/ocrd_network/ocrd_network.rst
@@ -15,6 +15,7 @@ Subpackages
 
    ocrd_network.cli
    ocrd_network.models
    ocrd_network.rabbitmq_utils
+   ocrd_network.runtime_data
 
 Submodules
 ----------
@@ -25,15 +26,13 @@ Submodules
 
    ocrd_network.client
    ocrd_network.constants
    ocrd_network.database
-   ocrd_network.deployer
-   ocrd_network.deployment_utils
-   ocrd_network.logging
+   ocrd_network.logging_utils
    ocrd_network.param_validators
    ocrd_network.process_helpers
    ocrd_network.processing_server
    ocrd_network.processing_worker
    ocrd_network.processor_server
-   ocrd_network.runtime_data
    ocrd_network.server_cache
    ocrd_network.server_utils
+   ocrd_network.tcp_to_uds_mets_proxy
    ocrd_network.utils
diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst
new file mode 100644
index 0000000000..e56ad31f89
--- /dev/null
+++ b/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst
@@ -0,0 +1,7 @@
+ocrd\_network.runtime\_data.config\_parser module
+=================================================
+
+.. automodule:: ocrd_network.runtime_data.config_parser
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst
new file mode 100644
index 0000000000..2fd62e5ef2
--- /dev/null
+++ b/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst
@@ -0,0 +1,7 @@
+ocrd\_network.runtime\_data.connection\_clients module
+======================================================
+
+.. automodule:: ocrd_network.runtime_data.connection_clients
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst
new file mode 100644
index 0000000000..62abe20db3
--- /dev/null
+++ b/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst
@@ -0,0 +1,7 @@
+ocrd\_network.runtime\_data.deployer module
+===========================================
+
+.. automodule:: ocrd_network.runtime_data.deployer
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst
new file mode 100644
index 0000000000..8f9001c381
--- /dev/null
+++ b/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst
@@ -0,0 +1,7 @@
+ocrd\_network.runtime\_data.hosts module
+========================================
+
+.. automodule:: ocrd_network.runtime_data.hosts
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst
new file mode 100644
index 0000000000..1a597caad1
--- /dev/null
+++ b/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst
@@ -0,0 +1,7 @@
+ocrd\_network.runtime\_data.network\_agents module
+==================================================
+
+.. automodule:: ocrd_network.runtime_data.network_agents
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst
new file mode 100644
index 0000000000..d72e67c9d6
--- /dev/null
+++ b/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst
@@ -0,0 +1,7 @@
+ocrd\_network.runtime\_data.network\_services module
+====================================================
+
+.. automodule:: ocrd_network.runtime_data.network_services
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.rst
index fefa00b492..cdf45f6b6e 100644
--- a/docs/api/ocrd_network/ocrd_network.runtime_data.rst
+++ b/docs/api/ocrd_network/ocrd_network.runtime_data.rst
@@ -1,7 +1,20 @@
-ocrd\_network.runtime\_data module
-==================================
+ocrd\_network.runtime\_data package
+===================================
 
 .. automodule:: ocrd_network.runtime_data
    :members:
    :undoc-members:
    :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   ocrd_network.runtime_data.config_parser
+   ocrd_network.runtime_data.connection_clients
+   ocrd_network.runtime_data.deployer
+   ocrd_network.runtime_data.hosts
+   ocrd_network.runtime_data.network_agents
+   ocrd_network.runtime_data.network_services
diff --git a/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst b/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst
new file mode 100644
index 0000000000..fa6e607f94
--- /dev/null
+++ b/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst
@@ -0,0 +1,7 @@
+ocrd\_network.tcp\_to\_uds\_mets\_proxy module
+==============================================
+
+.. automodule:: ocrd_network.tcp_to_uds_mets_proxy
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/conf.py b/docs/conf.py
index 3ab2e1826f..917c5c62ca 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -15,7 +15,7 @@
 # import os
 # import sys
 # # sys.path.insert(0, os.path.abspath('..'))
-with open('VERSION', encoding='utf-8') as f:
+with open('../VERSION', encoding='utf-8') as f:
     VERSION = f.read()
 
 
@@ -72,7 +72,7 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path .
-exclude_patterns = [u'build', 'Thumbs.db', '.DS_Store', 'src', 'venv']
+exclude_patterns = [u'build', 'Thumbs.db', '.DS_Store', 'tests', 'venv']
 
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = 'sphinx'
diff --git a/docs/index.rst b/docs/index.rst
index 96a4e98360..67bba66fe0 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -7,9 +7,10 @@ OCR-D/core
 
    ocrd
    ocrd_utils
+   ocrd_modelfactory
    ocrd_models
    ocrd_validators
-   ocrd_modelfactory
+   ocrd_network
 
 
 Indices and tables
diff --git a/requirements.txt b/requirements.txt
index ed5fd56d59..05d4e9aa44 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,8 @@ gdown
 httpx>=0.22.0
 importlib_metadata ; python_version < '3.8'
 importlib_resources ; python_version < '3.10'
-jsonschema
+jsonschema>=4
+loky
 lxml
 memory-profiler >= 0.58.0
 # XXX explicitly do not restrict the numpy version because different
diff --git a/requirements_test.txt b/requirements_test.txt
index d8cef1dae7..a6a87918fc 100644
--- a/requirements_test.txt
+++ b/requirements_test.txt
@@ -3,6 +3,7 @@ cryptography < 43.0.0
 pytest >= 4.0.0
 generateDS == 2.35.20
 pytest-benchmark >= 3.2.3
+pytest-timeout
 coverage >= 4.5.2
 sphinx
 sphinx_click
diff --git a/src/ocrd/__init__.py b/src/ocrd/__init__.py
index 62b6ffbc0a..e4c782685b 100644
--- a/src/ocrd/__init__.py
+++ b/src/ocrd/__init__.py
@@ -14,8 +14,9 @@
 """
-from ocrd.processor.base import run_processor, run_cli, Processor
-from ocrd_models import OcrdMets, OcrdExif, OcrdFile, OcrdAgent
+from ocrd.processor.base import run_processor, run_cli, Processor, ResourceNotFoundError
+from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage
+from ocrd_models import OcrdMets, OcrdPage, OcrdExif, OcrdFile, OcrdAgent
 from ocrd.resolver import Resolver
 from ocrd_validators import *
 from ocrd.workspace import Workspace
diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py
index 70d738f083..9e8a37b8bf 100644
--- a/src/ocrd/cli/__init__.py
+++ b/src/ocrd/cli/__init__.py
@@ -10,6 +10,34 @@
 from ocrd_utils import config
 
+# pylint: disable=wrong-import-position
+
+def command_with_replaced_help(*replacements):
+
+    class CommandWithReplacedHelp(click.Command):
+        def get_help(self, ctx):
+            newhelp = super().get_help(ctx)
+            for replacement in replacements:
+                newhelp = re.sub(*replacement, newhelp)
+            # print(newhelp)
+            return newhelp
+
+    return CommandWithReplacedHelp
+
+# pylint: enable=wrong-import-position
+
+from ..decorators import ocrd_loglevel
+from .ocrd_tool import ocrd_tool_cli
+from .workspace import workspace_cli
+from .process import process_cli
+from .bashlib import bashlib_cli
+from .validate import validate_cli
+from .resmgr import resmgr_cli
+from .zip import zip_cli
+from .log import log_cli
+from .network import network_cli
+
+
 __all__ = ['cli']
 
 _epilog = f"""
@@ -31,6 +59,14 @@
 \b
 {config.describe('OCRD_DOWNLOAD_TIMEOUT')}
 \b
+{config.describe('OCRD_DOWNLOAD_INPUT')}
+\b
+{config.describe('OCRD_MISSING_INPUT', wrap_text=False)}
+\b
+{config.describe('OCRD_MISSING_OUTPUT', wrap_text=False)}
+\b
+{config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False)}
+\b
 {config.describe('OCRD_METS_CACHING')}
 \b
 {config.describe('OCRD_MAX_PROCESSOR_CACHE')}
@@ -58,30 +94,6 @@
 {config.describe('OCRD_LOGGING_DEBUG')}
 """
 
-def command_with_replaced_help(*replacements):
-
-    class CommandWithReplacedHelp(click.Command):
-        def get_help(self, ctx):
-            help = super().get_help(ctx)
-            for replacement in replacements:
-                help = re.sub(*replacement, help)
-            # print(help)
-            return help
-
-    return CommandWithReplacedHelp
-
-from ocrd.cli.ocrd_tool import ocrd_tool_cli
-from ocrd.cli.workspace import workspace_cli
-from ocrd.cli.process import process_cli
-from ocrd.cli.bashlib import bashlib_cli
-from ocrd.cli.validate import validate_cli
-from ocrd.cli.resmgr import resmgr_cli
-from ocrd.decorators import ocrd_loglevel
-from .zip import zip_cli
-from .log import log_cli
-from .network import network_cli
-
-
 @click.group(epilog=_epilog)
 @click.version_option(package_name='ocrd')
 @ocrd_loglevel
diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py
index 1def4638c7..b6817abe91 100644
--- a/src/ocrd/cli/bashlib.py
+++ b/src/ocrd/cli/bashlib.py
@@ -8,7 +8,6 @@
 """
 from __future__ import print_function
 import sys
-from os.path import isfile
 import click
 
 from ocrd.constants import BASHLIB_FILENAME
@@ -20,15 +19,10 @@
 from ocrd.decorators import (
     parameter_option,
     parameter_override_option,
-    ocrd_loglevel
+    ocrd_loglevel,
+    ocrd_cli_wrap_processor
 )
-from ocrd_utils import (
-    is_local_filename,
-    get_local_filename,
-    initLogging,
-    make_file_id
-)
-from ocrd.resolver import Resolver
+from ocrd_utils import make_file_id
 from ocrd.processor import Processor
 
 # ----------------------------------------------------------------------
@@ -79,17 +73,23 @@ def bashlib_constants(name):
     print(val)
 
 @bashlib_cli.command('input-files')
+@click.option('--ocrd-tool', help="path to ocrd-tool.json of processor to feed", default=None)
+@click.option('--executable', help="name of processor executable in ocrd-tool.json", default=None)
 @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME)
-@click.option('-w', '--working-dir', help="Working Directory")
-@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT')
-@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT')
-# repeat some other processor options for convenience (will be ignored here)
+@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server', default=None)
+@click.option('-d', '--working-dir', help="Working Directory")
+@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None)
+@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None)
 @click.option('-g', '--page-id', help="ID(s) of the pages to process")
-@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist")
+@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist\n"
+              "(with '--page-id', remove only those).\n"
+              "Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE")
+@click.option('--debug', is_flag=True, default=False, help="Abort on any errors with full stack trace.\n"
+              "Short-hand for OCRD_MISSING_OUTPUT=ABORT")
 @parameter_option
 @parameter_override_option
 @ocrd_loglevel
-def bashlib_input_files(**kwargs):
+def bashlib_input_files(ocrd_tool, executable, **kwargs):
     """
     List input files for processing
 
@@ -100,29 +100,49 @@ def bashlib_input_files(ocrd_tool, executable, **kwargs):
     (The printing format is one associative array initializer per line.)
     """
-    initLogging()
-    mets = kwargs.pop('mets')
-    working_dir = kwargs.pop('working_dir')
-    if is_local_filename(mets) and not isfile(get_local_filename(mets)):
-        msg = "File does not exist: %s" % mets
-        raise FileNotFoundError(msg)
-    resolver = Resolver()
-    workspace = resolver.workspace_from_url(mets, working_dir)
-    processor = Processor(workspace,
-                          ocrd_tool=None,
-                          page_id=kwargs['page_id'],
-                          input_file_grp=kwargs['input_file_grp'],
-                          output_file_grp=kwargs['output_file_grp'])
-    for input_files in processor.zip_input_files(mimetype=None, on_error='abort'):
-        # ensure all input files exist locally (without persisting them in the METS)
-        # - this mimics the default behaviour of all Pythonic processors
-        input_files = [workspace.download_file(input_file) if input_file else None
-                       for input_file in input_files]
-        for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']:
-            # make this bash-friendly (show initialization for associative array)
-            if len(input_files) > 1:
-                # single quotes allow us to preserve the list value inside the alist
-                print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ')
-            else:
-                print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ')
-        print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp']))
+    class BashlibProcessor(Processor):
+        # go half way of the normal run_processor / process_workspace call tree
+        # by just delegating to process_workspace, overriding process_page_file
+        # to ensure all input files exist locally (without persisting them in the METS)
+        # and print what needs to be acted on in bash-friendly way
+        def process_page_file(self, *input_files):
+            for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']:
+                # make this bash-friendly (show initialization for associative array)
+                if len(input_files) > 1:
+                    # single quotes allow us to preserve the list value inside the alist
+                    value = ' '.join(str(getattr(res, field)) for res in input_files)
+                else:
+                    value = str(getattr(input_files[0], field))
+                print(f"[{field}]='{value}'", end=' ')
+            output_file_id = make_file_id(input_files[0], kwargs['output_file_grp'])
+            print(f"[outputFileId]='{output_file_id}'")
+    if ocrd_tool and executable:
+        class FullBashlibProcessor(BashlibProcessor):
+            @property
+            def metadata_location(self):
+                # needed for metadata loading and validation mechanism
+                return ocrd_tool
+            @property
+            def executable(self):
+                # needed for ocrd_tool lookup
+                return executable
+        processor_class = FullBashlibProcessor
+    else:
+        # we have no true metadata file, so fill in just to make it work
+        class UnknownBashlibProcessor(BashlibProcessor):
+            @property
+            def ocrd_tool(self):
+                # needed to satisfy the validator
+                return {'executable': '',
+                        # required now
+                        'input_file_grp_cardinality': 1,
+                        'output_file_grp_cardinality': 1,
+                        'steps': ['']
+                        }
+            @property
+            def version(self):
+                # needed to satisfy the validator and wrapper
+                return '1.0'
+        processor_class = UnknownBashlibProcessor
+
+    ocrd_cli_wrap_processor(processor_class, **kwargs)
diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py
index 2a7fa99ec9..3ceaba40c5 100644
--- a/src/ocrd/cli/ocrd_tool.py
+++ b/src/ocrd/cli/ocrd_tool.py
@@ -17,7 +17,6 @@
 from ocrd.processor import Processor
 from ocrd_utils import (
     set_json_key_value_overrides,
-    VERSION as OCRD_VERSION,
     parse_json_string_or_file,
     parse_json_string_with_comments as loads
 )
@@ -29,7 +28,29 @@ def __init__(self, filename):
         self.filename = filename
         with codecs.open(filename, encoding='utf-8') as f:
             self.content = f.read()
+            # perhaps the validator should _always_ run (for default expansion)
+            # so validate command only for the report?
             self.json = loads(self.content)
         self.tool_name = ''
+
+        class BashProcessor(Processor):
+            @property
+            def metadata(inner_self): # pylint: disable=no-self-argument,arguments-renamed
+                return self.json
+            @property
+            def executable(inner_self): # pylint: disable=no-self-argument,arguments-renamed
+                return self.tool_name
+            @property
+            def moduledir(inner_self): # pylint: disable=no-self-argument,arguments-renamed
+                return os.path.dirname(self.filename)
+            # set docstrings to empty
+            __doc__ = None
+            # HACK: override the module-level docstring, too
+            getmodule(OcrdToolCtx).__doc__ = None
+            def process(inner_self): # pylint: disable=no-self-argument,arguments-renamed
+                return super()
+
+        self.processor = BashProcessor
 
 pass_ocrd_tool = click.make_pass_decorator(OcrdToolCtx)
 
@@ -98,48 +119,25 @@ def ocrd_tool_tool_description(ctx):
 @ocrd_tool_tool.command('list-resources', help="List tool's file resources")
 @pass_ocrd_tool
 def ocrd_tool_tool_list_resources(ctx):
-    class BashProcessor(Processor):
-        @property
-        def moduledir(self):
-            return os.path.dirname(ctx.filename)
-    BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name],
-                  list_resources=True)
+    ctx.processor(None).list_resources()
 
 @ocrd_tool_tool.command('resolve-resource', help="Get a tool's file resource full path name")
 @click.argument('res_name')
 @pass_ocrd_tool
 def ocrd_tool_tool_resolve_resource(ctx, res_name):
-    class BashProcessor(Processor):
-        @property
-        def moduledir(self):
-            return os.path.dirname(ctx.filename)
-    BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name],
-                  resolve_resource=res_name)
+    print(ctx.processor(None).resolve_resource(res_name))
 
 @ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource")
 @click.argument('res_name')
 @pass_ocrd_tool
 def ocrd_tool_tool_show_resource(ctx, res_name):
-    class BashProcessor(Processor):
-        @property
-        def moduledir(self):
-            return os.path.dirname(ctx.filename)
-    BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name],
-                  show_resource=res_name)
+    ctx.processor(None).show_resource(res_name)
 
 @ocrd_tool_tool.command('help', help="Generate help for processors")
 @click.argument('subcommand', required=False)
 @pass_ocrd_tool
 def ocrd_tool_tool_params_help(ctx, subcommand):
-    class BashProcessor(Processor):
-        # set docstrings to empty
-        __doc__ = None
-        # HACK: override the module-level docstring, too
-        getmodule(OcrdToolCtx).__doc__ = None
-        def process(self):
-            return super()
-    BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name],
-                  show_help=True, subcommand=subcommand)
+    ctx.processor(None).show_help(subcommand=subcommand)
 
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool tool categories
diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py
index b26803d053..a1ec8fafd6 100644
--- a/src/ocrd/cli/validate.py
+++ b/src/ocrd/cli/validate.py
@@ -40,7 +40,7 @@ def validate_cli():
 @click.argument('ocrd_tool', required=False, nargs=1)
 def validate_ocrd_tool(ocrd_tool):
     '''
-    Validate OCRD_TOOL as an ocrd-tool.json file.
+    Validate OCRD_TOOL as an `ocrd-tool.json` file.
     '''
     if not ocrd_tool:
         ocrd_tool = 'ocrd-tool.json'
@@ -102,16 +102,19 @@ def validate_page(page, **kwargs):
 @validate_cli.command('tasks')
 @click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. If omitted, only validate syntax')
 @click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME, help='Basename of the METS file, used in conjunction with --workspace')
+@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server')
 @click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.')
 @click.option('-g', '--page-id', help="ID(s) of the pages to process")
 @click.argument('tasks', nargs=-1, required=True)
-def validate_process(tasks, workspace, mets_basename, overwrite, page_id):
+def validate_process(tasks, workspace, mets_basename, mets_server_url, overwrite, page_id):
     '''
-    Validate a sequence of tasks passable to 'ocrd process'
+    Validate a sequence of tasks passable to `ocrd process`
     '''
     if workspace:
-        _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks],
-                                         Workspace(Resolver(), directory=workspace, mets_basename=mets_basename), page_id=page_id, overwrite=overwrite))
+        _inform_of_result(validate_tasks(
+            [ProcessorTask.parse(t) for t in tasks],
+            Workspace(Resolver(), directory=workspace, mets_basename=mets_basename, mets_server_url=mets_server_url),
+            page_id=page_id, overwrite=overwrite))
     else:
         for t in [ProcessorTask.parse(t) for t in tasks]:
             _inform_of_result(t.validate())
diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py
index 0c70fd3a36..77797b3037 100644
--- a/src/ocrd/cli/workspace.py
+++ b/src/ocrd/cli/workspace.py
@@ -6,7 +6,7 @@
     :nested: full
 """
 import os
-from os import getcwd, rmdir, unlink
+from os import rmdir, unlink
 from os.path import dirname, relpath, normpath, exists, join, isabs, isdir
 from pathlib import Path
 from json import loads, dumps
@@ -14,7 +14,6 @@
 from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
 import re
 import time
-import numpy as np
 
 import click
 
@@ -37,6 +36,17 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met
             = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
         self.automatic_backup = automatic_backup
 
+    def workspace(self):
+        return Workspace(
+            self.resolver,
+            directory=self.directory,
+            mets_basename=self.mets_basename,
+            automatic_backup=self.automatic_backup,
+            mets_server_url=self.mets_server_url,
+        )
+
+    def backup_manager(self):
+        return WorkspaceBackupManager(self.workspace())
+
 pass_workspace = click.make_pass_decorator(WorkspaceCtx)
 
@@ -118,7 +128,7 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency
 @workspace_cli.command('clone', cls=command_with_replaced_help(
     (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
 @click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
-@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning")
+@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards")
 @click.argument('mets_url')
 @mets_find_options
 # XXX deprecated
@@ -129,20 +139,25 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
     Create a workspace from METS_URL and return the directory
 
     METS_URL can be a URL, an absolute path or a path relative to $PWD.
-    If METS_URL is not provided, use --mets accordingly.
 
     METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.
+
+    Additional options pertain to the selection of files / fileGrps / pages
+    to be downloaded, if --download is used.
     """
     LOG = getLogger('ocrd.cli.workspace.clone')
     if workspace_dir:
         LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
         ctx.directory = workspace_dir
+    assert not ctx.mets_server_url, \
+        f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
     workspace = ctx.resolver.workspace_from_url(
         mets_url,
         dst_dir=ctx.directory,
         mets_basename=ctx.mets_basename,
         clobber_mets=clobber_mets,
         download=download,
+        fileGrp=file_grp,
         ID=file_id,
         pageId=page_id,
         mimetype=mimetype,
@@ -171,10 +186,12 @@ def workspace_init(ctx, clobber_mets, directory):
     if directory:
         LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
         ctx.directory = directory
+    assert not ctx.mets_server_url, \
+        f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
     workspace = ctx.resolver.workspace_from_nothing(
         directory=ctx.directory,
         mets_basename=ctx.mets_basename,
-        clobber_mets=clobber_mets
+        clobber_mets=clobber_mets,
     )
     workspace.save_mets()
     print(workspace.directory)
@@ -198,13 +215,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
     Add a file or http(s) URL FNAME to METS in a workspace.
     If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
     """
-    workspace = Workspace(
-        ctx.resolver,
-        directory=ctx.directory,
-        mets_basename=ctx.mets_basename,
-        automatic_backup=ctx.automatic_backup,
-        mets_server_url=ctx.mets_server_url,
-    )
+    workspace = ctx.workspace()
 
     log = getLogger('ocrd.cli.workspace.add')
     if not mimetype:
@@ -308,15 +319,10 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
         echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \
       } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \
           -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}'
-    
+
     """
     log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
-    workspace = Workspace(
-        ctx.resolver,
-        directory=ctx.directory,
-        mets_basename=ctx.mets_basename,
-        automatic_backup=ctx.automatic_backup,
-        mets_server_url=ctx.mets_server_url,
-    )
+    workspace = ctx.workspace()
 
     try:
         pat = re.compile(regex)
@@ -407,7 +413,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
             if dry_run:
                 log.info('workspace.add_file(%s)' % file_dict)
             else:
-                workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)
+                workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg
 
     # save changes to disk
     workspace.save_mets()
@@ -451,13 +457,8 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl
     snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
     output_field = [snake_to_camel.get(x, x) for x in output_field]
     modified_mets = False
-    ret = list()
-    workspace = Workspace(
-        ctx.resolver,
-        directory=ctx.directory,
-        mets_basename=ctx.mets_basename,
-        mets_server_url=ctx.mets_server_url,
-    )
+    ret = []
+    workspace = ctx.workspace()
     with pushd_popd(workspace.directory):
         for f in workspace.find_files(
             file_id=file_id,
@@ -507,7 +508,9 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin
 
     (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.)
     """
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+    assert not ctx.mets_server_url, \
+        f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+    workspace = ctx.workspace()
     for i in id:
         workspace.remove_file(i, force=force, keep_file=keep_file)
     workspace.save_mets()
@@ -525,7 +528,9 @@ def rename_group(ctx, old, new):
     """
     Rename fileGrp (USE attribute ``NEW`` to ``OLD``).
     """
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
+    assert not ctx.mets_server_url, \
+        f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+    workspace = ctx.workspace()
     workspace.rename_file_group(old, new)
     workspace.save_mets()
 
@@ -546,7 +551,9 @@ def remove_group(ctx, group, recursive, force, keep_files):
     (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.)
     """
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
+    assert not ctx.mets_server_url, \
+        f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+    workspace = ctx.workspace()
     for g in group:
         workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files)
     workspace.save_mets()
 
@@ -568,7 +575,9 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
     (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.)
     """
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+    assert not ctx.mets_server_url, \
+        f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+    workspace = ctx.workspace()
     with pushd_popd(workspace.directory):
         for f in workspace.find_files(
             file_id=file_id,
@@ -605,8 +614,7 @@ def clean(ctx, dry_run, directories, path_glob):
 
     If no PATH_GLOB are specified, then all files and directories may match.
     """
-    log = getLogger('ocrd.cli.workspace.clean')
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+    workspace = ctx.workspace()
     allowed_files = [normpath(f.local_filename) for f in workspace.find_files(local_only=True)]
     allowed_files.append(relpath(workspace.mets_target, start=workspace.directory))
     allowed_dirs = set(dirname(path) for path in allowed_files)
@@ -624,7 +632,7 @@ def clean(ctx, dry_run, directories, path_glob):
         if normpath(path) in allowed_files:
             continue
         if dry_run:
-            log.info('unlink(%s)' % path)
+            ctx.log.info('unlink(%s)' % path)
         else:
             unlink(path)
     if not directories:
@@ -634,7 +642,7 @@ def clean(ctx, dry_run, directories, path_glob):
         if normpath(path) in allowed_dirs:
             continue
         if dry_run:
-            log.info('rmdir(%s)' % path)
+            ctx.log.info('rmdir(%s)' % path)
         else:
             rmdir(path)
 
@@ -648,7 +656,7 @@ def list_groups(ctx):
     """
     List fileGrp USE attributes
     """
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
+    workspace = ctx.workspace()
     print("\n".join(workspace.mets.file_groups))
 
 # ----------------------------------------------------------------------
@@ -674,20 +682,16 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page
     (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.)
     """
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
-    find_kwargs = {}
-    if page_id_range and 'ID' in output_field:
-        find_kwargs['pageId'] = page_id_range
-    page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})
+    workspace = ctx.workspace()
     ret = []
-
-    if output_field == ['ID']:
-        ret = [[x] for x in page_ids]
-    else:
-        for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)):
+    if page_id_range or list(output_field) != ['ID']:
+        for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)):
             ret.append([])
             for k in output_field:
                 ret[i].append(page_div.get(k, 'None'))
+    else:
+        for page_id in workspace.mets.physical_pages:
+            ret.append([page_id])
 
     if numeric_range:
         start, end = map(int, numeric_range.split('..'))
@@ -721,7 +725,7 @@ def get_id(ctx):
     """
     Get METS id if any
     """
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
+    workspace = ctx.workspace()
     ID = workspace.mets.unique_identifier
     if ID:
         print(ID)
@@ -741,13 +745,13 @@ def set_id(ctx, id):   # pylint: disable=redefined-builtin
 
     Otherwise will create a new {{ ID }}.
     """
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+    workspace = ctx.workspace()
     workspace.mets.unique_identifier = id
     workspace.save_mets()
 
 @workspace_cli.command('update-page')
 @click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
-@click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER') 
+@click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
 @click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
 @click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
 @click.argument('PAGE_ID')
 @pass_workspace
@@ -756,7 +760,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
     """
     Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
     """
-    update_kwargs = {k: v for k, v in attr_value_pairs}
+    update_kwargs = dict(attr_value_pairs)
     if order:
         update_kwargs['ORDER'] = order
     if orderlabel:
@@ -764,7 +768,9 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
     if contentids:
         update_kwargs['CONTENTIDS'] = contentids
     try:
-        workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+        assert not ctx.mets_server_url, \
+            f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+        workspace = ctx.workspace()
         workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
         workspace.save_mets()
     except Exception as err:
@@ -802,7 +808,9 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
     mets_path = Path(mets_path)
     if filegrp_mapping:
         filegrp_mapping = loads(filegrp_mapping)
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+    assert not ctx.mets_server_url, \
+        f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+    workspace = ctx.workspace()
     other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
     workspace.merge(
         other_workspace,
@@ -826,11 +834,12 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
 # ----------------------------------------------------------------------
 
 @workspace_cli.group('backup')
-@click.pass_context
+@pass_workspace
 def workspace_backup_cli(ctx): # pylint: disable=unused-argument
     """
     Backing and restoring workspaces - dev edition
     """
+    assert not ctx.mets_server_url, "Workspace backups currently not interoperable with METS Server"
 
 @workspace_backup_cli.command('add')
 @pass_workspace
@@ -838,7 +847,7 @@ def workspace_backup_add(ctx):
     """
     Create a new backup
     """
-    backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
+    backup_manager = ctx.backup_manager()
     backup_manager.add()
 
 @workspace_backup_cli.command('list')
@@ -847,7 +856,7 @@ def workspace_backup_list(ctx):
     """
     List backups
     """
-    backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
+    backup_manager = ctx.backup_manager()
     for b in backup_manager.list():
         print(b)
 
@@ -859,7 +868,7 @@ def workspace_backup_restore(ctx, choose_first, bak):
     """
     Restore backup BAK
     """
-    backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
+    backup_manager = ctx.backup_manager()
     backup_manager.restore(bak, choose_first)
 
 @workspace_backup_cli.command('undo')
@@ -868,7 +877,7 @@ def workspace_backup_undo(ctx):
     """
     Restore the last backup
     """
-    backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
+    backup_manager = ctx.backup_manager()
     backup_manager.undo()
 
@@ -885,15 +894,24 @@ def workspace_serve_cli(ctx): # pylint: disable=unused-argument
 
 @workspace_serve_cli.command('stop')
 @pass_workspace
 def workspace_serve_stop(ctx): # pylint: disable=unused-argument
-    """Stop the METS server"""
-    workspace = Workspace(
-        ctx.resolver,
-        directory=ctx.directory,
-        mets_basename=ctx.mets_basename,
-        mets_server_url=ctx.mets_server_url,
-    )
+    """Stop the METS server (saving changes to disk)"""
+    workspace = ctx.workspace()
     workspace.mets.stop()
 
+@workspace_serve_cli.command('reload')
+@pass_workspace
+def workspace_serve_reload(ctx): # pylint: disable=unused-argument
+    """Reload the METS server from disk"""
+    workspace = ctx.workspace()
+    workspace.mets.reload()
+
+@workspace_serve_cli.command('save')
+@pass_workspace
+def workspace_serve_save(ctx): # pylint: disable=unused-argument
+    """Save the METS changes to disk"""
+    workspace = ctx.workspace()
+    workspace.mets.save()
+
 @workspace_serve_cli.command('start')
 @pass_workspace
 def workspace_serve_start(ctx): # pylint: disable=unused-argument
log.critical(e.message) + sys.exit(1) + if show_resource: + try: + processor.show_resource(show_resource) + sys.exit() + except ResourceNotFoundError as e: + log = getLogger('ocrd.processor.base') + log.critical(e.message) + sys.exit(1) + if list_resources: + processor.list_resources() sys.exit() - if subcommand: + if subcommand or address or queue or database: # Used for checking/starting network agents for the WebAPI architecture check_and_run_network_agent(processorClass, subcommand, address, database, queue) - elif address or queue or database: - raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") + # from here: single-run processing context initLogging() - - LOG = getLogger('ocrd.cli_wrap_processor') - assert kwargs['input_file_grp'] is not None - assert kwargs['output_file_grp'] is not None - # LOG.info('kwargs=%s' % kwargs) if 'parameter' in kwargs: # Disambiguate parameter file/literal, and resolve file - # (but avoid entering processing context of constructor) - class DisposableSubclass(processorClass): - def show_version(self): - pass - disposable = DisposableSubclass(None, show_version=True) def resolve(name): try: - return disposable.resolve_resource(name) + return processor.resolve_resource(name) except ResourceNotFoundError: return None kwargs['parameter'] = parse_json_string_or_file(*kwargs['parameter'], resolve_preset_file=resolve) else: - kwargs['parameter'] = dict() + kwargs['parameter'] = {} # Merge parameter overrides and parameters if 'parameter_override' in kwargs: - set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) - # TODO OCR-D/core#274 + set_json_key_value_overrides(kwargs['parameter'], *kwargs.pop('parameter_override')) # Assert -I / -O - # if not kwargs['input_file_grp']: - # raise ValueError('-I/--input-file-grp is required') - # if not kwargs['output_file_grp']: - # raise ValueError('-O/--output-file-grp is required') + if not kwargs['input_file_grp']: + raise ValueError('-I/--input-file-grp is required') + if not kwargs['output_file_grp']: + raise ValueError('-O/--output-file-grp is required') resolver = Resolver() working_dir, mets, _, mets_server_url = \ resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url) workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url) page_id = kwargs.get('page_id') - # XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505 - # if overwrite - # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: - # raise Exception("--overwrite requires --output-file-grp") - # LOG.info("Removing files because of --overwrite") - # for grp in kwargs['output_file_grp'].split(','): - # if page_id: - # for one_page_id in kwargs['page_id'].split(','): - # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) - # for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): - # workspace.remove_file(file, force=True, keep_file=False, page_recursive=True) - # else: - # LOG.debug("Removing all files in output file group %s ", grp) - # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) - # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) - # workspace.save_mets() - # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace + if debug: + 
config.OCRD_MISSING_INPUT = 'ABORT' + config.OCRD_MISSING_OUTPUT = 'ABORT' + config.OCRD_EXISTING_OUTPUT = 'ABORT' if overwrite: - workspace.overwrite_mode = True + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) if not report.is_valid: raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) @@ -139,22 +137,31 @@ def resolve(name): print("Profiling...") pr = cProfile.Profile() pr.enable() - def exit(): + def goexit(): pr.disable() print("Profiling completed") if profile_file: - with open(profile_file, 'wb') as f: - pr.dump_stats(profile_file) + pr.dump_stats(profile_file) s = io.StringIO() pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() print(s.getvalue()) - atexit.register(exit) - run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) + atexit.register(goexit) + if log_filename: + log_ctx = redirect_stderr_and_stdout_to_file(log_filename) + else: + log_ctx = nullcontext() + with log_ctx: + run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): """ """ + from ocrd_network import ProcessingWorker, ProcessorServer, AgentType + SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER] + + if not subcommand: + raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") if subcommand not in SUBCOMMANDS: raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}") diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index e640a20032..a401264ed2 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -33,21 +33,23 @@ def cli(mets_url): option('-O', '--output-file-grp', default=None), option('-g', '--page-id'), option('--overwrite', is_flag=True, default=False), + option('--debug', is_flag=True, default=False), option('--profile', is_flag=True, default=False), option('--profile-file', type=Path(dir_okay=False, writable=True)), parameter_option, parameter_override_option, loglevel_option, + option('--log-filename', default=None), option('--address', type=ServerAddressParamType()), option('--queue', type=QueueServerParamType()), option('--database', type=DatabaseParamType()), + option('-R', '--resolve-resource'), option('-C', '--show-resource'), option('-L', '--list-resources', is_flag=True, default=False), option('-J', '--dump-json', is_flag=True, default=False), option('-D', '--dump-module-dir', is_flag=True, default=False), option('-h', '--help', is_flag=True, default=False), option('-V', '--version', is_flag=True, default=False), - option('--log-filename', default=None), # Subcommand, only used for 'worker'/'server'. 
Cannot be handled in # click because processors use the @command decorator and even if they # were using `group`, you cannot combine have a command with diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 0fbe3e0577..2f8be3d868 100644 --- a/src/ocrd/decorators/parameter_option.py +++ b/src/ocrd/decorators/parameter_option.py @@ -1,16 +1,16 @@ from click import option -#from ocrd_utils import parse_json_string_or_file __all__ = ['parameter_option', 'parameter_override_option'] def _handle_param_option(ctx, param, value): + from ocrd_utils import parse_json_string_or_file return parse_json_string_or_file(*list(value)) parameter_option = option('-p', '--parameter', help="Parameters, either JSON string or path to JSON file", multiple=True, - default=['{}'], + default=[], # now handled in ocrd_cli_wrap_processor to resolve processor preset files # callback=_handle_param_option callback=lambda ctx, param, kv: list(kv)) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 1e3ecfc6eb..52bde30258 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -27,12 +27,22 @@ ocrd__log () { ## Ensure minimum version # ht https://stackoverflow.com/posts/4025065 ocrd__minversion () { - local minversion="$1" - local version=$(ocrd --version|sed 's/ocrd, version //') - #echo "$minversion < $version?" - local IFS=. - version=($version) - minversion=($minversion) + set -e + local minversion_raw="$1" + local version_raw=$(ocrd --version|sed 's/ocrd, version //') + local version_mmp=$(echo "$version_raw" | grep -Eo '([0-9]+\.?){3}') + local version_prerelease_suffix="${version_raw#$version_mmp}" + if [[ -z $version_prerelease_suffix ]];then + version_prerelease_suffix=0 + fi + local minversion_mmp=$(echo "$minversion_raw" | grep -Eo '([0-9]+\.?){3}') + local minversion_prerelease_suffix="${minversion_raw#$minversion_mmp}" + if [[ -z $minversion_prerelease_suffix ]];then + minversion_prerelease_suffix=0 + fi + local IFS='.' + version=($version_mmp) + minversion=($minversion_mmp) # MAJOR > MAJOR if (( ${version[0]} > ${minversion[0]} ));then return @@ -44,12 +54,17 @@ ocrd__minversion () { # MINOR == MINOR elif (( ${version[1]} == ${minversion[1]} ));then # PATCH > PATCH - if (( ${version[2]} >= ${minversion[2]} ));then + if (( ${version[2]} > ${minversion[2]} ));then return + elif (( ${version[2]} == ${minversion[2]}));then + # Match prerelease suffix like a1, b1 alphabetically + if [ "$version_prerelease_suffix" = "$minversion_prerelease_suffix" -o "$version_prerelease_suffix" \> "$minversion_prerelease_suffix" ]; then + return + fi fi fi fi - ocrd__raise "ocrd/core is too old (${version[*]} < ${minversion[*]}). Please update OCR-D/core" + ocrd__raise "ocrd/core is too old ($version_raw < $minversion_raw). Please update OCR-D/core" } ## ### `ocrd__dumpjson` @@ -108,6 +123,7 @@ ocrd__usage () { ## declare -A ocrd__argv=() ## ``` ocrd__parse_argv () { + set -e # if [[ -n "$ZSH_VERSION" ]];then # print -r -- ${+ocrd__argv} ${(t)ocrd__argv} @@ -120,11 +136,16 @@ ocrd__parse_argv () { ocrd__raise "Must set \$params (declare -A params)" fi + if ! 
declare -p "params_json" >/dev/null 2>/dev/null ;then + ocrd__raise "Must set \$params_json (declare params_json)" + fi + if [[ $# = 0 ]];then ocrd__usage exit 1 fi + ocrd__argv[debug]=false ocrd__argv[overwrite]=false ocrd__argv[profile]=false ocrd__argv[profile_file]= @@ -141,6 +162,7 @@ ocrd__parse_argv () { while [[ "${1:-}" = -* ]];do case "$1" in -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;; + --log-filename) exec 2> "$2" ; shift ;; -h|--help|--usage) ocrd__usage; exit ;; -J|--dump-json) ocrd__dumpjson; exit ;; -D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;; @@ -154,6 +176,7 @@ ocrd__parse_argv () { -w|--working-dir) ocrd__argv[working_dir]=$(realpath "$2") ; shift ;; -m|--mets) ocrd__argv[mets_file]=$(realpath "$2") ; shift ;; -U|--mets-server-url) ocrd__argv[mets_server_url]="$2" ; shift ;; + --debug) ocrd__argv[debug]=true ;; --overwrite) ocrd__argv[overwrite]=true ;; --profile) ocrd__argv[profile]=true ;; --profile-file) ocrd__argv[profile_file]=$(realpath "$2") ; shift ;; @@ -226,17 +249,6 @@ ocrd__parse_argv () { trap showtime DEBUG fi - # check fileGrps - local _valopts=( --workspace "${ocrd__argv[working_dir]}" --mets-basename "$(basename ${ocrd__argv[mets_file]})" ) - if [[ ${ocrd__argv[overwrite]} = true ]]; then - _valopts+=( --overwrite ) - fi - if [[ -n "${ocrd__argv[page_id]:-}" ]]; then - _valopts+=( --page-id "${ocrd__argv[page_id]}" ) - fi - _valopts+=( "${OCRD_TOOL_NAME#ocrd-} -I ${ocrd__argv[input_file_grp]} -O ${ocrd__argv[output_file_grp]} ${__parameters[*]@Q} ${__parameter_overrides[*]@Q}" ) - ocrd validate tasks "${_valopts[@]}" || exit $? - # check parameters local params_parsed retval params_parsed="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params "${__parameters[@]}" "${__parameter_overrides[@]}")" || { @@ -245,10 +257,12 @@ ocrd__parse_argv () { $params_parsed" } eval "$params_parsed" + params_json="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params --json "${__parameters[@]}" "${__parameter_overrides[@]}")" } ocrd__wrap () { + set -e declare -gx OCRD_TOOL_JSON="$1" declare -gx OCRD_TOOL_NAME="$2" @@ -256,6 +270,7 @@ ocrd__wrap () { shift declare -Agx params params=() + declare -g params_json declare -Agx ocrd__argv ocrd__argv=() @@ -277,20 +292,26 @@ ocrd__wrap () { ocrd__parse_argv "$@" - i=0 - declare -ag ocrd__files=() - while read line; do - eval declare -Ag "ocrd__file$i=( $line )" - eval "ocrd__files[$i]=ocrd__file$i" - let ++i - done < <(ocrd bashlib input-files \ + declare -ag ocrd__files + IFS=$'\n' + ocrd__files=( $(ocrd bashlib input-files \ + --ocrd-tool $OCRD_TOOL_JSON \ + --executable $OCRD_TOOL_NAME \ + $(if [[ ${ocrd__argv[debug]} = true ]]; then echo --debug; fi) \ + $(if [[ ${ocrd__argv[overwrite]} = true ]]; then echo --overwrite; fi) \ -m "${ocrd__argv[mets_file]}" \ + -d "${ocrd__argv[working_dir]}" \ + ${ocrd__argv[mets_server_url]:+-U} ${ocrd__argv[mets_server_url]:-} \ + -p "$params_json" \ -I "${ocrd__argv[input_file_grp]}" \ -O "${ocrd__argv[output_file_grp]}" \ - ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-}) + ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-}) ) + IFS=$' \t\n' } ## usage: pageId=$(ocrd__input_file 3 pageId) ocrd__input_file() { - eval echo "\${${ocrd__files[$1]}[$2]}" + declare -A input_file + eval input_file=( "${ocrd__files[$1]}" ) + eval echo "${input_file[$2]}" } diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 0d4c0a0785..101727e064 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -21,7 +21,7 @@ 
import uvicorn from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdAgent, ClientSideOcrdAgent -from ocrd_utils import getLogger, deprecated_alias +from ocrd_utils import getLogger # @@ -88,6 +88,14 @@ def create(file_groups: List[str]): return OcrdFileGroupListModel(file_groups=file_groups) +class OcrdPageListModel(BaseModel): + physical_pages: List[str] = Field() + + @staticmethod + def create(physical_pages: List[str]): + return OcrdPageListModel(physical_pages=physical_pages) + + class OcrdAgentListModel(BaseModel): agents: List[OcrdAgentModel] = Field() @@ -120,7 +128,7 @@ class ClientSideOcrdMets: def __init__(self, url, workspace_path: Optional[str] = None): self.protocol = "tcp" if url.startswith("http://") else "uds" - self.log = getLogger(f"ocrd.mets_client[{url}]") + self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}") self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}' self.ws_dir_path = workspace_path if workspace_path else None @@ -210,6 +218,17 @@ def workspace_path(self): ).json()["text"] return self.ws_dir_path + @property + def physical_pages(self) -> List[str]: + if not self.multiplexing_mode: + return self.session.request("GET", f"{self.url}/physical_pages").json()["physical_pages"] + else: + return self.session.request( + "POST", + self.url, + json=MpxReq.physical_pages(self.ws_dir_path) + ).json()["physical_pages"] + @property def file_groups(self): if not self.multiplexing_mode: @@ -236,7 +255,7 @@ def agents(self): agent_dict["_type"] = agent_dict.pop("type") return [ClientSideOcrdAgent(None, **agent_dict) for agent_dict in agent_dicts] - def add_agent(self, *args, **kwargs): + def add_agent(self, **kwargs): if not self.multiplexing_mode: return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) else: @@ -247,11 +266,9 @@ def add_agent(self, *args, **kwargs): ).json() return OcrdAgentModel.create(**kwargs) - @deprecated_alias(ID="file_id") - @deprecated_alias(pageId="page_id") - @deprecated_alias(fileGrp="file_grp") def find_files(self, **kwargs): self.log.debug("find_files(%s)", kwargs) + # translate from native OcrdMets kwargs to OcrdMetsServer REST params if "pageId" in kwargs: kwargs["page_id"] = kwargs.pop("pageId") if "ID" in kwargs: @@ -277,28 +294,31 @@ def find_files(self, **kwargs): def find_all_files(self, *args, **kwargs): return list(self.find_files(*args, **kwargs)) - @deprecated_alias(pageId="page_id") - @deprecated_alias(ID="file_id") def add_file( - self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs + self, file_grp, content=None, ID=None, url=None, local_filename=None, mimetype=None, pageId=None, **kwargs ): data = OcrdFileModel.create( - file_id=file_id, file_grp=file_grp, page_id=page_id, mimetype=mimetype, url=url, - local_filename=local_filename + file_grp=file_grp, + # translate from native OcrdMets kwargs to OcrdMetsServer REST params + file_id=ID, page_id=pageId, + mimetype=mimetype, url=url, local_filename=local_filename ) + # add force+ignore + kwargs = {**kwargs, **data.dict()} if not self.multiplexing_mode: - r = self.session.request("POST", f"{self.url}/file", data=data.dict()) - if not r: - raise RuntimeError("Add file failed. 
Please check provided parameters")
+            r = self.session.request("POST", f"{self.url}/file", data=kwargs)
+            if not r.ok:
+                raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()}")
         else:
-            r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict()))
-            if "error" in r:
-                raise RuntimeError(f"Add file failed: Msg: {r['error']}")
+            r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, kwargs))
+            if not r.ok:
+                raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()['errors']}")
         return ClientSideOcrdFile(
-            None, ID=file_id, fileGrp=file_grp, url=url, pageId=page_id, mimetype=mimetype,
-            local_filename=local_filename
+            None, fileGrp=file_grp,
+            ID=ID, pageId=pageId,
+            url=url, mimetype=mimetype, local_filename=local_filename
         )

@@ -348,6 +368,11 @@ def workspace_path(ws_dir_path: str) -> Dict:
         return MpxReq.__args_wrapper(
             ws_dir_path, method_type="GET", response_type="text", request_url="workspace_path", request_data={})

+    @staticmethod
+    def physical_pages(ws_dir_path: str) -> Dict:
+        return MpxReq.__args_wrapper(
+            ws_dir_path, method_type="GET", response_type="dict", request_url="physical_pages", request_data={})
+
     @staticmethod
     def file_groups(ws_dir_path: str) -> Dict:
         return MpxReq.__args_wrapper(
@@ -404,7 +429,6 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int
     @staticmethod
     def kill_process(mets_server_pid: int):
         subprocess_run(args=["kill", "-s", "SIGINT", f"{mets_server_pid}"], shell=False, universal_newlines=True)
-        return

     def shutdown(self):
         if self.is_uds:
@@ -468,6 +492,10 @@ async def unique_identifier():
         async def workspace_path():
             return Response(content=workspace.directory, media_type="text/plain")

+        @app.get(path='/physical_pages', response_model=OcrdPageListModel)
+        async def physical_pages():
+            return {'physical_pages': workspace.mets.physical_pages}
+
         @app.get(path='/file_groups', response_model=OcrdFileGroupListModel)
         async def file_groups():
             return {'file_groups': workspace.mets.file_groups}
@@ -507,7 +535,8 @@ async def add_file(
             page_id: Optional[str] = Form(),
             mimetype: str = Form(),
             url: Optional[str] = Form(None),
-            local_filename: Optional[str] = Form(None)
+            local_filename: Optional[str] = Form(None),
+            force: bool = Form(False),
         ):
             """
             Add a file
@@ -519,7 +548,7 @@ async def add_file(
             )
             # Add to workspace
             kwargs = file_resource.dict()
-            workspace.add_file(**kwargs)
+            workspace.add_file(**kwargs, force=force)
             return file_resource

         # ------------- #
diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py
index 21b0c69eb2..7cbcb851de 100644
--- a/src/ocrd/processor/__init__.py
+++ b/src/ocrd/processor/__init__.py
@@ -1,9 +1,15 @@
 from .base import (
     Processor,
-    ResourceNotFoundError
+    ResourceNotFoundError,
+    NonUniqueInputFile,
+    MissingInputFile,
+    generate_processor_help,
+)
+from .ocrd_page_result import (
+    OcrdPageResult,
+    OcrdPageResultImage
 )
 from .helpers import (
     run_cli,
     run_processor,
-    generate_processor_help
 )
diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py
index 8303413933..d6348b40e1 100644
--- a/src/ocrd/processor/base.py
+++ b/src/ocrd/processor/base.py
@@ -9,34 +9,68 @@
     'run_processor'
 ]

-from os.path import exists
+from functools import cached_property
+from os.path import exists, join
 from shutil import copyfileobj
 import json
 import os
 from os import getcwd
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, get_args
 import sys
+import logging
+import
logging.handlers +import inspect import tarfile import io -from ocrd.workspace import Workspace +from collections import defaultdict +from frozendict import frozendict +# concurrent.futures is buggy in py38, +# this is where the fixes came from: +from loky import Future, ProcessPoolExecutor +import multiprocessing as mp +from threading import Timer +from _thread import interrupt_main +from click import wrap_text +from deprecated import deprecated +from requests import HTTPError + +from ..workspace import Workspace +from ..mets_server import ClientSideOcrdMets +from ocrd_models.ocrd_file import OcrdFileType +from .ocrd_page_result import OcrdPageResult from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, MIME_TO_EXT, + config, getLogger, - initLogging, list_resource_candidates, pushd_popd, list_all_resources, get_processor_resource_types, resource_filename, + parse_json_file_with_comments, + make_file_id, + deprecation_warning ) from ocrd_validators import ParameterValidator -from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType +from ocrd_models.ocrd_page import ( + PageType, + AlternativeImageType, + MetadataItemType, + LabelType, + LabelsType, + OcrdPage, + to_xml, +) +from ocrd_modelfactory import page_from_file +from ocrd_validators.ocrd_tool_validator import OcrdToolValidator # XXX imports must remain for backwards-compatibility -from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import +from .helpers import run_cli, run_processor # pylint: disable=unused-import + class ResourceNotFoundError(FileNotFoundError): """ @@ -46,159 +80,785 @@ class ResourceNotFoundError(FileNotFoundError): def __init__(self, name, executable): self.name = name self.executable = executable - self.message = "Could not find resource '%s' for executable '%s'. " \ - "Try 'ocrd resmgr download %s %s' to download this resource." \ - % (name, executable, executable, name) + self.message = (f"Could not find resource '{name}' for executable '{executable}'. " + f"Try 'ocrd resmgr download {executable} {name}' to download this resource.") + super().__init__(self.message) + +class NonUniqueInputFile(ValueError): + """ + An exception signifying the specified fileGrp / pageId / mimetype + selector yields multiple PAGE files, or no PAGE files but multiple images, + or multiple files of that mimetype. + """ + def __init__(self, fileGrp, pageId, mimetype): + self.fileGrp = fileGrp + self.pageId = pageId + self.mimetype = mimetype + self.message = (f"Could not determine unique input file for fileGrp {fileGrp} " + f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") + super().__init__(self.message) + +class MissingInputFile(ValueError): + """ + An exception signifying the specified fileGrp / pageId / mimetype + selector yields no PAGE files, or no PAGE and no image files, + or no files of that mimetype. + """ + def __init__(self, fileGrp, pageId, mimetype): + self.fileGrp = fileGrp + self.pageId = pageId + self.mimetype = mimetype + self.message = (f"Could not find input file for fileGrp {fileGrp} " + f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") super().__init__(self.message) +class DummyFuture: + """ + Mimics some of `concurrent.futures.Future` but runs immediately. 
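+    (For example, ``DummyFuture(len, 'abc').result()`` simply returns ``3`` -
+    the wrapped callable only runs once ``result()`` is called.)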
+ """ + def __init__(self, fn, *args, **kwargs): + self.fn = fn + self.args = args + self.kwargs = kwargs + def result(self): + return self.fn(*self.args, **self.kwargs) +class DummyExecutor: + """ + Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs + everything immediately in this process. + """ + def __init__(self, initializer=None, initargs=(), **kwargs): + initializer(*initargs) + def shutdown(self, **kwargs): + pass + def submit(self, fn, *args, **kwargs) -> DummyFuture: + return DummyFuture(fn, *args, **kwargs) + +TFuture = Union[DummyFuture, Future] +TExecutor = Union[DummyExecutor, ProcessPoolExecutor] + class Processor(): """ - A processor is a tool that implements the uniform OCR-D command-line interface - for run-time data processing. That is, it executes a single workflow step, - or a combination of workflow steps, on the workspace (represented by local METS). - It reads input files for all or requested physical pages of the input fileGrp(s), - and writes output files for them into the output fileGrp(s). It may take - a number of optional or mandatory parameters. + A processor is a tool that implements the uniform OCR-D + `command-line interface for run-time data processing `_. + + That is, it executes a single workflow step, or a combination of workflow steps, + on the workspace (represented by local METS). It reads input files for all or selected + physical pages of the input fileGrp(s), computes additional annotation, and writes output + files for them into the output fileGrp(s). It may take a number of optional or mandatory + parameters. + """ + + max_instances : int = -1 + """ + maximum number of cached instances (ignored if negative), to be applied on top of + :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller). + + (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.) + """ + + max_workers : int = -1 + """ + maximum number of processor forks for page-parallel processing (ignored if negative), + to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e. + whatever is smaller). + + (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores + - at once, or if your class already creates threads prior to forking, e.g. during ``setup``.) + """ + + max_page_seconds : int = -1 + """ + maximum number of seconds may be spent processing a single page (ignored if negative), + to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT` + (i.e. whatever is smaller). + + (Override this if you know how costly this processor may be, irrespective of image size + or complexity of the page.) """ + @property + def metadata_filename(self) -> str: + """ + Relative location of the ``ocrd-tool.json`` file inside the package. + + Used by :py:data:`metadata_location`. + + (Override if ``ocrd-tool.json`` is not in the root of the module, + e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``). + """ + return 'ocrd-tool.json' + + @cached_property + def metadata_location(self) -> Path: + """ + Absolute path of the ``ocrd-tool.json`` file as distributed with the package. + + Used by :py:data:`metadata_rawdict`. + + (Override if ``ocrd-tool.json`` is not distributed with the Python package.) 
+ """ + module = inspect.getmodule(self) + module_tokens = module.__package__.split('.') + # for namespace packages, we cannot just use the first token + for i in range(len(module_tokens)): + prefix = '.'.join(module_tokens[:i + 1]) + if sys.modules[prefix].__spec__.has_location: + return resource_filename(prefix, self.metadata_filename) + raise Exception("cannot find top-level module prefix for %s", module.__package__) + + @cached_property + def metadata_rawdict(self) -> dict: + """ + Raw (unvalidated, unexpanded) ``ocrd-tool.json`` dict contents of the package. + + Used by :py:data:`metadata`. + + (Override if ``ocrd-tool.json`` is not in a file.) + """ + return parse_json_file_with_comments(self.metadata_location) + + @cached_property + def metadata(self) -> dict: + """ + The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D + `spec `_ for processor tools. + + After deserialisation, it also gets validated against the + `schema `_ with all defaults + expanded. + + Used by :py:data:`ocrd_tool` and :py:data:`version`. + + (Override if you want to provide metadata programmatically instead of a + JSON file.) + """ + metadata = self.metadata_rawdict + report = OcrdToolValidator.validate(metadata) + if not report.is_valid: + self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n" + f"{report.to_xml()}.\nPlease open an issue at {metadata.get('git_url', 'the website')}.") + return metadata + + @cached_property + def version(self) -> str: + """ + The program version of the package. + Usually the ``version`` part of :py:data:`metadata`. + + (Override if you do not want to use :py:data:`metadata` lookup + mechanism.) + """ + return self.metadata['version'] + + @cached_property + def executable(self) -> str: + """ + The executable name of this processor tool. Taken from the runtime + filename. + + Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`. + + (Override if your entry-point name deviates from the ``executable`` + name, or the processor gets instantiated from another runtime.) + """ + return os.path.basename(inspect.stack()[-1].filename) + + @cached_property + def ocrd_tool(self) -> dict: + """ + The ``ocrd-tool.json`` dict contents of this processor tool. + Usually the :py:data:`executable` key of the ``tools`` part + of :py:data:`metadata`. + + (Override if you do not want to use :py:data:`metadata` lookup + mechanism.) 
+ """ + return self.metadata['tools'][self.executable] + + @property + def parameter(self) -> Optional[dict]: + """the runtime parameter dict to be used by this processor""" + if hasattr(self, '_parameter'): + return self._parameter + return None + + @parameter.setter + def parameter(self, parameter : dict) -> None: + if self.parameter is not None: + self.shutdown() + parameterValidator = ParameterValidator(self.ocrd_tool) + report = parameterValidator.validate(parameter) + if not report.is_valid: + raise ValueError(f'Invalid parameters:\n{report.to_xml()}') + # make parameter dict read-only + self._parameter = frozendict(parameter) + # (re-)run setup to load models etc + self.setup() + def __init__( self, - workspace : Workspace, + # FIXME: remove in favor of process_workspace(workspace) + workspace : Optional[Workspace], ocrd_tool=None, parameter=None, input_file_grp=None, output_file_grp=None, page_id=None, - resolve_resource=None, - show_resource=None, - list_resources=False, - show_help=False, - subcommand=None, - show_version=False, - dump_json=False, - dump_module_dir=False, + download_files=config.OCRD_DOWNLOAD_INPUT, version=None ): """ - Instantiate, but do not process. Unless ``list_resources`` or - ``show_resource`` or ``show_help`` or ``show_version`` or - ``dump_json`` or ``dump_module_dir`` is true, setup for processing - (parsing and validating parameters, entering the workspace directory). + Instantiate, but do not setup (neither for processing nor other usage). + If given, do parse and validate :py:data:`.parameter`. Args: workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \ - Can be ``None`` even for processing (esp. on multiple workspaces), \ - but then needs to be set before running. + If not ``None``, then `chdir` to that directory. + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. Keyword Args: - ocrd_tool (string): JSON of the ocrd-tool description for that processor. \ - Can be ``None`` for processing, but needs to be set before running. parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ Can be ``None`` even for processing, but then needs to be set before running. - input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. - output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. + input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input. \ + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. + output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output. \ + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. page_id (string): comma-separated list of METS physical ``page`` IDs to process \ - (or empty for all pages). - resolve_resource (string): If not ``None``, then instead of processing, resolve \ - given resource by name and print its full path to stdout. - show_resource (string): If not ``None``, then instead of processing, resolve \ - given resource by name and print its contents to stdout. - list_resources (boolean): If true, then instead of processing, find all installed \ - resource files in the search paths and print their path names. - show_help (boolean): If true, then instead of processing, print a usage description \ - including the standard CLI and all of this processor's ocrd-tool parameters and \ - docstrings. 
- subcommand (string): 'worker' or 'server', only used here for the right --help output - show_version (boolean): If true, then instead of processing, print information on \ - this processor's version and OCR-D version. Exit afterwards. - dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \ - on stdout. - dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir` \ - on stdout. - """ - self.ocrd_tool = ocrd_tool - if dump_json: - print(json.dumps(ocrd_tool, indent=True)) - return - if dump_module_dir: - print(self.moduledir) - return - if list_resources: - for res in self.list_all_resources(): - print(res) - return - if resolve_resource: - try: - res = self.resolve_resource(resolve_resource) - print(res) - except ResourceNotFoundError as e: - log = getLogger('ocrd.processor.base') - log.critical(e.message) - sys.exit(1) - return - if show_resource: - try: - self.show_resource(show_resource) - except ResourceNotFoundError as e: - log = getLogger('ocrd.processor.base') - log.critical(e.message) - sys.exit(1) - return - if show_help: - self.show_help(subcommand=subcommand) - return - self.version = version - if show_version: - self.show_version() - return - self.workspace = workspace - # FIXME HACK would be better to use pushd_popd(self.workspace.directory) - # but there is no way to do that in process here since it's an - # overridden method. chdir is almost always an anti-pattern. - if self.workspace: + (or empty for all pages). \ + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. + download_files (boolean): Whether input files will be downloaded prior to processing, \ + defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT` which is ``True`` by default + """ + if ocrd_tool is not None: + deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - " + "use or override metadata/executable/ocrd-tool properties instead") + self.ocrd_tool = ocrd_tool + self.executable = ocrd_tool['executable'] + if version is not None: + deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - " + "use or override metadata/version properties instead") + self.version = version + if workspace is not None: + deprecation_warning("Passing a workspace argument other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") + self.workspace = workspace self.old_pwd = getcwd() os.chdir(self.workspace.directory) - self.input_file_grp = input_file_grp - self.output_file_grp = output_file_grp - self.page_id = None if page_id == [] or page_id is None else page_id - if parameter is None: - parameter = {} - parameterValidator = ParameterValidator(ocrd_tool) - report = parameterValidator.validate(parameter) - if not report.is_valid: - raise Exception("Invalid parameters %s" % report.errors) - self.parameter = parameter + if input_file_grp is not None: + deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") + self.input_file_grp = input_file_grp + if output_file_grp is not None: + deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") + self.output_file_grp = output_file_grp + if page_id is not None: + deprecation_warning("Passing a page_id kwarg other than 'None' to Processor " + "is deprecated - pass as argument 
to process_workspace instead") + self.page_id = page_id or None + self.download = download_files + #: The logger to be used by processor implementations. + # `ocrd.processor.base` internals should use :py:attr:`self._base_logger` + self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') + self._base_logger = getLogger('ocrd.processor.base') + if parameter is not None: + self.parameter = parameter + # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): + setattr(self, 'process', + deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process'))) + + def __del__(self): + self._base_logger.debug("shutting down") + self.shutdown() def show_help(self, subcommand=None): + """ + Print a usage description including the standard CLI and all of this processor's ocrd-tool + parameters and docstrings. + """ print(generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)) def show_version(self): + """ + Print information on this processor's version and OCR-D version. + """ print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION)) def verify(self): """ - Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements. + Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements. """ + # verify input and output file groups in parameters + assert self.input_file_grp is not None + assert self.output_file_grp is not None + input_file_grps = self.input_file_grp.split(',') + output_file_grps = self.output_file_grp.split(',') + def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg): + if isinstance(spec, int): + if spec > 0: + assert len(grps) == spec, msg % (len(grps), str(spec)) + else: + assert isinstance(spec, list) + minimum = spec[0] + maximum = spec[1] + if minimum > 0: + assert len(grps) >= minimum, msg % (len(grps), str(spec)) + if maximum > 0: + assert len(grps) <= maximum, msg % (len(grps), str(spec)) + # FIXME: enforce unconditionally as soon as grace period for deprecation is over + if 'input_file_grp_cardinality' in self.ocrd_tool: + assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'], + "Unexpected number of input file groups %d vs %s") + if 'output_file_grp_cardinality' in self.ocrd_tool: + assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], + "Unexpected number of output file groups %d vs %s") + # verify input and output file groups in METS + for input_file_grp in input_file_grps: + assert input_file_grp in self.workspace.mets.file_groups, \ + f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}" + for output_file_grp in output_file_grps: + assert output_file_grp not in self.workspace.mets.file_groups \ + or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] \ + or not any(self.workspace.mets.find_files( + pageId=self.page_id, fileGrp=output_file_grp)), \ + f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}" + # keep this for backwards compatibility: return True + def dump_json(self): + """ + Print :py:attr:`ocrd_tool` on stdout. + """ + print(json.dumps(self.ocrd_tool, indent=True)) + + def dump_module_dir(self): + """ + Print :py:attr:`moduledir` on stdout. 
+ """ + print(self.moduledir) + + def list_resources(self): + """ + Find all installed resource files in the search paths and print their path names. + """ + for res in self.list_all_resources(): + print(res) + + def setup(self) -> None: + """ + Prepare the processor for actual data processing, + prior to changing to the workspace directory but + after parsing parameters. + + (Override this to load models into memory etc.) + """ + pass + + def shutdown(self) -> None: + """ + Bring down the processor after data processing, + after to changing back from the workspace directory but + before exiting (or setting up with different parameters). + + (Override this to unload models from memory etc.) + """ + pass + + @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()') def process(self) -> None: """ - Process the :py:attr:`workspace` - from the given :py:attr:`input_file_grp` - to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` - under the given :py:attr:`parameter`. - - (This contains the main functionality and needs to be overridden by subclasses.) + Process all files of the :py:data:`workspace` + from the given :py:data:`input_file_grp` + to the given :py:data:`output_file_grp` + for the given :py:data:`page_id` (or all pages) + under the given :py:data:`parameter`. + + (This contains the main functionality and needs to be + overridden by subclasses.) """ raise NotImplementedError() + def process_workspace(self, workspace: Workspace) -> None: + """ + Process all files of the given ``workspace``, + from the given :py:data:`input_file_grp` + to the given :py:data:`output_file_grp` + for the given :py:data:`page_id` (or all pages) + under the given :py:data:`parameter`. + + Delegates to :py:meth:`.process_workspace_submit_tasks` + and :py:meth:`.process_workspace_handle_tasks`. + + (This will iterate over pages and files, calling + :py:meth:`.process_page_file` and handling exceptions. + It should be overridden by subclasses to handle cases + like post-processing or computation across pages.) 
+ """ + with pushd_popd(workspace.directory): + self.workspace = workspace + self.verify() + try: + # set up multitasking + max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES) + if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES: + self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers) + max_workers = self.max_workers + if max_workers > 1: + assert isinstance(workspace.mets, ClientSideOcrdMets), \ + "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url" + max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT) + if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT: + self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) + max_seconds = self.max_page_seconds + + if max_workers > 1: + executor_cls = ProcessPoolExecutor + log_queue = mp.Queue() + # forward messages from log queue (in subprocesses) to all root handlers + log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True) + else: + executor_cls = DummyExecutor + log_queue = None + log_listener = None + executor = executor_cls( + max_workers=max_workers or 1, + # only forking method avoids pickling + context=mp.get_context('fork'), + # share processor instance as global to avoid pickling + initializer=_page_worker_set_ctxt, + initargs=(self, log_queue), + ) + if max_workers > 1: + log_listener.start() + try: + self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) + tasks = self.process_workspace_submit_tasks(executor, max_seconds) + stats = self.process_workspace_handle_tasks(tasks) + finally: + executor.shutdown(kill_workers=True, wait=False) + if max_workers > 1: + log_listener.stop() + + except NotImplementedError: + # fall back to deprecated method + try: + self.process() + except Exception as err: + # suppress the NotImplementedError context + raise err from None + + def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]: + """ + Look up all input files of the given ``workspace`` + from the given :py:data:`input_file_grp` + for the given :py:data:`page_id` (or all pages), + and schedules calling :py:meth:`.process_page_file` + on them for each page via `executor` (enforcing + a per-page time limit of `max_seconds`). + + When running with `OCRD_MAX_PARALLEL_PAGES>1` and + the workspace via METS Server, the executor will fork + this many worker parallel subprocesses each processing + one page at a time. (Interprocess communication is + done via task and result queues.) + + Otherwise, tasks are run sequentially in the + current process. + + Delegates to :py:meth:`.zip_input_files` to get + the input files for each page, and then calls + :py:meth:`.process_workspace_submit_page_task`. + + Returns a dict mapping the per-page tasks + (i.e. futures submitted to the executor) + to their corresponding pageId and input files. 
+ """ + tasks = {} + for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): + task, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, input_file_tuple) + tasks[task] = (page_id, input_files) + self._base_logger.debug("submitted %d processing tasks", len(tasks)) + return tasks + + def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]: + """ + Ensure all input files for a single page are + downloaded to the workspace, then schedule + :py:meth:`.process_process_file` to be run on + them via `executor` (enforcing a per-page time + limit of `max_seconds`). + + Delegates to :py:meth:`.process_page_file` + (wrapped in :py:func:`_page_worker` to share + the processor instance across forked processes). + + \b + Returns a tuple of: + - the scheduled future object, + - the corresponding pageId, + - the corresponding input files. + """ + input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if input_file) + self._base_logger.info(f"preparing page {page_id}") + for i, input_file in enumerate(input_file_tuple): + if input_file is None: + # file/page not found in this file grp + continue + input_files[i] = input_file + if not self.download: + continue + try: + input_files[i] = self.workspace.download_file(input_file) + except (ValueError, FileNotFoundError, HTTPError) as e: + self._base_logger.error(repr(e)) + self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") + # process page + #executor.submit(self.process_page_file, *input_files) + return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files + + def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]: + """ + Look up scheduled per-page futures one by one, + handle errors (exceptions) and gather results. + + \b + Enforces policies configured by the following + environment variables: + - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite) + - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy) + - `OCRD_MAX_MISSING_OUTPUTS` (abort after all). + + \b + Returns a tuple of: + - the number of successfully processed pages + - the number of failed (i.e. skipped or copied) pages + - a dict of the type and corresponding number of exceptions seen + - the number of total requested pages (i.e. success+fail+existing). + + Delegates to :py:meth:`.process_workspace_handle_page_task` + for each page. 
+ """ + # aggregate info for logging: + nr_succeeded = 0 + nr_failed = 0 + nr_errors = defaultdict(int) # count causes + if config.OCRD_MISSING_OUTPUT == 'SKIP': + reason = "skipped" + elif config.OCRD_MISSING_OUTPUT == 'COPY': + reason = "fallback-copied" + for task in tasks: + # wait for results, handle errors + page_id, input_files = tasks[task] + result = self.process_workspace_handle_page_task(page_id, input_files, task) + if isinstance(result, Exception): + nr_errors[result.__class__.__name__] += 1 + nr_failed += 1 + # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: + # already irredeemably many failures, stop short + nr_errors = dict(nr_errors) + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})") + elif result: + nr_succeeded += 1 + # else skipped - already exists + nr_errors = dict(nr_errors) + if nr_failed > 0: + nr_all = nr_succeeded + nr_failed + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})") + self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) + return nr_succeeded, nr_failed, nr_errors, len(tasks) + + def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]: + """ + \b + Await a single page result and handle errors (exceptions), + enforcing policies configured by the following + environment variables: + - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite) + - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy) + - `OCRD_MAX_MISSING_OUTPUTS` (abort after all). 
+ + \b + Returns + - true in case of success + - false in case the output already exists + - the exception in case of failure + """ + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id) + # timeout kwarg on future is useless: it only raises TimeoutError here, + # but does not stop the running process/thread, and executor itself + # offers nothing to that effect: + # task.result(timeout=max_seconds or None) + # so we instead applied the timeout within the worker function + task.result() + return True + except NotImplementedError: + # exclude NotImplementedError, so we can try process() below + raise + # handle input failures separately + except FileExistsError as err: + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + return False + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + except KeyboardInterrupt: + raise + # broad coverage of output failures (including TimeoutError) + except Exception as err: + # FIXME: add re-usable/actionable logging + if config.OCRD_MISSING_OUTPUT == 'ABORT': + self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + raise err + self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + if config.OCRD_MISSING_OUTPUT == 'SKIP': + pass + elif config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") + return err + + def _copy_page_file(self, input_file : OcrdFileType) -> None: + """ + Copy the given ``input_file`` of the :py:data:`workspace`, + representing one physical page (passed as one opened + :py:class:`~ocrd_models.OcrdFile` per input fileGrp) + and add it as if it was a processing result. + """ + input_pcgts : OcrdPage + assert isinstance(input_file, get_args(OcrdFileType)) + self._base_logger.debug(f"parsing file {input_file.ID} for page {input_file.pageId}") + try: + input_pcgts = page_from_file(input_file) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self._base_logger.error(f"non-PAGE input for page {input_file.pageId}: {err}") + return + output_file_id = make_file_id(input_file, self.output_file_grp) + input_pcgts.set_pcGtsId(output_file_id) + self.add_metadata(input_pcgts) + self.workspace.add_file( + file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(input_pcgts), + ) + + def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: + """ + Process the given ``input_files`` of the :py:data:`workspace`, + representing one physical page (passed as one opened + :py:class:`.OcrdFile` per input fileGrp) + under the given :py:data:`.parameter`, and make sure the + results get added accordingly. + + (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses + to handle cases like multiple output fileGrps, non-PAGE input etc.) 
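+
+        (For example, a hypothetical override for non-PAGE output would parse
+        the inputs itself and call ``self.workspace.add_file`` with a
+        ``mimetype`` other than ``MIMETYPE_PAGE``.)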
+ """ + input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) + assert isinstance(input_files[0], get_args(OcrdFileType)) + page_id = input_files[0].pageId + self._base_logger.info("processing page %s", page_id) + for i, input_file in enumerate(input_files): + assert isinstance(input_file, get_args(OcrdFileType)) + self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") + try: + page_ = page_from_file(input_file) + assert isinstance(page_, OcrdPage) + input_pcgts[i] = page_ + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") + output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': + # short-cut avoiding useless computation: + raise FileExistsError( + f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set" + ) + result = self.process_page_pcgts(*input_pcgts, page_id=page_id) + for image_result in result.images: + image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' + image_file_path = join(self.output_file_grp, f'{image_file_id}.png') + if isinstance(image_result.alternative_image, PageType): + # special case: not an alternative image, but replacing the original image + # (this is needed by certain processors when the original's coordinate system + # cannot or must not be kept) + image_result.alternative_image.set_imageFilename(image_file_path) + image_result.alternative_image.set_imageWidth(image_result.pil.width) + image_result.alternative_image.set_imageHeight(image_result.pil.height) + elif isinstance(image_result.alternative_image, AlternativeImageType): + image_result.alternative_image.set_filename(image_file_path) + elif image_result.alternative_image is None: + pass # do not reference in PAGE result + else: + raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type " + f"{type(image_result.alternative_image)}") + self.workspace.save_image_file( + image_result.pil, + image_file_id, + self.output_file_grp, + page_id=page_id, + file_path=image_file_path, + ) + result.pcgts.set_pcGtsId(output_file_id) + self.add_metadata(result.pcgts) + self.workspace.add_file( + file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=page_id, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(result.pcgts), + ) + + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + """ + Process the given ``input_pcgts`` of the :py:data:`.workspace`, + representing one physical page (passed as one parsed + :py:class:`.OcrdPage` per input fileGrp) + under the given :py:data:`.parameter`, and return the + resulting :py:class:`.OcrdPageResult`. + + Optionally, add to the ``images`` attribute of the resulting + :py:class:`.OcrdPageResult` instances of :py:class:`.OcrdPageResultImage`, + which have required fields for ``pil`` (:py:class:`PIL.Image` image data), + ``file_id_suffix`` (used for generating IDs of the saved image) and + ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType` + for setting the filename of the saved image). 
- def add_metadata(self, pcgts): + (This contains the main functionality and must be overridden by subclasses, + unless it does not get called by some overridden :py:meth:`.process_page_file`.) + """ + raise NotImplementedError() + + def add_metadata(self, pcgts: OcrdPage) -> None: """ Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing - the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``. + the processing step and runtime parameters to :py:class:`.OcrdPage` ``pcgts``. """ - pcgts.get_Metadata().add_MetadataItem( + metadata_obj = pcgts.get_Metadata() + assert metadata_obj is not None + metadata_obj.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=self.ocrd_tool['executable'], @@ -220,17 +880,16 @@ def add_metadata(self, pcgts): def resolve_resource(self, val): """ Resolve a resource name to an absolute file path with the algorithm in - https://ocr-d.de/en/spec/ocrd_tool#file-parameters + `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_ Args: val (string): resource value to resolve """ - initLogging() executable = self.ocrd_tool['executable'] - log = getLogger('ocrd.processor.base') if exists(val): - log.debug("Resolved to absolute path %s" % val) + self._base_logger.debug("Resolved to absolute path %s" % val) return val + # FIXME: remove once workspace arg / old_pwd is gone: if hasattr(self, 'old_pwd'): cwd = self.old_pwd else: @@ -239,11 +898,19 @@ def resolve_resource(self, val): cwd=cwd, moduled=self.moduledir) if exists(cand)] if ret: - log.debug("Resolved %s to absolute path %s" % (val, ret[0])) + self._base_logger.debug("Resolved %s to absolute path %s" % (val, ret[0])) return ret[0] raise ResourceNotFoundError(val, executable) def show_resource(self, val): + """ + Resolve a resource name to a file path with the algorithm in + `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_, + then print its contents to stdout. + + Args: + val (string): resource value to show + """ res_fname = self.resolve_resource(val) fpath = Path(res_fname) if fpath.is_dir(): @@ -308,8 +975,9 @@ def input_files(self): files for that page) - Otherwise raise an error (complaining that only PAGE-XML warrants having multiple images for a single page) - Algorithm <https://github.com/cisocrgroup/ocrd_cis/issues/57#issuecomment-656336593>_ - + + See `algorithm <https://github.com/cisocrgroup/ocrd_cis/issues/57#issuecomment-656336593>`_ + Returns: A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects. """ @@ -350,11 +1018,13 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): - if ``last``, then the last matching file for the page will be silently selected (as if the last was the only match) - if ``abort``, then an exception will be raised. + Multiple matches for PAGE-XML will always raise an exception. Keyword Args: require_first (boolean): If true, then skip a page entirely whenever it is not available in the first input `fileGrp`. + on_error (string): How to handle multiple file matches per page. mimetype (string): If not `None`, filter by the specified MIME type (literal or regex prefixed by `//`). Otherwise prefer PAGE or image. @@ -364,36 +1034,30 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): if not self.input_file_grp: raise ValueError("Processor is missing input fileGrp") - LOG = getLogger('ocrd.processor.base') ifgs = self.input_file_grp.split(",") # Iterating over all files repeatedly may seem inefficient at first sight, # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering # can actually be much more costly than traversing the ltree. # This might depend on the number of pages vs number of fileGrps.
- pages = dict() + pages = {} for i, ifg in enumerate(ifgs): files_ = sorted(self.workspace.mets.find_all_files( pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), # sort by MIME type so PAGE comes before images key=lambda file_: file_.mimetype) - # Warn if no files found but pageId was specified because that - # might be because of invalid page_id (range) - if self.page_id and not files_: - msg = (f"Could not find any files for --page-id {self.page_id} - " - f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") - if on_error == 'abort': - raise ValueError(msg) - LOG.warning(msg) for file_ in files_: if not file_.pageId: + # ignore document-global files continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: - LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) + self._base_logger.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}") # fileGrp has multiple files for this page ID if mimetype: # filter was active, this must not happen + self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': @@ -401,9 +1065,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': - raise ValueError( - "Multiple '%s' matches for page '%s' in fileGrp '%s'." % ( - mimetype, file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, mimetype) else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) elif (ift[i].mimetype == MIMETYPE_PAGE and @@ -411,11 +1073,11 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pass # keep PAGE match elif (ift[i].mimetype == MIMETYPE_PAGE and file_.mimetype == MIMETYPE_PAGE): - raise ValueError( - "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % ( - file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, None) else: # filter was inactive but no PAGE is in control, this must not happen + self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': @@ -423,21 +1085,217 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': - raise ValueError( - "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % ( - file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, None) else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: - LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) + self._base_logger.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}") ift[i] = file_ - ifts = list() + # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) + if self.page_id and not any(pages): + self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n" + f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.") + ifts = [] for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: - # other fallback options?
- LOG.error('found no page %s in file group %s', - page, ifg) + # could be from non-unique with on_error=skip or from true gap + self._base_logger.error(f'Found no file for page {page} in file group {ifg}') + if config.OCRD_MISSING_INPUT == 'abort': + raise MissingInputFile(ifg, page, mimetype) + if not any(ifiles): + # must be from non-unique with on_error=skip + self._base_logger.warning(f'Found no files for {page} - skipping') + continue if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts + +_page_worker_processor = None +""" +This global binding for the processor is required to avoid +squeezing the processor through a mp.Queue (which is impossible +due to unpicklable attributes like .workspace.mets._tree anyway) +when calling Processor.process_page_file as page worker processes +in Processor.process_workspace. Forking allows inheriting global +objects, and with the METS Server we do not mutate the local +processor instance anyway. +""" +def _page_worker_set_ctxt(processor, log_queue): + """ + Overwrites `ocrd.processor.base._page_worker_processor` instance + for sharing with subprocesses in ProcessPoolExecutor initializer. + """ + global _page_worker_processor + _page_worker_processor = processor + if log_queue: + # replace all log handlers with just one queue handler + logging.root.handlers = [logging.handlers.QueueHandler(log_queue)] + +def _page_worker(timeout, *input_files): + """ + Wraps a `Processor.process_page_file` call as payload (call target) + of the ProcessPoolExecutor workers, but also enforces the given timeout. + """ + page_id = next((file.pageId for file in input_files + if hasattr(file, 'pageId')), "") + if timeout > 0: + timer = Timer(timeout, interrupt_main) + timer.start() + try: + _page_worker_processor.process_page_file(*input_files) + _page_worker_processor.logger.debug("page worker completed for page %s", page_id) + except KeyboardInterrupt: + _page_worker_processor.logger.debug("page worker timed out for page %s", page_id) + raise TimeoutError() + finally: + if timeout > 0: + timer.cancel() + +def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): + """Generate a string describing the full CLI of this processor including params. + + Args: + ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json`` + processor_instance (object, optional): the processor implementation + (for adding any module/class/function docstrings) + subcommand (string): 'worker' or 'server' + """ + doc_help = '' + if processor_instance: + module = inspect.getmodule(processor_instance) + if module and module.__doc__: + doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n' + if processor_instance.__doc__: + doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n' + # Try to find the most concrete docstring among the various methods that an implementation + # could overload, first come first served. + # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings. + # (They are supposed to only repeat information via inspect.getdoc, rather than inherit __doc__ itself.)
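+ # e.g. a subclass that only overrides process_page_pcgts gets that method's own docstring shown, while docstrings merely inherited from Processor are skipped by the comparison below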
+ for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']: + instance_method = getattr(processor_instance, method) + superclass_method = getattr(Processor, method) + if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__: + doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n' + break + if doc_help: + doc_help = '\n\n' + wrap_text(doc_help, width=72, + initial_indent=' > ', + subsequent_indent=' > ', + preserve_paragraphs=True) + subcommands = '''\ + worker Start a processing worker rather than do local processing + server Start a processor server rather than do local processing +''' + + processing_worker_options = '''\ + --queue The RabbitMQ server address in format + "amqp://{user}:{pass}@{host}:{port}/{vhost}" + [amqp://admin:admin@localhost:5672] + --database The MongoDB server address in format + "mongodb://{host}:{port}" + [mongodb://localhost:27018] + --log-filename Filename to redirect STDOUT/STDERR to, + if specified. +''' + + processing_server_options = '''\ + --address The Processor server address in format + "{host}:{port}" + --database The MongoDB server address in format + "mongodb://{host}:{port}" + [mongodb://localhost:27018] +''' + + processing_options = '''\ + -m, --mets URL-PATH URL or file path of METS to process [./mets.xml] + -w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)] + -I, --input-file-grp USE File group(s) used as input + -O, --output-file-grp USE File group(s) used as output + -g, --page-id ID Physical page ID(s) to process instead of full document [] + --overwrite Remove existing output pages/images + (with "--page-id", remove only those). + Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE + --debug Abort on any errors with full stack trace. + Short-hand for OCRD_MISSING_OUTPUT=ABORT + --profile Enable profiling + --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile" + -p, --parameter JSON-PATH Parameters, either verbatim JSON string + or JSON file path + -P, --param-override KEY VAL Override a single JSON object key-value pair, + taking precedence over --parameter + -U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS + If URL starts with http:// start an HTTP server there, + otherwise URL is a path to an on-demand-created unix socket + -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE] + Override log level globally [INFO] + --log-filename LOG-PATH File to redirect stderr logging to (overriding ocrd_logging.conf). 
+''' + + information_options = '''\ + -C, --show-resource RESNAME Dump the content of processor resource RESNAME + -L, --list-resources List names of processor resources + -J, --dump-json Dump tool description as JSON + -D, --dump-module-dir Show the 'module' resource location path for this processor + -h, --help Show this message + -V, --version Show version +''' + + parameter_help = '' + if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']: + parameter_help = ' NONE\n' + else: + def wrap(s): + return wrap_text(s, initial_indent=' '*3, + subsequent_indent=' '*4, + width=72, preserve_paragraphs=True) + for param_name, param in ocrd_tool['parameters'].items(): + parameter_help += wrap('"%s" [%s%s]' % ( + param_name, + param['type'], + ' - REQUIRED' if 'required' in param and param['required'] else + ' - %s' % json.dumps(param['default']) if 'default' in param else '')) + parameter_help += '\n ' + wrap(param['description']) + if 'enum' in param: + parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum'])) + parameter_help += "\n" + + if not subcommand: + return f'''\ +Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS] + + {ocrd_tool['description']}{doc_help} + +Subcommands: +{subcommands} +Options for processing: +{processing_options} +Options for information: +{information_options} +Parameters: +{parameter_help} +''' + elif subcommand == 'worker': + return f'''\ +Usage: {ocrd_tool['executable']} worker [OPTIONS] + + Run {ocrd_tool['executable']} as a processing worker. + + {ocrd_tool['description']}{doc_help} + +Options: +{processing_worker_options} +''' + elif subcommand == 'server': + return f'''\ +Usage: {ocrd_tool['executable']} server [OPTIONS] + + Run {ocrd_tool['executable']} as a processor server. + + {ocrd_tool['description']}{doc_help} + +Options: +{processing_server_options} +''' + else: + pass diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json index 30a6d99fd9..ef4a4810fe 100644 --- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json +++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json @@ -1,12 +1,14 @@ { + "version": "1.0.0", + "git_url": "https://github.com/OCR-D/core", "tools": { "ocrd-dummy": { "executable": "ocrd-dummy", "description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group", "steps": ["preprocessing/optimization"], "categories": ["Image preprocessing"], - "input_file_grp": "DUMMY_INPUT", - "output_file_grp": "DUMMY_OUTPUT", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "copy_files": { "type": "boolean", diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 774332a733..72a260968f 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -1,87 +1,82 @@ # pylint: disable=missing-module-docstring,invalid-name -from os.path import join, basename +from os.path import join +from typing import Optional import click from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_models.ocrd_page import to_xml +from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models.ocrd_file import OcrdFileType +from ocrd_models.ocrd_page import OcrdPage, to_xml from ocrd_utils import ( - getLogger, - assert_file_grp_cardinality, make_file_id, MIME_TO_EXT, MIMETYPE_PAGE, parse_json_string_with_comments, - resource_string +
resource_string, + config ) from ocrd_modelfactory import page_from_file -OCRD_TOOL = parse_json_string_with_comments(resource_string(__package__ + '.dummy', 'ocrd-tool.json')) - class DummyProcessor(Processor): """ Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process(self) -> None: - LOG = getLogger('ocrd.dummy') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - copy_files = self.parameter['copy_files'] - for input_file in self.input_files: - input_file = self.workspace.download_file(input_file) + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + assert input_pcgts[0] + # nothing to do here + return OcrdPageResult(input_pcgts[0]) + + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: + input_file = input_files[0] + assert input_file + assert input_file.local_filename + if self.parameter['copy_files'] and input_file.mimetype != MIMETYPE_PAGE: + # we need to mimic the actual copying in addition to the PAGE boilerplate file_id = make_file_id(input_file, self.output_file_grp) ext = MIME_TO_EXT.get(input_file.mimetype, '') local_filename = join(self.output_file_grp, file_id + ext) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - if input_file.mimetype == MIMETYPE_PAGE: - LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) - # Source file is PAGE-XML: Write out in-memory PcGtsType - self.workspace.add_file( + self.logger.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) + with open(input_file.local_filename, 'rb') as f: + output_file = self.workspace.add_file( file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=local_filename, - content=to_xml(pcgts).encode('utf-8')) + content=f.read(), + ) + file_id = file_id + '_PAGE' + pcgts = page_from_file(output_file) + assert isinstance(pcgts, OcrdPage) + pcgts = self.process_page_pcgts(pcgts).pcgts + pcgts.set_pcGtsId(file_id) + self.add_metadata(pcgts) + self.logger.info("Add PAGE-XML %s generated for %s", file_id, output_file) + self.workspace.add_file(file_id=file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=join(self.output_file_grp, file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(pcgts), + ) + else: + if self.parameter['copy_files']: + self.logger.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) else: - # Source file is not PAGE-XML: Copy byte-by-byte unless copy_files is False - if not copy_files: - LOG.info("Not copying %s because it is not a PAGE-XML file and copy_files was false" % input_file.local_filename) - else: - LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) - with open(input_file.local_filename, 'rb') as f: - content = f.read() - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=input_file.mimetype, - local_filename=local_filename, - content=content) - if input_file.mimetype.startswith('image/'): - # write out the PAGE-XML representation for this image - page_file_id = file_id + '_PAGE' - pcgts.set_pcGtsId(page_file_id) - pcgts.get_Page().set_imageFilename(local_filename if copy_files else 
input_file.local_filename) - page_filename = join(self.output_file_grp, file_id + '.xml') - LOG.info("Add PAGE-XML %s generated for %s at %s", page_file_id, file_id, page_filename) - self.workspace.add_file( - file_id=page_file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=page_filename, - content=to_xml(pcgts).encode('utf-8')) + self.logger.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", input_file.local_filename) + # we can rely on base implementation verbatim + super().process_page_file(input_file) + @property + def metadata_filename(self): + return 'processor/builtin/dummy/ocrd-tool.json' - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dummy'] - kwargs['version'] = '0.0.3' - super(DummyProcessor, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-dummy' @click.command() @ocrd_cli_options diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index f5b6010636..757f7ac045 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -1,21 +1,19 @@ """ Helper methods for running and documenting processors """ -from os import chdir, getcwd from time import perf_counter, process_time +from os import times from functools import lru_cache import json import inspect from subprocess import run -from typing import List +from typing import List, Optional -from click import wrap_text -from ocrd.workspace import Workspace +from ..workspace import Workspace from ocrd_utils import freeze_args, getLogger, config, setOverrideLogLevel, getLevelName, sparkline __all__ = [ - 'generate_processor_help', 'run_cli', 'run_processor' ] @@ -39,10 +37,7 @@ def run_processor( log_level=None, input_file_grp=None, output_file_grp=None, - show_resource=None, - list_resources=False, parameter=None, - parameter_override=None, working_dir=None, mets_server_url=None, instance_caching=False @@ -83,9 +78,8 @@ def run_processor( log = getLogger('ocrd.processor.helpers.run_processor') log.debug("Running processor %s", processorClass) - old_cwd = getcwd() processor = get_processor( - processor_class=processorClass, + processorClass, parameter=parameter, workspace=None, page_id=page_id, @@ -93,21 +87,20 @@ def run_processor( output_file_grp=output_file_grp, instance_caching=instance_caching ) - processor.workspace = workspace - chdir(processor.workspace.directory) ocrd_tool = processor.ocrd_tool name = '%s v%s' % (ocrd_tool['executable'], processor.version) - otherrole = ocrd_tool['steps'][0] + otherrole = ocrd_tool.get('steps', [''])[0] logProfile = getLogger('ocrd.process.profile') log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() t0_cpu = process_time() + t0_os = times() if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']): backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' - from memory_profiler import memory_usage + from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel try: - mem_usage = memory_usage(proc=processor.process, + mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}), # only run process once max_iterations=1, interval=.1, timeout=None, timestamps=True, @@ -118,8 +111,6 @@ def run_processor( except Exception as err: log.exception("Failure in processor '%s'" % ocrd_tool['executable']) raise err - finally: - chdir(old_cwd) mem_usage_values = [mem for mem, _ in mem_usage] mem_output = 
'memory consumption: ' mem_output += sparkline(mem_usage_values) @@ -127,16 +118,20 @@ def run_processor( logProfile.info(mem_output) else: try: - processor.process() + processor.process_workspace(workspace) except Exception as err: log.exception("Failure in processor '%s'" % ocrd_tool['executable']) raise err - finally: - chdir(old_cwd) t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu - logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']" % ( + t1_os = times() + # add CPU time from child processes (page worker etc) + t1_cpu += t1_os.children_user - t0_os.children_user + t1_cpu += t1_os.children_system - t0_os.children_system + logProfile.info( + "Executing processor '%s' took %fs (wall) %fs (CPU) " + "[--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']", ocrd_tool['executable'], t1_wall, t1_cpu, @@ -144,7 +139,7 @@ def run_processor( processor.output_file_grp or '', json.dumps(processor.parameter) or '', processor.page_id or '' - )) + ) workspace.mets.add_agent( name=name, _type='OTHER', @@ -167,6 +162,7 @@ def run_cli( workspace=None, page_id=None, overwrite=None, + debug=None, log_level=None, log_filename=None, input_file_grp=None, @@ -209,6 +205,8 @@ def run_cli( args += ['--parameter', parameter] if overwrite: args += ['--overwrite'] + if debug: + args += ['--debug'] if mets_server_url: args += ['--mets-server-url', mets_server_url] log = getLogger('ocrd.processor.helpers.run_cli') @@ -216,151 +214,15 @@ def run_cli( if not log_filename: result = run(args, check=False) else: - with open(log_filename, 'a') as file_desc: + with open(log_filename, 'a', encoding='utf-8') as file_desc: result = run(args, check=False, stdout=file_desc, stderr=file_desc) return result.returncode -def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): - """Generate a string describing the full CLI of this processor including params. - - Args: - ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json`` - processor_instance (object, optional): the processor implementation - (for adding any module/class/function docstrings) - subcommand (string): 'worker' or 'server' - """ - doc_help = '' - if processor_instance: - module = inspect.getmodule(processor_instance) - if module and module.__doc__: - doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n' - if processor_instance.__doc__: - doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n' - if processor_instance.process.__doc__: - doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n' - if doc_help: - doc_help = '\n\n' + wrap_text(doc_help, width=72, - initial_indent=' > ', - subsequent_indent=' > ', - preserve_paragraphs=True) - subcommands = '''\ - worker Start a processing worker rather than do local processing - server Start a processor server rather than do local processing -''' - - processing_worker_options = '''\ - --queue The RabbitMQ server address in format - "amqp://{user}:{pass}@{host}:{port}/{vhost}" - [amqp://admin:admin@localhost:5672] - --database The MongoDB server address in format - "mongodb://{host}:{port}" - [mongodb://localhost:27018] - --log-filename Filename to redirect STDOUT/STDERR to, - if specified.
-''' - - processing_server_options = '''\ - --address The Processor server address in format - "{host}:{port}" - --database The MongoDB server address in format - "mongodb://{host}:{port}" - [mongodb://localhost:27018] -''' - - processing_options = '''\ - -m, --mets URL-PATH URL or file path of METS to process [./mets.xml] - -w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)] - -I, --input-file-grp USE File group(s) used as input - -O, --output-file-grp USE File group(s) used as output - -g, --page-id ID Physical page ID(s) to process instead of full document [] - --overwrite Remove existing output pages/images - (with "--page-id", remove only those) - --profile Enable profiling - --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile" - -p, --parameter JSON-PATH Parameters, either verbatim JSON string - or JSON file path - -P, --param-override KEY VAL Override a single JSON object key-value pair, - taking precedence over --parameter - -U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS - If URL starts with http:// start an HTTP server there, - otherwise URL is a path to an on-demand-created unix socket - -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE] - Override log level globally [INFO] -''' - - information_options = '''\ - -C, --show-resource RESNAME Dump the content of processor resource RESNAME - -L, --list-resources List names of processor resources - -J, --dump-json Dump tool description as JSON - -D, --dump-module-dir Show the 'module' resource location path for this processor - -h, --help Show this message - -V, --version Show version -''' - - parameter_help = '' - if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']: - parameter_help = ' NONE\n' - else: - def wrap(s): - return wrap_text(s, initial_indent=' '*3, - subsequent_indent=' '*4, - width=72, preserve_paragraphs=True) - for param_name, param in ocrd_tool['parameters'].items(): - parameter_help += wrap('"%s" [%s%s]' % ( - param_name, - param['type'], - ' - REQUIRED' if 'required' in param and param['required'] else - ' - %s' % json.dumps(param['default']) if 'default' in param else '')) - parameter_help += '\n ' + wrap(param['description']) - if 'enum' in param: - parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum'])) - parameter_help += "\n" - - if not subcommand: - return f'''\ -Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS] - {ocrd_tool['description']}{doc_help} - -Subcommands: -{subcommands} -Options for processing: -{processing_options} -Options for information: -{information_options} -Parameters: -{parameter_help} -''' - elif subcommand == 'worker': - return f'''\ -Usage: {ocrd_tool['executable']} worker [OPTIONS] - - Run {ocrd_tool['executable']} as a processing worker. - - {ocrd_tool['description']}{doc_help} - -Options: -{processing_worker_options} -''' - elif subcommand == 'server': - return f'''\ -Usage: {ocrd_tool['executable']} server [OPTIONS] - - Run {ocrd_tool['executable']} as a processor sever. - - {ocrd_tool['description']}{doc_help} - -Options: -{processing_server_options} -''' - else: - pass - - -# Taken from https://github.com/OCR-D/core/pull/884 -@freeze_args -@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE) +# not decorated here but at runtime (on first use) +#@freeze_args +#@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE) def get_cached_processor(parameter: dict, processor_class): """ Call this function to get back an instance of a processor. 
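# Sketch of the runtime decoration applied on first use (names as in the hunk below),
# which lets the cache size depend on the processor class:
#
#     maxsize = min(config.OCRD_MAX_PROCESSOR_CACHE, processor_class.max_instances)
#     get_cached_processor = freeze_args(lru_cache(maxsize=maxsize)(get_cached_processor))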
@@ -373,36 +235,42 @@ def get_cached_processor(parameter: dict, processor_class): Otherwise, an instance of the `:py:class:~ocrd.Processor` is returned. """ if processor_class: - dict_params = dict(parameter) if parameter else None - return processor_class(workspace=None, parameter=dict_params) + processor = processor_class(None, parameter=dict(parameter)) + return processor return None - def get_processor( processor_class, - parameter: dict, - workspace: Workspace = None, - page_id: str = None, - input_file_grp: List[str] = None, - output_file_grp: List[str] = None, + parameter: Optional[dict] = None, + workspace: Optional[Workspace] = None, + page_id: Optional[str] = None, + input_file_grp: Optional[List[str]] = None, + output_file_grp: Optional[List[str]] = None, instance_caching: bool = False, ): if processor_class: + if parameter is None: + parameter = {} if instance_caching: - cached_processor = get_cached_processor( - parameter=parameter, - processor_class=processor_class - ) - cached_processor.workspace = workspace - cached_processor.page_id = page_id - cached_processor.input_file_grp = input_file_grp - cached_processor.output_file_grp = output_file_grp - return cached_processor - return processor_class( - workspace=workspace, - page_id=page_id, - input_file_grp=input_file_grp, - output_file_grp=output_file_grp, - parameter=parameter - ) + global get_cached_processor + if not hasattr(get_cached_processor, '__wrapped__'): + # first call: wrap + if processor_class.max_instances < 0: + maxsize = config.OCRD_MAX_PROCESSOR_CACHE + else: + maxsize = min(config.OCRD_MAX_PROCESSOR_CACHE, processor_class.max_instances) + # wrapping in call cache + # wrapping dict into frozendict (from https://github.com/OCR-D/core/pull/884) + get_cached_processor = freeze_args(lru_cache(maxsize=maxsize)(get_cached_processor)) + processor = get_cached_processor(parameter, processor_class) + else: + # avoid passing workspace already (deprecated chdir behaviour) + processor = processor_class(None, parameter=parameter) + assert processor + # set current processing parameters + processor.workspace = workspace + processor.page_id = page_id + processor.input_file_grp = input_file_grp + processor.output_file_grp = output_file_grp + return processor raise ValueError("Processor class is not known") diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py new file mode 100644 index 0000000000..5f21a72f57 --- /dev/null +++ b/src/ocrd/processor/ocrd_page_result.py @@ -0,0 +1,17 @@ +from dataclasses import dataclass, field +from typing import List, Union, Optional +from ocrd_models.ocrd_page import OcrdPage +from PIL.Image import Image + +from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType + +@dataclass +class OcrdPageResultImage(): + pil : Image + file_id_suffix : str + alternative_image : Optional[Union[AlternativeImageType, PageType]] + +@dataclass +class OcrdPageResult(): + pcgts : OcrdPage + images : List[OcrdPageResultImage] = field(default_factory=list) diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py index 124d006927..7ed58d4d4d 100644 --- a/src/ocrd/resolver.py +++ b/src/ocrd/resolver.py @@ -18,7 +18,6 @@ ) from ocrd.workspace import Workspace from ocrd_models import OcrdMets -from ocrd_models.constants import NAMESPACES as NS from ocrd_models.utils import handle_oai_response class Resolver(): @@ -310,5 +309,3 @@ def resolve_mets_arguments(self, directory, mets_url, mets_basename=DEFAULT_METS raise ValueError("--mets '%s' has a directory part 
inconsistent with --directory '%s'" % (mets_url, directory)) return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url - - diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 44bbd081bc..3c4c603060 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -1,6 +1,6 @@ from pathlib import Path from os.path import join -from os import environ, listdir, makedirs, getcwd, path, unlink +from os import environ, listdir, getcwd, unlink from shutil import copytree, rmtree, copy from fnmatch import filter as apply_glob from datetime import datetime @@ -13,14 +13,18 @@ from gdown.download import get_url_from_gdrive_confirmation from yaml import safe_load, safe_dump +# pylint: disable=wrong-import-position + # https://github.com/OCR-D/core/issues/867 # https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml import yaml.constructor -yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \ - yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str'] +yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \ + yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str'] + +# pylint: enable=wrong-import-position from ocrd_validators import OcrdResourceListValidator -from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config +from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT @@ -248,7 +252,7 @@ def _download_impl(url, filename, progress_cb=None, size=None): if "Content-Disposition" not in r.headers: url = get_url_from_gdrive_confirmation(r.text) except RuntimeError as e: - log.warning("Cannot unwrap Google Drive URL: ", e) + log.warning("Cannot unwrap Google Drive URL: %s", e) with open(filename, 'wb') as f: with requests.get(url, stream=True) as r: r.raise_for_status() diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index ff856011be..3cbc58c78c 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1,7 +1,7 @@ import io from os import makedirs, unlink, listdir, path from pathlib import Path -from shutil import move, copyfileobj +from shutil import copyfileobj from re import sub from tempfile import NamedTemporaryFile from contextlib import contextmanager @@ -19,11 +19,13 @@ from ocrd_modelfactory import exif_from_filename, page_from_file from ocrd_utils import ( atomic_write, + config, getLogger, image_from_polygon, coordinates_of_segment, adjust_canvas_to_rotation, adjust_canvas_to_transposition, + scale_coordinates, shift_coordinates, rotate_coordinates, transform_coordinates, @@ -41,7 +43,7 @@ MIME_TO_EXT, MIME_TO_PIL, MIMETYPE_PAGE, - REGEX_PREFIX + REGEX_PREFIX, ) from .workspace_backup import WorkspaceBackupManager @@ -74,7 +76,6 @@ class Workspace(): `OcrdMets` of this workspace. If `None`, then the METS will be read from and written to the filesystem directly. baseurl (string, None) : Base URL to prefix to relative URL. 
- overwrite_mode (boolean, False) : Whether to force add operations on this workspace globally """ def __init__( @@ -90,14 +91,13 @@ def __init__( self.resolver = resolver self.directory = directory self.mets_target = str(Path(directory, mets_basename)) - self.overwrite_mode = False self.is_remote = bool(mets_server_url) if mets is None: if self.is_remote: mets = ClientSideOcrdMets(mets_server_url, self.directory) if mets.workspace_path != self.directory: - raise ValueError(f"METS server {mets_server_url} workspace directory {mets.workspace_path} differs " - f"from local workspace directory {self.directory}. These are not the same workspaces.") + raise ValueError(f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs " + f"from local workspace directory '{self.directory}'. These are not the same workspaces.") else: mets = OcrdMets(filename=self.mets_target) self.mets = mets @@ -111,7 +111,7 @@ def __init__( def __repr__(self): return 'Workspace[remote=%s, directory=%s, baseurl=%s, file_groups=%s, files=%s]' % ( - not not self.is_remote, + self.is_remote, self.directory, self.baseurl, self.mets.file_groups, @@ -122,7 +122,10 @@ def reload_mets(self): """ Reload METS from the filesystem. """ - self.mets = OcrdMets(filename=self.mets_target) + if self.is_remote: + self.mets.reload() + else: + self.mets = OcrdMets(filename=self.mets_target) @deprecated_alias(pageId="page_id") @deprecated_alias(ID="file_id") @@ -242,8 +245,6 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals """ log = getLogger('ocrd.workspace.remove_file') log.debug('Deleting mets:file %s', file_id) - if self.overwrite_mode: - force = True if isinstance(file_id, OcrdFile): file_id = file_id.ID try: @@ -295,9 +296,6 @@ def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_same_group (boolean): Remove only images in the same file group as the PAGE-XML. Has no effect unless ``page_recursive`` is `True`. """ - if not force and self.overwrite_mode: - force = True - if (not USE.startswith(REGEX_PREFIX)) and (USE not in self.mets.file_groups) and (not force): raise Exception("No such fileGrp: %s" % USE) @@ -418,8 +416,6 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.") if content is not None and not kwargs.get('local_filename'): raise Exception("'content' was set but no 'local_filename'") - if self.overwrite_mode: - kwargs['force'] = True with pushd_popd(self.directory): if kwargs.get('local_filename'): @@ -432,6 +428,8 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi kwargs["pageId"] = kwargs.pop("page_id") if "file_id" in kwargs: kwargs["ID"] = kwargs.pop("file_id") + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + kwargs["force"] = True ret = self.mets.add_file(file_grp, **kwargs) @@ -613,7 +611,6 @@ def image_from_page(self, page, page_id, Cropping uses a polygon mask (not just the bounding box rectangle). Areas outside the polygon will be filled according to ``fill``: - \b - if `"background"` (the default), then fill with the median color of the image; - else if `"none"`, then avoid masking polygons where possible @@ -635,6 +632,7 @@ def image_from_page(self, page, page_id, i.e. 
after cropping to the page's border / bounding box (if any) and deskewing with the page's orientation angle (if any) - `"angle"`: the rotation/reflection angle applied to the image so far, + - `"DPI"`: the pixel density of the original image, - `"features"`: the `AlternativeImage` `@comments` for the image, i.e. names of all applied operations that lead up to this result, * an :py:class:`ocrd_models.ocrd_exif.OcrdExif` instance associated with @@ -655,7 +653,7 @@ def image_from_page(self, page, page_id, log = getLogger('ocrd.workspace.image_from_page') page_image_info = self.resolve_image_exif(page.imageFilename) page_image = self._resolve_image_as_pil(page.imageFilename) - page_coords = dict() + page_coords = {} # use identity as initial affine coordinate transform: page_coords['transform'] = np.eye(3) # interim bbox (updated with each change to the transform): @@ -676,6 +674,13 @@ def image_from_page(self, page, page_id, page_coords['angle'] = 0 # nothing applied yet (depends on filters) log.debug("page '%s' has %s orientation=%d skew=%.2f", page_id, "border," if border else "", orientation, skew) + if page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + dpi = int(dpi) + log.debug("page '%s' images will use %d DPI from image meta-data", page_id, dpi) + page_coords['DPI'] = dpi # initialize AlternativeImage@comments classes as empty: page_coords['features'] = '' @@ -794,6 +799,11 @@ def image_from_page(self, page, page_id, 'filter="%s" in page "%s"' % ( feature_filter, page_id)) page_image.format = 'PNG' # workaround for tesserocr#194 + # ensure DPI will be set in image meta-data again + if 'DPI' in page_coords: + dpi = page_coords['DPI'] + if 'dpi' not in page_image.info: + page_image.info['dpi'] = (dpi, dpi) return page_image, page_coords, page_image_info def image_from_segment(self, segment, parent_image, parent_coords, @@ -814,6 +824,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, converts from absolute coordinates to those relative to the image, i.e. after applying all operations (starting with the original image) - `"angle"`: the rotation/reflection angle applied to the image so far, + - `"DPI"`: the pixel density of the parent image, - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e. names of all operations that lead up to this result, and Keyword Args: @@ -843,7 +854,6 @@ def image_from_segment(self, segment, parent_image, parent_coords, Cropping uses a polygon mask (not just the bounding box rectangle). Areas outside the polygon will be filled according to `fill`: - \b - if `"background"` (the default), then fill with the median color of the image; - else if `"none"`, then avoid masking polygons where possible @@ -879,6 +889,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, the segment's bounding box, and deskewing with the segment's orientation angle (if any) - `"angle"`: the rotation/reflection angle applied to the image so far, + - `"DPI"`: the pixel density of this image, - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e. names of all applied operations that lead up to this result. 
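# Sketch of the DPI propagation added above (names as in this diff): the resolution
# from the image meta-data is normalized to pixels per inch before being recorded,
# and later re-attached to derived images via image.info['dpi']:
#
#     dpi = page_image_info.resolution          # px per resolutionUnit
#     if page_image_info.resolutionUnit == 'cm':
#         dpi = round(dpi * 2.54)               # 1 inch = 2.54 cm
#     page_coords['DPI'] = int(dpi)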
@@ -941,6 +952,8 @@ def image_from_segment(self, segment, parent_image, parent_coords, orientation = 0 skew = 0 segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters) + if 'DPI' in parent_coords: + segment_coords['DPI'] = parent_coords['DPI'] # not rescaled yet # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: @@ -1048,15 +1061,21 @@ def image_from_segment(self, segment, parent_image, parent_coords, 'filter="%s" in segment "%s"' % ( feature_filter, segment.id)) segment_image.format = 'PNG' # workaround for tesserocr#194 + # ensure DPI will be set in image meta-data again + if 'DPI' in segment_coords: + dpi = segment_coords['DPI'] + if 'dpi' not in segment_image.info: + segment_image.info['dpi'] = (dpi, dpi) return segment_image, segment_coords # pylint: disable=redefined-builtin - def save_image_file(self, image, - file_id, - file_grp, - page_id=None, - mimetype='image/png', - force=False): + def save_image_file(self, image : Image.Image, + file_id : str, + file_grp : str, + file_path : Optional[str] = None, + page_id : Optional[str] = None, + mimetype : str = 'image/png', + force : bool = False) -> str: """Store an image in the filesystem and reference it as new file in the METS. Args: @@ -1064,22 +1083,26 @@ def save_image_file(self, image, file_id (string): `@ID` of the METS `file` to use file_grp (string): `@USE` of the METS `fileGrp` to use Keyword Args: + file_path (string): `@href` of the METS `file/FLocat` to use. page_id (string): `@ID` in the METS physical `structMap` to use mimetype (string): MIME type of the image format to serialize as force (boolean): whether to replace any existing `file` with that `@ID` Serialize the image into the filesystem, and add a `file` for it in the METS. - Use a filename extension based on ``mimetype``. + Use ``file_grp`` as directory and ``file_id`` concatenated with extension + based on ``mimetype`` as file name, unless directly passing ``file_path``. Returns: The (absolute) path of the created file. 
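+
+ (E.g. ``file_id='FILE_0001_BIN'`` in ``file_grp='OCR-D-BIN'`` with the default ``mimetype`` would be stored as ``OCR-D-BIN/FILE_0001_BIN.png`` - names purely illustrative.)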
""" log = getLogger('ocrd.workspace.save_image_file') - if self.overwrite_mode: - force = True + saveargs = {} + if 'dpi' in image.info: + saveargs['dpi'] = image.info['dpi'] image_bytes = io.BytesIO() - image.save(image_bytes, format=MIME_TO_PIL[mimetype]) - file_path = str(Path(file_grp, '%s%s' % (file_id, MIME_TO_EXT[mimetype]))) + image.save(image_bytes, format=MIME_TO_PIL[mimetype], **saveargs) + if file_path is None: + file_path = str(Path(file_grp, '%s%s' % (file_id, MIME_TO_EXT[mimetype]))) out = self.add_file( file_grp, file_id=file_id, @@ -1150,9 +1173,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh # Transpose in affine coordinate transform: # (consistent with image transposition or AlternativeImage below) transposition = { - 90: Image.ROTATE_90, - 180: Image.ROTATE_180, - 270: Image.ROTATE_270 + 90: Image.Transpose.ROTATE_90, + 180: Image.Transpose.ROTATE_180, + 270: Image.Transpose.ROTATE_270 }.get(orientation) # no default segment_coords['transform'] = transpose_coordinates( segment_coords['transform'], transposition, @@ -1220,5 +1243,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa segment_image = segment_image.resize((int(segment_image.width * factor), int(segment_image.height * factor)), # slowest, but highest quality: - Image.BICUBIC) + Image.Resampling.BICUBIC) return segment_image, segment_coords, segment_xywh diff --git a/src/ocrd/workspace_backup.py b/src/ocrd/workspace_backup.py index 6cc3f1530d..87ee884bd1 100644 --- a/src/ocrd/workspace_backup.py +++ b/src/ocrd/workspace_backup.py @@ -1,6 +1,6 @@ from datetime import datetime from os import makedirs -from os.path import join, basename, getsize, abspath +from os.path import join, basename, getsize from glob import glob from shutil import copy import hashlib diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index 7afc5b1765..828949fe96 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -14,9 +14,10 @@ from ocrd_utils import VERSION, MIMETYPE_PAGE, guess_media_type from ocrd_models import OcrdExif, OcrdFile, ClientSideOcrdFile from ocrd_models.ocrd_page import ( - PcGtsType, PageType, MetadataType, + OcrdPage, PcGtsType, PageType, MetadataType, parse, parseEtree ) +from ocrd_utils.deprecate import deprecation_warning __all__ = [ 'exif_from_filename', @@ -39,7 +40,7 @@ def exif_from_filename(image_filename): ocrd_exif = OcrdExif(pil_img) return ocrd_exif -def page_from_image(input_file, with_tree=False): +def page_from_image(input_file : Union[OcrdFile, ClientSideOcrdFile], **kwargs) -> OcrdPage: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` @@ -48,10 +49,9 @@ def page_from_image(input_file, with_tree=False): Arguments: input_file (:py:class:`~ocrd_models.ocrd_file.OcrdFile`): file to open \ and produce a PAGE DOM for - Keyword arguments: - with_tree (boolean): whether to return XML node tree, element-node mapping \ - and reverse mapping, too (cf. 
:py:func:`ocrd_models.ocrd_page.parseEtree`) """ + if 'with_tree' in kwargs: + deprecation_warning('kwarg "with_tree" is obsolete now, we always return OcrdPage including etree') if not input_file.local_filename: raise ValueError("input_file must have 'local_filename' property") if not Path(input_file.local_filename).exists(): raise FileNotFoundError("File not found: '%s' (%s)" % (input_file.local_filename, input_file)) @@ -72,14 +72,12 @@ def page_from_image(input_file, with_tree=False): ), pcGtsId=input_file.ID ) - if not with_tree: - return pcgts - mapping = dict() - etree = pcgts.to_etree(mapping_=mapping) + mapping = {} + etree : ET._Element = pcgts.to_etree(mapping_=mapping) revmap = dict(((node, element) for element, node in mapping.items())) - return pcgts, etree, mapping, revmap + return OcrdPage(pcgts, etree, mapping, revmap) -def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]: +def page_from_file(input_file, **kwargs) -> OcrdPage: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path @@ -88,10 +86,9 @@ def page_from_file(input_file, with_tree=False) -> Union[PcGtsT Arguments: input_file (:py:class:`~ocrd_models.ocrd_file.OcrdFile` or `str`): file to open \ and produce a PAGE DOM for - Keyword arguments: - with_tree (boolean): whether to return XML node tree, element-node mapping \ - and reverse mapping, too (cf. :py:func:`ocrd_models.ocrd_page.parseEtree`) """ + if 'with_tree' in kwargs: + deprecation_warning('kwarg "with_tree" is obsolete now, we always return OcrdPage including etree') if not isinstance(input_file, (OcrdFile, ClientSideOcrdFile)): mimetype = guess_media_type(input_file, application_xml=MIMETYPE_PAGE) input_file = OcrdFile(ET.Element("dummy"), @@ -102,7 +99,7 @@ def page_from_file(input_file, with_tree=False) -> Union[PcGtsT if not Path(input_file.local_filename).exists(): raise FileNotFoundError("File not found: '%s' (%s)" % (input_file.local_filename, input_file)) if input_file.mimetype.startswith('image'): - return page_from_image(input_file, with_tree=with_tree) + return page_from_image(input_file) if input_file.mimetype == MIMETYPE_PAGE: - return (parseEtree if with_tree else parse)(input_file.local_filename, silence=True) + return OcrdPage(*parseEtree(input_file.local_filename, silence=True)) raise ValueError("Unsupported mimetype '%s'" % input_file.mimetype) diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index a89ee1dec8..ff4e31798b 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -3,7 +3,8 @@ """ from .ocrd_agent import OcrdAgent, ClientSideOcrdAgent from .ocrd_exif import OcrdExif -from .ocrd_file import OcrdFile, ClientSideOcrdFile +from .ocrd_file import OcrdFile, ClientSideOcrdFile, OcrdFileType from .ocrd_mets import OcrdMets +from .ocrd_page import OcrdPage, OcrdPageType from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport diff --git a/src/ocrd_models/constants.py b/src/ocrd_models/constants.py index db6e51e3a2..a67bfecc13 100644 --- a/src/ocrd_models/constants.py +++ b/src/ocrd_models/constants.py @@ -44,7 +44,6 @@ 'ocrd': 'https://ocr-d.de', } -# pylint: disable=bad-whitespace TAG_METS_AGENT = '{%s}agent' % NAMESPACES['mets'] TAG_METS_DIV = '{%s}div' % NAMESPACES['mets'] TAG_METS_FILE = '{%s}file' % NAMESPACES['mets'] diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index 406e60a85a..ab050bae59 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@
-21,6 +21,7 @@ class OcrdExif(): * ``RGB`` for 24-bit truecolor, * ``I`` for 32-bit signed integer grayscale, * ``F`` for floating-point grayscale + (see PIL concept **mode**) resolution (int): pixel density xResolution (int): pixel density @@ -101,7 +102,7 @@ def to_xml(self): Serialize all properties as XML string. """ ret = '' - for k in self.__dict__: - ret += '<%s>%s' % (k, self.__dict__[k], k) + for k, v in self.__dict__.items(): + ret += f'<{k}>{v}' ret += '' return ret diff --git a/src/ocrd_models/ocrd_file.py b/src/ocrd_models/ocrd_file.py index 2315a08ff3..91eac8d8e3 100644 --- a/src/ocrd_models/ocrd_file.py +++ b/src/ocrd_models/ocrd_file.py @@ -230,12 +230,12 @@ class ClientSideOcrdFile: def __init__( self, - el, + el, # pylint: disable=unused-argument mimetype: str = '', pageId: str = '', loctype: str ='OTHER', local_filename: Optional[str] = None, - mets : Any = None, + mets : Any = None, # pylint: disable=unused-argument url: str = '', ID: str = '', fileGrp: str = '' @@ -266,3 +266,5 @@ def __str__(self): for k in ['fileGrp', 'ID', 'mimetype', 'url', 'local_filename'] ]) return '' % (props) + +OcrdFileType = Union[OcrdFile, ClientSideOcrdFile] diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index d6da3e1cda..de068567e2 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -75,7 +75,7 @@ def empty_mets(now : Optional[str] = None, cache_flag : bool = False): def __init__(self, **kwargs) -> None: """ """ - super(OcrdMets, self).__init__(**kwargs) + super().__init__(**kwargs) # XXX If the environment variable OCRD_METS_CACHING is set to "true", # then enable caching, if "false", disable caching, overriding the @@ -194,11 +194,11 @@ def unique_identifier(self, purl : str) -> None: @property def agents(self) -> List[OcrdAgent]: """ - List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent`s + List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent` entries. """ return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] - def add_agent(self, *args, **kwargs) -> OcrdAgent: + def add_agent(self, **kwargs) -> OcrdAgent: """ Add an :py:class:`ocrd_models.ocrd_agent.OcrdAgent` to the list of agents in the ``metsHdr``. """ @@ -213,12 +213,12 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent: el_agent_last.addnext(el_agent) except StopIteration: el_metsHdr.insert(0, el_agent) - return OcrdAgent(el_agent, *args, **kwargs) + return OcrdAgent(el_agent, **kwargs) @property def file_groups(self) -> List[str]: """ - List the `@USE` of all `mets:fileGrp` entries. + List the ``@USE`` of all ``mets:fileGrp`` entries. """ # WARNING: Actually we cannot return strings in place of elements! 
@@ -488,11 +488,12 @@ def add_file(self, fileGrp : str, mimetype : Optional[str] = None, url : Optiona f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate") # To get rid of Python's FutureWarning - checking if v is not None - kwargs = {k: v for k, v in locals().items() if - k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} + kwargs = {k: v for k, v in locals().items() + if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} # This separation is needed to reuse the same el_mets_file element in the caching if block el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) # The caching of the physical page is done in the OcrdFile constructor + # (which calls us back with set_physical_page_for_file) mets_file = OcrdFile(el_mets_file, mets=self, **kwargs) if self._cache_flag: @@ -542,9 +543,9 @@ def remove_one_file(self, ID : Union[str, OcrdFile], fileGrp : str = None) -> Oc # Delete the physical page ref fptrs = [] if self._cache_flag: - for page in self._fptr_cache.keys(): - if ID in self._fptr_cache[page]: - fptrs.append(self._fptr_cache[page][ID]) + for pageId, fptrdict in self._fptr_cache.items(): + if ID in fptrdict: + fptrs.append(fptrdict[ID]) else: fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS) @@ -598,7 +599,16 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI If return_divs is set, returns div memory objects instead of strings of ids """ if for_fileIds is None and for_pageIds is None: + if return_divs: + if self._cache_flag: + return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values()) + + return [x for x in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS)] + return self.physical_pages + # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] @@ -700,8 +710,8 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright ret = [None] * len(for_fileIds) if self._cache_flag: - for pageId in self._fptr_cache.keys(): - for fptr in self._fptr_cache[pageId].keys(): + for pageId, fptrdict in self._fptr_cache.items(): + for fptr in fptrdict: if fptr in for_fileIds: index = for_fileIds.index(fptr) if return_divs: @@ -737,10 +747,10 @@ def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile, # delete any existing page mapping for this file.ID fptrs = [] if self._cache_flag: - for page_id in self._fptr_cache.keys(): - if ocrd_file.ID in self._fptr_cache[page_id].keys(): - if self._fptr_cache[page_id][ocrd_file.ID] is not None: - fptrs.append(self._fptr_cache[page_id][ocrd_file.ID]) + for page, fptrdict in self._fptr_cache.items(): + if ocrd_file.ID in fptrdict: + if fptrdict[ocrd_file.ID] is not None: + fptrs.append(fptrdict[ocrd_file.ID]) else: fptrs = self._tree.getroot().findall( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % @@ -791,7 +801,7 @@ def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile, self._fptr_cache[pageId].update({ocrd_file.ID: el_fptr}) def update_physical_page_attributes(self, page_id : str, **kwargs) -> None: - invalid_keys = list(k for k in kwargs.keys() if k not in METS_PAGE_DIV_ATTRIBUTE.names()) + invalid_keys = list(k for k in kwargs if k not in 
METS_PAGE_DIV_ATTRIBUTE.names()) if invalid_keys: raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTE.names()}") @@ -812,8 +822,8 @@ def get_physical_page_for_file(self, ocrd_file : OcrdFile) -> Optional[str]: corresponding to the ``mets:file`` :py:attr:`ocrd_file`. """ if self._cache_flag: - for pageId in self._fptr_cache.keys(): - if ocrd_file.ID in self._fptr_cache[pageId].keys(): + for pageId, fptrdict in self._fptr_cache.items(): + if ocrd_file.ID in fptrdict: return pageId else: ret = self._tree.getroot().find( @@ -828,7 +838,7 @@ def remove_physical_page(self, ID : str) -> None: """ mets_div = None if self._cache_flag: - if ID in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys(): + if ID in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID]: mets_div = [self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][ID]] else: mets_div = self._tree.getroot().xpath( @@ -857,9 +867,9 @@ def remove_physical_page_fptr(self, fileId : str) -> List[str]: # If that's the case then we do not need to iterate 2 loops, just one. mets_fptrs = [] if self._cache_flag: - for page_id in self._fptr_cache.keys(): - if fileId in self._fptr_cache[page_id].keys(): - mets_fptrs.append(self._fptr_cache[page_id][fileId]) + for pageId, fptrdict in self._fptr_cache.items(): + if fileId in fptrdict: + mets_fptrs.append(fptrdict[fileId]) else: mets_fptrs = self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, @@ -894,7 +904,7 @@ def merge(self, other_mets, force : bool = False, Add all files from other_mets. Accepts the same kwargs as :py:func:`find_files` Keyword Args: - force (boolean): Whether to :py:meth:`add_file`s with force (overwriting existing ``mets:file``s) + force (boolean): Whether to do :py:meth:`add_file` with ``force`` (overwriting existing ``mets:file`` entries) fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS @@ -919,4 +929,3 @@ def merge(self, other_mets, force : bool = False, # FIXME: merge structMap logical and structLink as well if after_add_cb: after_add_cb(f_dest) - diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index b0cc2b3311..3f0cc690fa 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -2,12 +2,15 @@ API to PAGE-XML, generated with generateDS from XML schema. """ from io import StringIO +from typing import Dict, Union +from lxml import etree as ET __all__ = [ 'parse', 'parseEtree', 'parseString', 'OcrdPage', + 'OcrdPageType', "AdvertRegionType", "AlternativeImageType", @@ -174,10 +177,31 @@ """ ) -# add alias for DOM root -OcrdPage = PcGtsType - -def to_xml(el, skip_declaration=False): +class OcrdPage(): + """ + Proxy object for :py:class:`ocrd_models.PcGtsType` (i.e. PRImA PAGE-XML + for page content, rendered as object model by generateDS) that also offers access + to the underlying etree, element-node mapping and reverse mapping, too (cf. 
+ :py:func:`ocrd_models.ocrd_page.parseEtree`) + """ + def __init__( + self, + pcgts : PcGtsType, + etree : ET._Element, + mapping : Dict[str, ET._Element], + revmap : Dict[ET._Element, str], + ): + self._pcgts = pcgts + self.etree = etree + self.mapping = mapping + self.revmap = revmap + + def __getattr__(self, name): + return getattr(self._pcgts, name) + +OcrdPageType = Union[OcrdPage, PcGtsType] + +def to_xml(el, skip_declaration=False) -> str: """ Serialize ``pc:PcGts`` document as string. """ diff --git a/src/ocrd_models/ocrd_page_generateds.py b/src/ocrd_models/ocrd_page_generateds.py index 6fef4c8635..f2b7c0551e 100644 --- a/src/ocrd_models/ocrd_page_generateds.py +++ b/src/ocrd_models/ocrd_page_generateds.py @@ -2,30 +2,28 @@ # -*- coding: utf-8 -*- # -# Generated Wed Nov 3 12:30:32 2021 by generateDS.py version 2.35.20. -# Python 3.6.9 (default, Jan 26 2021, 15:33:00) [GCC 8.4.0] +# Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20. +# Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0] # # Command line options: # ('-f', '') # ('--root-element', 'PcGts') -# ('-o', 'ocrd_models/ocrd_models/ocrd_page_generateds.py') +# ('-o', 'src/ocrd_models/ocrd_page_generateds.py') # ('--silence', '') # ('--export', 'write etree') # ('--disable-generatedssuper-lookup', '') -# ('--user-methods', 'ocrd_models/ocrd_page_user_methods.py') +# ('--user-methods', 'src/ocrd_page_user_methods.py') # # Command line arguments: -# ocrd_validators/ocrd_validators/page.xsd +# src/ocrd_validators/page.xsd # # Command line: -# /home/kba/monorepo/ocrd_all/venv/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" ocrd_validators/ocrd_validators/page.xsd +# /data/ocr-d/ocrd_all/venv38/bin/generateDS -f --root-element="PcGts" -o "src/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="src/ocrd_page_user_methods.py" src/ocrd_validators/page.xsd # # Current working directory (os.getcwd()): # core # -# type: ignore - from itertools import zip_longest import os import sys @@ -223,7 +221,7 @@ def gds_validate_integer_list( try: int(value) except (TypeError, ValueError): - raise_parse_error(node, 'Requires sequence of integer values') + raise_parse_error(node, 'Requires sequence of integer valuess') return values def gds_format_float(self, input_data, input_name=''): return ('%.15f' % input_data).rstrip('0') @@ -1230,9 +1228,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True): """ Get all the ``pc:AlternativeImage/@filename`` paths referenced in the PAGE-XML document. 
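# ---------------------------------------------------------------------------
# Editor's note (not part of the patch): a quick sketch of the new fallback
# for the generated id property. make_xml_id (cf. the ocrd_utils/str.py hunk
# further down) maps ':' and '/' to '_', prefixes a leading non-letter with
# 'id_', and strips remaining characters not valid in an xml:id. The example
# filenames here are made up.
from ocrd_utils import make_xml_id

assert make_xml_id('OCR-D-IMG/INPUT_0017.tif') == 'OCR-D-IMG_INPUT_0017.tif'
assert make_xml_id('17/page.png') == 'id_17_page.png'
# ---------------------------------------------------------------------------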
@@ -3116,9 +3115,10 @@ def __hash__(self):
         return hash(self.id)
     @property
     def id(self):
+        from ocrd_utils import make_xml_id
         if hasattr(self, 'pcGtsId'):
             return self.pcGtsId or ''
-        return self.imageFilename
+        return make_xml_id(self.imageFilename)
     # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
     def _region_class(self, x): # pylint: disable=unused-argument
         return x.__class__.__name__.replace('RegionType', '')
@@ -3314,6 +3314,39 @@ def get_AllTextLines(self, region_order='document', respect_textline_order=True)
             ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines))
         return ret
+    def get_ReadingOrderGroups(self) -> dict:
+        """
+        Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef
+        (i.e. segment `@id`) to its referring group object (i.e. one of
+
+        \b
+        - :py:class:`.RegionRefType`
+        - :py:class:`.RegionRefIndexedType`
+        - :py:class:`.OrderedGroupType`
+        - :py:class:`.OrderedGroupIndexedType`
+        - :py:class:`.UnorderedGroupType`
+        - :py:class:`.UnorderedGroupIndexedType`
+        """
+        def get_groupdict(group):
+            regionrefs = list()
+            if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)):
+                regionrefs = (group.get_RegionRefIndexed() +
+                              group.get_OrderedGroupIndexed() +
+                              group.get_UnorderedGroupIndexed())
+            if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)):
+                regionrefs = (group.get_RegionRef() +
+                              group.get_OrderedGroup() +
+                              group.get_UnorderedGroup())
+            refdict = {}
+            for elem in regionrefs:
+                refdict[elem.get_regionRef()] = elem
+                if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
+                    refdict = {**refdict, **get_groupdict(elem)}
+            return refdict
+        ro = self.get_ReadingOrder()
+        if ro is None:
+            return {}
+        return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup())
     def set_orientation(self, orientation):
         """
         Set deskewing angle to given `orientation` number.
diff --git a/src/ocrd_models/ocrd_xml_base.py b/src/ocrd_models/ocrd_xml_base.py
index 8579a5b407..ea4798c5b9 100644
--- a/src/ocrd_models/ocrd_xml_base.py
+++ b/src/ocrd_models/ocrd_xml_base.py
@@ -8,8 +8,8 @@
 from .utils import xmllint_format
 
-for curie in NAMESPACES:
-    ET.register_namespace(curie, NAMESPACES[curie])
+for curie, url in NAMESPACES.items():
+    ET.register_namespace(curie, url)
 
 class OcrdXmlDocument():
     """
diff --git a/src/ocrd_page_user_methods.py b/src/ocrd_page_user_methods.py
index 8a2332e6e5..fe22dd89ab 100644
--- a/src/ocrd_page_user_methods.py
+++ b/src/ocrd_page_user_methods.py
@@ -116,6 +116,7 @@ def _add_method(class_re, method_name, file_name=None):
     _add_method(r'^(PageType)$', 'set_Border'),
     _add_method(r'^(CoordsType)$', 'set_points'),
     _add_method(r'^(PageType)$', 'get_AllTextLines'),
+    _add_method(r'^(PageType)$', 'get_ReadingOrderGroups'),
     # for some reason, pagecontent.xsd does not declare @orientation at the abstract/base RegionType:
     _add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType|ImageRegionType|TextRegionType)$', 'set_orientation'),
 )
diff --git a/src/ocrd_page_user_methods/get_ReadingOrderGroups.py b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py
new file mode 100644
index 0000000000..e7d6c02b77
--- /dev/null
+++ b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py
@@ -0,0 +1,33 @@
+def get_ReadingOrderGroups(self) -> dict:
+    """
+    Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef
+    (i.e. segment `@id`) to its referring group object (i.e. one of
+
+    \b
+    - :py:class:`.RegionRefType`
+    - :py:class:`.RegionRefIndexedType`
+    - :py:class:`.OrderedGroupType`
+    - :py:class:`.OrderedGroupIndexedType`
+    - :py:class:`.UnorderedGroupType`
+    - :py:class:`.UnorderedGroupIndexedType`
+    """
+    def get_groupdict(group):
+        regionrefs = list()
+        if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)):
+            regionrefs = (group.get_RegionRefIndexed() +
+                          group.get_OrderedGroupIndexed() +
+                          group.get_UnorderedGroupIndexed())
+        if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)):
+            regionrefs = (group.get_RegionRef() +
+                          group.get_OrderedGroup() +
+                          group.get_UnorderedGroup())
+        refdict = {}
+        for elem in regionrefs:
+            refdict[elem.get_regionRef()] = elem
+            if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
+                refdict = {**refdict, **get_groupdict(elem)}
+        return refdict
+    ro = self.get_ReadingOrder()
+    if ro is None:
+        return {}
+    return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup())
diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py
index b5bbcae121..c853a34bd3 100644
--- a/src/ocrd_utils/__init__.py
+++ b/src/ocrd_utils/__init__.py
@@ -13,6 +13,7 @@
 :py:meth:`ocrd.workspace.Workspace.image_from_segment`.)
 
 * :py:func:`rotate_coordinates`,
+  :py:func:`scale_coordinates`,
   :py:func:`shift_coordinates`,
   :py:func:`transpose_coordinates`,
   :py:func:`transform_coordinates`
@@ -74,6 +75,7 @@
 :py:func:`concat_padded`,
 :py:func:`nth_url_segment`,
 :py:func:`remove_non_path_from_url`,
+:py:func:`parse_json_file_with_comments`,
 :py:func:`parse_json_string_with_comments`,
 :py:func:`parse_json_string_or_file`,
 :py:func:`set_json_key_value_overrides`,
@@ -148,6 +150,7 @@
     polygon_mask,
     rotate_coordinates,
     rotate_image,
+    scale_coordinates,
     shift_coordinates,
     transform_coordinates,
     transpose_coordinates,
@@ -202,6 +205,7 @@
     make_xml_id,
     nth_url_segment,
     partition_list,
+    parse_json_file_with_comments,
     parse_json_string_or_file,
     parse_json_string_with_comments,
     sparkline,
diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py
index 063af930c8..36399870e2 100644
--- a/src/ocrd_utils/config.py
+++ b/src/ocrd_utils/config.py
@@ -13,6 +13,12 @@
 from textwrap import fill, indent
 
+def _validator_boolean(val):
+    return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1')
+
+def _parser_boolean(val):
+    return bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')
+
 class OcrdEnvVariable():
 
     def __init__(self, name, description, parser=str, validator=lambda val: True, default=[False, None]):
@@ -60,7 +66,11 @@ def __init__(self):
         self._variables = {}
 
     def add(self, name, *args, **kwargs):
-        self._variables[name] = OcrdEnvVariable(name, *args, **kwargs)
+        var = OcrdEnvVariable(name, *args, **kwargs)
+        # make visible in ocrd_utils.config docstring (apidoc)
+        txt = var.describe(wrap_text=False, indent_text=True)
+        globals()['__doc__'] += "\n\n - " + txt + "\n\n"
+        self._variables[name] = var
         return self._variables[name]
 
     def has_default(self, name):
@@ -68,14 +78,26 @@ def has_default(self, name):
             raise ValueError(f"Unregistered env variable {name}")
         return self._variables[name].has_default
 
+    def reset_defaults(self):
+        for name in self._variables:
+            try:
+                # we cannot use hasattr, because that delegates to getattr,
+                # which we override and provide defaults for (which of course
+                # cannot be removed)
+                if self.__getattribute__(name):
+                    delattr(self, name)
+            except AttributeError:
+                pass
+
     def describe(self,
name, *args, **kwargs): if not name in self._variables: raise ValueError(f"Unregistered env variable {name}") return self._variables[name].describe(*args, **kwargs) def __getattr__(self, name): + # will be called if name is not accessible (has not been added directly yet) if not name in self._variables: - raise ValueError(f"Unregistered env variable {name}") + raise AttributeError(f"Unregistered env variable {name}") var_obj = self._variables[name] try: raw_value = self.raw_value(name) @@ -102,21 +124,33 @@ def raw_value(self, name): config.add('OCRD_METS_CACHING', description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.', - validator=lambda val: val in ('true', 'false', '0', '1'), - parser=lambda val: val in ('true', '1')) + validator=_validator_boolean, + parser=_parser_boolean) config.add('OCRD_MAX_PROCESSOR_CACHE', description="Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.", parser=int, default=(True, 128)) +config.add('OCRD_MAX_PARALLEL_PAGES', + description="Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set >1, then a METS Server must be used for METS synchronisation.", + parser=int, + default=(True, 1)) + +config.add('OCRD_PROCESSING_PAGE_TIMEOUT', + description="Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.", + parser=int, + default=(True, 0)) + config.add("OCRD_PROFILE", description="""\ Whether to enable gathering runtime statistics on the `ocrd.profile` logger (comma-separated): + - `CPU`: yields CPU and wall-time, - `RSS`: also yields peak memory (resident set size) - `PSS`: also yields peak memory (proportional set size) + """, validator=lambda val : all(t in ('', 'CPU', 'RSS', 'PSS') for t in val.split(',')), default=(True, '')) @@ -125,7 +159,7 @@ def raw_value(self, name): description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz") config.add("OCRD_DOWNLOAD_RETRIES", - description="Number of times to retry failed attempts for downloads of workspace files.", + description="Number of times to retry failed attempts for downloads of resources or workspace files.", validator=int, parser=int) @@ -141,6 +175,55 @@ def _ocrd_download_timeout_parser(val): description="Timeout in seconds for connecting or reading (comma-separated) when downloading.", parser=_ocrd_download_timeout_parser) +config.add("OCRD_DOWNLOAD_INPUT", + description="Whether to download files not present locally during processing", + default=(True, True), + validator=_validator_boolean, + parser=_parser_boolean) + +config.add("OCRD_MISSING_INPUT", + description="""\ +How to deal with missing input files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed with next page's input + - `ABORT`: throw :py:class:`.MissingInputFile` + +""", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'ABORT'], + parser=str) + +config.add("OCRD_MISSING_OUTPUT", + description="""\ +How to deal with missing output files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed processing next page + - `COPY`: fall back to copying input PAGE to output fileGrp for page + - `ABORT`: re-throw whatever caused processing to fail + +""", + default=(True, 
'SKIP'), + validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], + parser=str) + +config.add("OCRD_MAX_MISSING_OUTPUTS", + description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).", + default=(True, 0.1), + parser=float) + +config.add("OCRD_EXISTING_OUTPUT", + description="""\ +How to deal with already existing output files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed processing next page + - `OVERWRITE`: force writing result to output fileGrp for page + - `ABORT`: re-throw :py:class:`FileExistsError` + +""", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], + parser=str) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) @@ -200,5 +283,5 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_LOGGING_DEBUG", description="Print information about the logging setup to STDERR", default=(True, False), - validator=lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1'), - parser=lambda val: val if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')) + validator=_validator_boolean, + parser=_parser_boolean) diff --git a/src/ocrd_utils/image.py b/src/ocrd_utils/image.py index 3bc14e6612..6f2524608c 100644 --- a/src/ocrd_utils/image.py +++ b/src/ocrd_utils/image.py @@ -65,10 +65,10 @@ def adjust_canvas_to_transposition(size, method): Return a numpy array of the enlarged width and height. """ - if method in [Image.ROTATE_90, - Image.ROTATE_270, - Image.TRANSPOSE, - Image.TRANSVERSE]: + if method in [Image.Transpose.ROTATE_90, + Image.Transpose.ROTATE_270, + Image.Transpose.TRANSPOSE, + Image.Transpose.TRANSVERSE]: size = size[::-1] return size @@ -348,26 +348,26 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): calculate the affine coordinate transform corresponding to the composition of both transformations, which is respectively: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: entails translation to the center, followed by pure reflection about the y-axis, and subsequent translation back - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: entails translation to the center, followed by pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: entails translation to the center, followed by pure reflection about the origin, and subsequent translation back - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: entails translation to the center, followed by pure rotation by 90° counter-clockwise, and subsequent translation back - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: entails translation to the center, followed by pure rotation by 270° counter-clockwise, and subsequent translation back - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the y-axis, and subsequent translation back @@ -388,13 +388,13 @@ def transpose_coordinates(transform, 
method, orig=np.array([0, 0])): [0, 0, 1]]) transform = shift_coordinates(transform, -orig) operations = { - Image.FLIP_LEFT_RIGHT: [refly], - Image.FLIP_TOP_BOTTOM: [reflx], - Image.ROTATE_180: [reflx, refly], - Image.ROTATE_90: [rot90], - Image.ROTATE_270: [rot90, reflx, refly], - Image.TRANSPOSE: [rot90, reflx], - Image.TRANSVERSE: [rot90, refly] + Image.Transpose.FLIP_LEFT_RIGHT: [refly], + Image.Transpose.FLIP_TOP_BOTTOM: [reflx], + Image.Transpose.ROTATE_180: [reflx, refly], + Image.Transpose.ROTATE_90: [rot90], + Image.Transpose.ROTATE_270: [rot90, reflx, refly], + Image.Transpose.TRANSPOSE: [rot90, reflx], + Image.Transpose.TRANSVERSE: [rot90, refly] }.get(method) # no default for operation in operations: transform = np.dot(operation, transform) @@ -411,29 +411,29 @@ def transpose_image(image, method): Given a PIL.Image ``image`` and a transposition mode ``method``, apply the respective operation: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: all pixels get mirrored at half the width of the image - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: all pixels get mirrored at half the height of the image - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: all pixels get mirrored at both, the width and half the height of the image, i.e. the image gets rotated by 180° counter-clockwise - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: rows become columns (but counted from the right) and columns become rows, i.e. the image gets rotated by 90° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: rows become columns and columns become rows (but counted from the bottom), i.e. the image gets rotated by 270° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: rows become columns and vice versa, i.e. all pixels get mirrored at the main diagonal; width becomes height and vice versa - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: rows become columns (but counted from the right) and columns become rows (but counted from the bottom), i.e. all pixels get mirrored at the opposite diagonal; diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index bb771fc0ce..52b01883f1 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -5,9 +5,9 @@ Logging can be overridden either programmatically in code using the library or by creating one or more of -- /etc/ocrd_logging.py -- $HOME/ocrd_logging.py -- $PWD/ocrd_logging.py +- ``/etc/ocrd_logging.py`` +- ``$HOME/ocrd_logging.py`` +- ``$PWD/ocrd_logging.py`` These files will be executed in the context of ocrd/ocrd_logging.py, with `logging` global set. @@ -16,20 +16,18 @@ - Try to be less intrusive with OCR-D specific logging conventions to make it easier and less surprising to define logging behavior when using OCR-D/core as a library - - Change setOverrideLogLevel to only override the log level of the ``ocrd`` + - Change :py:meth:`setOverrideLogLevel` to only override the log level of the ``ocrd`` logger and its descendants - - initLogging will set exactly one handler, for the root logger or for the + - :py:meth:`initLogging` will set exactly one handler, for the root logger or for the ``ocrd`` logger. 
- Child loggers should propagate to the ancestor logging (default - behavior of the logging library - no more PropagationShyLogger) - - disableLogging only removes any handlers from the ``ocrd`` logger + behavior of the logging library - no more ``PropagationShyLogger``) + - :py:meth:`disableLogging` only removes any handlers from the ``ocrd`` logger """ # pylint: disable=no-member from __future__ import absolute_import -from traceback import format_stack - import logging import logging.config from pathlib import Path @@ -48,13 +46,8 @@ 'setOverrideLogLevel', ] -# These are the loggers we add handlers to -ROOT_OCRD_LOGGERS = [ - 'ocrd', - 'ocrd_network' -] - LOGGING_DEFAULTS = { + '': logging.WARNING, 'ocrd': logging.INFO, 'ocrd_network': logging.INFO, # 'ocrd.resolver': logging.INFO, @@ -81,10 +74,10 @@ def tf_disable_interactive_logs(): try: - from os import environ + from os import environ # pylint: disable=import-outside-toplevel # This env variable must be set before importing from Keras environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - from tensorflow.keras.utils import disable_interactive_logging + from tensorflow.keras.utils import disable_interactive_logging # pylint: disable=import-outside-toplevel # Enabled interactive logging throws an exception # due to a call of sys.stdout.flush() disable_interactive_logging() @@ -115,18 +108,15 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG): lvl (string): Log level name. silent (boolean): Whether to log the override call """ - if not _initialized_flag: - initLogging(silent=silent) - ocrd_logger = logging.getLogger('ocrd') - - if lvl is None: - if not silent: - print('[LOGGING] Reset log level override', file=sys.stderr) - ocrd_logger.setLevel(logging.NOTSET) - else: - if not silent: - print(f'[LOGGING] Overriding ocrd log level to {lvl}', file=sys.stderr) - ocrd_logger.setLevel(lvl) + if lvl is not None: + lvl = getLevelName(lvl) + if not _initialized_flag: + initLogging(silent=silent) + # affect all configured loggers + for logger_name in logging.root.manager.loggerDict: + if not silent: + print(f'[LOGGING] Overriding {logger_name} log level to {lvl}', file=sys.stderr) + logging.getLogger(logger_name).setLevel(lvl) def get_logging_config_files(): """ @@ -143,37 +133,28 @@ def get_logging_config_files(): def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_LOGGING_DEBUG): """ - Reset ``ocrd`` logger, read logging configuration if exists, otherwise use basicConfig + Reset ``ocrd`` logger, read logging configuration if exists, otherwise use :py:meth:`logging.basicConfig` - initLogging is to be called by OCR-D/core once, i.e. + This is to be called by OCR-D/core only once, i.e. - for the ``ocrd`` CLI - for the processor wrapper methods Other processes that use OCR-D/core as a library can, but do not have to, use this functionality. Keyword Args: - - builtin_only (bool, False): Whether to search for logging configuration - on-disk (``False``) or only use the - hard-coded config (``True``). For testing - - force_reinit (bool, False): Whether to ignore the module-level - ``_initialized_flag``. For testing only. - - silent (bool, True): Whether to log logging behavior by printing to stderr + - builtin_only (bool): Whether to search for logging configuration + on-disk (``False``) or only use the hard-coded config (``True``). + For testing + - force_reinit (bool): Whether to ignore the module-level ``_initialized_flag``. 
+ For testing only + - silent (bool): Whether to log logging behavior by printing to stderr """ global _initialized_flag - if _initialized_flag and not force_reinit: - return - # disableLogging() - - # https://docs.python.org/3/library/logging.html#logging.disable - # If logging.disable(logging.NOTSET) is called, it effectively removes this - # overriding level, so that logging output again depends on the effective - # levels of individual loggers. - logging.disable(logging.NOTSET) - - # remove all handlers for the ocrd root loggers - for logger_name in ROOT_OCRD_LOGGERS: - for handler in logging.getLogger(logger_name).handlers[:]: - logging.getLogger(logger_name).removeHandler(handler) + if _initialized_flag: + if force_reinit: + disableLogging(silent=silent) + else: + return config_file = None if not builtin_only: @@ -192,8 +173,8 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L ocrd_handler = logging.StreamHandler(stream=sys.stderr) ocrd_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT)) ocrd_handler.setLevel(logging.DEBUG) - for logger_name in ROOT_OCRD_LOGGERS: - logging.getLogger(logger_name).addHandler(ocrd_handler) + root_logger = logging.getLogger('') + root_logger.addHandler(ocrd_handler) for logger_name, logger_level in LOGGING_DEFAULTS.items(): logging.getLogger(logger_name).setLevel(logger_level) _initialized_flag = True @@ -209,22 +190,16 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): if _initialized_flag and not silent: print("[LOGGING] Disabling logging", file=sys.stderr) _initialized_flag = False - # logging.basicConfig(level=logging.CRITICAL) - # logging.disable(logging.ERROR) - # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS: - for handler in logging.getLogger(logger_name).handlers[:]: - logging.getLogger(logger_name).removeHandler(handler) - for logger_name in LOGGING_DEFAULTS: - logging.getLogger(logger_name).setLevel(logging.NOTSET) - -# Initializing stream handlers at module level -# would cause message output in all runtime contexts, -# including those which are already run for std output -# (--dump-json, --version, ocrd-tool, bashlib etc). 
-# So this needs to be an opt-in from the CLIs/decorators: -#initLogging() -# Also, we even have to block log output for libraries -# (like matplotlib/tensorflow) which set up logging -# themselves already: -disableLogging() + # remove all handlers we might have added (via initLogging on builtin or file config) + for logger_name in logging.root.manager.loggerDict: + if not silent: + print(f'[LOGGING] Resetting {logger_name} log level and handlers') + logger = logging.getLogger(logger_name) + logger.setLevel(logging.NOTSET) + for handler in logger.handlers[:]: + logger.removeHandler(handler) + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + # Python default log level is WARNING + logging.root.setLevel(logging.WARNING) + diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 5cf161398e..41e6d5af7a 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -34,7 +34,7 @@ keys=defaultFormatter,detailedFormatter # default logger "root" using consoleHandler # [logger_root] -level=INFO +level=WARNING handlers=consoleHandler,fileHandler @@ -56,22 +56,22 @@ handlers=consoleHandler,fileHandler # ocrd loggers [logger_ocrd] level=INFO -handlers=consoleHandler,fileHandler +handlers= qualname=ocrd -propagate=0 [logger_ocrd_network] level=INFO -handlers=consoleHandler,processingServerHandler +#handlers=consoleHandler,processingServerHandler +handlers=processingServerHandler qualname=ocrd_network -propagate=0 +#propagate=0 # # logger tensorflow # [logger_ocrd_tensorflow] level=ERROR -handlers=consoleHandler +handlers= qualname=tensorflow # @@ -79,7 +79,7 @@ qualname=tensorflow # [logger_ocrd_shapely_geos] level=ERROR -handlers=consoleHandler +handlers= qualname=shapely.geos @@ -88,7 +88,7 @@ qualname=shapely.geos # [logger_ocrd_PIL] level=INFO -handlers=consoleHandler +handlers= qualname=PIL # @@ -96,34 +96,32 @@ qualname=PIL # [logger_paramiko] level=INFO -handlers=consoleHandler +handlers= qualname=paramiko -propagate=0 [logger_paramiko_transport] level=INFO -handlers=consoleHandler +handlers= qualname=paramiko.transport -propagate=0 # # uvicorn loggers # [logger_uvicorn] level=INFO -handlers=consoleHandler +handlers= qualname=uvicorn [logger_uvicorn_access] level=WARN -handlers=consoleHandler +handlers= qualname=uvicorn.access [logger_uvicorn_error] level=INFO -handlers=consoleHandler +handlers= qualname=uvicorn.error [logger_multipart] level=INFO -handlers=consoleHandler +handlers= qualname=multipart diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py index 18463de0c0..70721acbe3 100644 --- a/src/ocrd_utils/os.py +++ b/src/ocrd_utils/os.py @@ -71,9 +71,8 @@ def unzip_file_to_dir(path_to_zip, output_directory): """ Extract a ZIP archive to a directory """ - z = ZipFile(path_to_zip, 'r') - z.extractall(output_directory) - z.close() + with ZipFile(path_to_zip, 'r') as z: + z.extractall(output_directory) @lru_cache() def get_ocrd_tool_json(executable): @@ -87,7 +86,7 @@ def get_ocrd_tool_json(executable): ocrd_tool = ocrd_all_tool[executable] except (JSONDecodeError, OSError, KeyError): try: - ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) + ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE, check=False).stdout) except (JSONDecodeError, OSError) as e: getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') if 'resource_locations' not in ocrd_tool: @@ -102,7 +101,7 @@ def get_moduledir(executable): moduledir = 
ocrd_all_moduledir[executable] except (JSONDecodeError, OSError, KeyError): try: - moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n') + moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE, check=False).stdout.rstrip('\n') except (JSONDecodeError, OSError) as e: getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') return moduledir diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index dea3715bf4..13d03cc5b8 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -4,9 +4,10 @@ import re import json -from typing import List, Union +from typing import List from .constants import REGEX_FILE_ID, SPARKLINE_CHARS -from .deprecate import deprecation_warning +#from .deprecate import deprecation_warning +from deprecated import deprecated from warnings import warn from numpy import array_split @@ -20,6 +21,7 @@ 'make_file_id', 'make_xml_id', 'nth_url_segment', + 'parse_json_file_with_comments', 'parse_json_string_or_file', 'parse_json_string_with_comments', 'remove_non_path_from_url', @@ -27,6 +29,7 @@ ] +@deprecated(version='3.0', reason='specify input and output file_grp_cardinality in ocrd-tool.json instead') def assert_file_grp_cardinality(grps, n, msg=None): """ Assert that a string of comma-separated fileGrps contains exactly ``n`` entries. @@ -105,10 +108,11 @@ def make_xml_id(idstr: str) -> str: ret = idstr if not REGEX_FILE_ID.fullmatch(ret): ret = ret.replace(':', '_') + ret = ret.replace('/', '_') ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret) ret = re.sub(r'[^\w.-]', r'', ret) return ret - + def nth_url_segment(url, n=-1): """ Return the last /-delimited segment of a URL-like string @@ -160,6 +164,13 @@ def is_string(val): return isinstance(val, str) +def parse_json_file_with_comments(val): + """ + Parse a file of JSON interspersed with #-prefixed full-line comments + """ + with open(val, 'r', encoding='utf-8') as inputf: + return parse_json_string_with_comments(inputf.read()) + def parse_json_string_with_comments(val): """ Parse a string of JSON interspersed with #-prefixed full-line comments @@ -263,4 +274,3 @@ def sparkline(values : List[int]) -> str: # normalize to 0..1 and convert to index in SPARKLINE_CHARS mapped = [int(x / max_value * max_mapping) for x in values] return ''.join(SPARKLINE_CHARS[x] for x in mapped) - diff --git a/src/ocrd_validators/json_validator.py b/src/ocrd_validators/json_validator.py index c920fc7c2d..f21a23afee 100644 --- a/src/ocrd_validators/json_validator.py +++ b/src/ocrd_validators/json_validator.py @@ -3,10 +3,13 @@ """ import json -from jsonschema import Draft6Validator, validators # pylint: disable=import-error +from jsonschema import Draft201909Validator, ValidationError, validators # pylint: disable=import-error from ocrd_models import ValidationReport +class JsonSchemaDeprecationWarning(ValidationError): + pass + # http://python-jsonschema.readthedocs.io/en/latest/faq/ def extend_with_default(validator_class): """ @@ -14,21 +17,22 @@ def extend_with_default(validator_class): """ validate_properties = validator_class.VALIDATORS["properties"] - def set_defaults(validator, properties, instance, schema): + def set_defaults_and_handle_deprecate(validator, properties, instance, schema): """ Set defaults in subschemas """ for prop, subschema in properties.items(): if "default" in subschema: instance.setdefault(prop, subschema["default"]) + if subschema.get('deprecated', False) and instance.get(prop): + yield 
JsonSchemaDeprecationWarning(f"Property {prop} has been deprecated, ocrd-tool.json should be updated.")
 
-        for error in validate_properties(validator, properties, instance, schema):
-            yield error
+        yield from validate_properties(validator, properties, instance, schema)
 
-    return validators.extend(validator_class, {"properties": set_defaults})
+    return validators.extend(validator_class, {"properties": set_defaults_and_handle_deprecate})
 
-DefaultValidatingDraft6Validator = extend_with_default(Draft6Validator)
+DefaultValidatingDraft20199Validator = extend_with_default(Draft201909Validator)
 
 #
 # -------------------------------------------------
@@ -52,13 +56,13 @@ def validate(obj, schema):
             obj = json.loads(obj)
         return JsonValidator(schema)._validate(obj)  # pylint: disable=protected-access
 
-    def __init__(self, schema, validator_class=Draft6Validator):
+    def __init__(self, schema, validator_class=Draft201909Validator):
         """
         Construct a JsonValidator.
 
         Args:
             schema (dict):
-            validator_class (Draft6Validator|DefaultValidatingDraft6Validator):
+            validator_class (Draft201909Validator|DefaultValidatingDraft20199Validator):
         """
         self.validator = validator_class(schema)
@@ -74,6 +78,7 @@ def _validate(self, obj):
         report = ValidationReport()
         if not self.validator.is_valid(obj):
             for v in self.validator.iter_errors(obj):
+                meth = f'add_{"warning" if isinstance(v, JsonSchemaDeprecationWarning) else "error"}'
                 # print(">>>>>>>>> v='%s', obj='%s'" % (v, obj))
-                report.add_error("[%s] %s" % ('.'.join(str(vv) for vv in v.path), v.message))
+                getattr(report, meth)("[%s] %s" % ('.'.join(str(vv) for vv in v.path), v.message))
         return report
diff --git a/src/ocrd_validators/ocrd_tool.schema.yml b/src/ocrd_validators/ocrd_tool.schema.yml
index 766fd892cc..bdf834b6a6 100644
--- a/src/ocrd_validators/ocrd_tool.schema.yml
+++ b/src/ocrd_validators/ocrd_tool.schema.yml
@@ -11,7 +11,7 @@ properties:
     type: string
     pattern: '^[0-9]+\.[0-9]+\.[0-9]+$'
   git_url:
-    description: Github/Gitlab URL
+    description: GitHub/GitLab URL
     type: string
     format: url
   dockerhub:
@@ -29,28 +29,54 @@ properties:
       - steps
       - executable
       - categories
-      - input_file_grp
-      # Not required because not all processors produce output files
-      # - output_file_grp
+      - input_file_grp_cardinality
+      - output_file_grp_cardinality
     properties:
       executable:
         description: The name of the CLI executable in $PATH
         type: string
       input_file_grp:
-        description: Input fileGrp@USE this tool expects by default
+        deprecated: true
+        description: (DEPRECATED) Input fileGrp@USE this tool expects by default
         type: array
         items:
           type: string
           # pattern: '^OCR-D-[A-Z0-9-]+$'
       output_file_grp:
-        description: Output fileGrp@USE this tool produces by default
+        deprecated: true
+        description: (DEPRECATED) Output fileGrp@USE this tool produces by default
         type: array
        items:
           type: string
           # pattern: '^OCR-D-[A-Z0-9-]+$'
+      input_file_grp_cardinality:
+        description: Number of (comma-separated) input fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited)
+        oneOf:
+          - type: number
+            multipleOf: 1
+          - type: array
+            items:
+              type: number
+              multipleOf: 1
+            minItems: 2
+            maxItems: 2
+        default: 1
+      output_file_grp_cardinality:
+        description: Number of (comma-separated) output fileGrp@USE this tool produces (either an exact value or a minimum,maximum list with -1 for unlimited)
+        oneOf:
+          - type: number
+            multipleOf: 1
+          - type: array
+            items:
+              type: number
+              multipleOf: 1
+            minItems: 2
+            maxItems: 2
+        default: 1
       parameters:
        description: Object describing the
parameters of a tool. Keys are parameter names, values sub-schemas. type: object + default: {} patternProperties: ".*": type: object @@ -82,6 +108,12 @@ properties: maximum: type: number description: Maximum value for number parameters, including the maximum + minProperties: + type: number + description: Minimum number of properties of an object + maxProperties: + type: number + description: Maximum number of properties of an object exclusiveMinimum: type: number description: Minimum value for number parameters, excluding the minimum @@ -95,8 +127,11 @@ properties: type: object description: Describe the properties of an object value additionalProperties: - type: boolean - description: Whether an object value may contain properties not explicitly defined + oneOf: + - type: boolean + description: Whether an object value may contain properties not explicitly defined + - type: object + description: Schema any additional properties need to adhere to required: type: boolean description: Whether this parameter is required @@ -121,9 +156,9 @@ properties: description: "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change." default: false description: - description: Concise description what the tool does + description: Concise description of what the tool does categories: - description: Tools belong to this categories, representing modules within the OCR-D project structure + description: Tools belong to these categories, representing modules within the OCR-D project structure type: array items: type: string @@ -198,7 +233,7 @@ properties: default: 'as-is' path_in_archive: type: string - description: if type is archive, the resource is at this location in the archive + description: If type is archive, the resource is at this location in the archive default: '.' version_range: type: string @@ -206,4 +241,4 @@ properties: default: '>= 0.0.1' size: type: number - description: Size of the resource in bytes + description: "Size of the resource in bytes to be retrieved (for archives: size of the archive)" diff --git a/src/ocrd_validators/ocrd_tool_validator.py b/src/ocrd_validators/ocrd_tool_validator.py index b408bd86e9..00a402c12d 100644 --- a/src/ocrd_validators/ocrd_tool_validator.py +++ b/src/ocrd_validators/ocrd_tool_validator.py @@ -4,7 +4,7 @@ See `specs `_. """ from .constants import OCRD_TOOL_SCHEMA -from .json_validator import JsonValidator +from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator # # ------------------------------------------------- @@ -20,4 +20,7 @@ def validate(obj, schema=OCRD_TOOL_SCHEMA): """ Validate against ``ocrd-tool.json`` schema. 
""" - return JsonValidator.validate(obj, schema) + return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access + + def __init__(self, schema): + super().__init__(schema, validator_class=DefaultValidatingDraft20199Validator) diff --git a/src/ocrd_validators/page_validator.py b/src/ocrd_validators/page_validator.py index 41ce0b9f94..0459f17811 100644 --- a/src/ocrd_validators/page_validator.py +++ b/src/ocrd_validators/page_validator.py @@ -6,7 +6,7 @@ from shapely.validation import explain_validity from ocrd_utils import getLogger, polygon_from_points, deprecated_alias -from ocrd_models.ocrd_page import parse +from ocrd_models.ocrd_page import OcrdPage, parse from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -34,50 +34,50 @@ _HIERARCHY = [ # page can contain different types of regions - (PageType, 'get_AdvertRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_ChartRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_ChemRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_CustomRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_GraphicRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_ImageRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_LineDrawingRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_MapRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_MathsRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_MusicRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_NoiseRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_SeparatorRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_TableRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_TextRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_UnknownRegion', None), # pylint: disable=bad-whitespace + (PageType, 'get_AdvertRegion', None), + (PageType, 'get_ChartRegion', None), + (PageType, 'get_ChemRegion', None), + (PageType, 'get_CustomRegion', None), + (PageType, 'get_GraphicRegion', None), + (PageType, 'get_ImageRegion', None), + (PageType, 'get_LineDrawingRegion', None), + (PageType, 'get_MapRegion', None), + (PageType, 'get_MathsRegion', None), + (PageType, 'get_MusicRegion', None), + (PageType, 'get_NoiseRegion', None), + (PageType, 'get_SeparatorRegion', None), + (PageType, 'get_TableRegion', None), + (PageType, 'get_TextRegion', None), + (PageType, 'get_UnknownRegion', None), # all regions can be recursive - (RegionType, 'get_AdvertRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_ChartRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_ChemRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_CustomRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_GraphicRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_ImageRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_LineDrawingRegion', None), # pylint: disable=bad-whitespace - #(RegionType, 'get_MapRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_MathsRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_MusicRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_NoiseRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_SeparatorRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_TableRegion', None), # pylint: disable=bad-whitespace - 
(RegionType, 'get_TextRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_UnknownRegion', None), # pylint: disable=bad-whitespace + (RegionType, 'get_AdvertRegion', None), + (RegionType, 'get_ChartRegion', None), + (RegionType, 'get_ChemRegion', None), + (RegionType, 'get_CustomRegion', None), + (RegionType, 'get_GraphicRegion', None), + (RegionType, 'get_ImageRegion', None), + (RegionType, 'get_LineDrawingRegion', None), + #(RegionType, 'get_MapRegion', None), + (RegionType, 'get_MathsRegion', None), + (RegionType, 'get_MusicRegion', None), + (RegionType, 'get_NoiseRegion', None), + (RegionType, 'get_SeparatorRegion', None), + (RegionType, 'get_TableRegion', None), + (RegionType, 'get_TextRegion', None), + (RegionType, 'get_UnknownRegion', None), # only TextRegion can contain TextLine - (TextRegionType, 'get_TextLine', '\n'), # pylint: disable=bad-whitespace - (TextLineType, 'get_Word', ' '), # pylint: disable=bad-whitespace - (WordType, 'get_Glyph', ''), # pylint: disable=bad-whitespace - (GlyphType, None, None), # pylint: disable=bad-whitespace + (TextRegionType, 'get_TextLine', '\n'), + (TextLineType, 'get_Word', ' '), + (WordType, 'get_Glyph', ''), + (GlyphType, None, None), ] _ORDER = [ (None, TextLineOrderSimpleType.BOTTOMTOTOP, ReadingDirectionSimpleType.RIGHTTOLEFT), - (PageType, 'get_textLineOrder', 'get_readingDirection'), # pylint: disable=bad-whitespace - (TextRegionType, 'get_textLineOrder', 'get_readingDirection'), # pylint: disable=bad-whitespace - (TextLineType, None, 'get_readingDirection'), # pylint: disable=bad-whitespace - (WordType, None, 'get_readingDirection'), # pylint: disable=bad-whitespace + (PageType, 'get_textLineOrder', 'get_readingDirection'), + (TextRegionType, 'get_textLineOrder', 'get_readingDirection'), + (TextLineType, None, 'get_readingDirection'), + (WordType, None, 'get_readingDirection'), ] # The following parameters control how tolerant we are with respect to @@ -115,9 +115,9 @@ def __init__(self, tag, ID, file_id, actual, expected): self.file_id = file_id self.actual = actual self.expected = expected - super(ConsistencyError, self).__init__( - "INCONSISTENCY in %s ID '%s' of file '%s': text results '%s' != concatenated '%s'" % ( - tag, ID, file_id, actual, expected)) + super().__init__( + f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': " + f"text results '{actual}' != concatenated '{expected}'") class CoordinateConsistencyError(Exception): """ @@ -141,9 +141,9 @@ def __init__(self, tag, ID, file_id, outer, inner): self.file_id = file_id self.outer = outer self.inner = inner - super(CoordinateConsistencyError, self).__init__( - "INCONSISTENCY in %s ID '%s' of '%s': coords '%s' not within parent coords '%s'" % ( - tag, ID, file_id, inner, outer)) + super().__init__( + f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': " + f"coords '{inner}' not within parent coords '{outer}'") class CoordinateValidityError(Exception): """ @@ -166,9 +166,8 @@ def __init__(self, tag, ID, file_id, points, reason='unknown'): self.ID = ID self.file_id = file_id self.points = points - super(CoordinateValidityError, self).__init__( - "INVALIDITY in %s ID '%s' of '%s': coords '%s' - %s" % ( - tag, ID, file_id, points, reason)) + super().__init__( + f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}") def compare_without_whitespace(a, b): """ @@ -177,13 +176,14 @@ def compare_without_whitespace(a, b): return re.sub('\\s+', '', a) == re.sub('\\s+', '', b) def page_get_reading_order(ro, rogroup): - """Add all elements from the 
given reading order group to the given dictionary. - + """ + Add all elements from the given reading order group to the given dictionary. + Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, and an object ``rogroup`` with additional ReadingOrder element objects, add all references to the dict, traversing the group recursively. """ - regionrefs = list() + regionrefs = [] if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): regionrefs = (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + @@ -236,17 +236,17 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate and whether the coordinates of an element are fully within its parent element coordinates. """ log = getLogger('ocrd.page_validator.validate_consistency') - if isinstance(node, PcGtsType): + if isinstance(node, (PcGtsType, OcrdPage)): # top-level (start recursion) node_id = node.get_pcGtsId() node = node.get_Page() # has no .id if not readingOrder: - readingOrder = dict() + readingOrder = {} ro = node.get_ReadingOrder() if ro: page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup()) if not joinRelations: - joinRelations = list() + joinRelations = [] relations = node.get_Relations() # get RelationsType if relations: relations = relations.get_Relation() # get list of RelationType @@ -358,7 +358,7 @@ def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None): if not nodes: return '' if not joins: - joins = list() + joins = [] result = get_text(nodes[0], page_textequiv_strategy) for node, next_node in zip(nodes, nodes[1:]): if (node.id, next_node.id) not in joins: @@ -470,11 +470,11 @@ def validate(filename=None, ocrd_page=None, ocrd_file=None, page = parse(filename, silence=True) file_id = filename else: - raise Exception("At least one of ocrd_page, ocrd_file or filename must be set") + raise ValueError("At least one of ocrd_page, ocrd_file or filename must be set") if page_textequiv_strategy not in ('first'): - raise Exception("page_textequiv_strategy %s not implemented" % page_textequiv_strategy) + raise ValueError("page_textequiv_strategy %s not implemented" % page_textequiv_strategy) if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'): - raise Exception("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency) + raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency) report = ValidationReport() log.info("Validating input file '%s'", file_id) validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords, report, file_id) diff --git a/src/ocrd_validators/parameter_validator.py b/src/ocrd_validators/parameter_validator.py index 20dd6ff2b7..ca2a7ed8ed 100644 --- a/src/ocrd_validators/parameter_validator.py +++ b/src/ocrd_validators/parameter_validator.py @@ -1,7 +1,7 @@ """ Validate parameters against ocrd-tool.json. 
""" -from .json_validator import JsonValidator, DefaultValidatingDraft6Validator +from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator # # ------------------------------------------------- @@ -20,7 +20,7 @@ def validate(self, *args, **kwargs): # pylint: disable=arguments-differ obj (dict): schema (dict): """ - return super(ParameterValidator, self)._validate(*args, **kwargs) + return super()._validate(*args, **kwargs) def __init__(self, ocrd_tool): """ @@ -40,9 +40,9 @@ def __init__(self, ocrd_tool): if p[n]['required']: required.append(n) del(p[n]['required']) - super(ParameterValidator, self).__init__({ + super().__init__({ "type": "object", "required": required, "additionalProperties": False, "properties": p - }, DefaultValidatingDraft6Validator) + }, DefaultValidatingDraft20199Validator) diff --git a/src/ocrd_validators/resource_list_validator.py b/src/ocrd_validators/resource_list_validator.py index 72a11c34de..47f3c81a96 100644 --- a/src/ocrd_validators/resource_list_validator.py +++ b/src/ocrd_validators/resource_list_validator.py @@ -4,7 +4,7 @@ See `specs `_. """ from .constants import RESOURCE_LIST_SCHEMA -from .json_validator import JsonValidator, DefaultValidatingDraft6Validator +from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator # # ------------------------------------------------- @@ -16,9 +16,10 @@ class OcrdResourceListValidator(JsonValidator): """ @staticmethod - def validate(obj, schema=RESOURCE_LIST_SCHEMA): + def validate(obj, schema=None): """ Validate against ``resource_list.schema.yml`` schema. """ - return JsonValidator(schema, validator_class=DefaultValidatingDraft6Validator)._validate(obj) - + if schema is None: + schema = RESOURCE_LIST_SCHEMA + return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) # pylint: disable=protected-access diff --git a/src/ocrd_validators/workspace_validator.py b/src/ocrd_validators/workspace_validator.py index d5be460997..28d45495ea 100644 --- a/src/ocrd_validators/workspace_validator.py +++ b/src/ocrd_validators/workspace_validator.py @@ -103,7 +103,7 @@ def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False, 'page_xsd'] if check not in self.skip] - self.find_kwargs = dict(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp) + self.find_kwargs = {"include_fileGrp": include_fileGrp, "exclude_fileGrp": exclude_fileGrp} self.src_dir = src_dir self.workspace = None self.mets = None @@ -139,7 +139,7 @@ def _validate(self): self._resolve_workspace() except Exception as e: # pylint: disable=broad-except self.log.warning("Failed to instantiate workspace: %s", e) - self.report.add_error("Failed to instantiate workspace: %s" % e) + self.report.add_error(f"Failed to instantiate workspace: {e}") return self.report with pushd_popd(self.workspace.directory): try: @@ -158,7 +158,7 @@ def _validate(self): if self.page_checks: self._validate_page() except Exception: # pylint: disable=broad-except - self.report.add_error("Validation aborted with exception: %s" % format_exc()) + self.report.add_error(f"Validation aborted with exception: {format_exc()}") return self.report def _resolve_workspace(self): @@ -193,9 +193,9 @@ def _validate_imagefilename(self): page = page_from_file(f).get_Page() imageFilename = page.imageFilename if not self.mets.find_files(url=imageFilename, **self.find_kwargs): - self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.local_filename, imageFilename)) + 
self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS") if is_local_filename(imageFilename) and not Path(imageFilename).exists(): - self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.local_filename, imageFilename)) + self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file") def _validate_dimension(self): """ @@ -210,9 +210,9 @@ def _validate_dimension(self): page = page_from_file(f).get_Page() _, _, exif = self.workspace.image_from_page(page, f.pageId) if page.imageHeight != exif.height: - self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (f.ID, page.imageHeight, exif.height)) + self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})") if page.imageWidth != exif.width: - self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (f.ID, page.imageWidth, exif.width)) + self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})") def _validate_multipage(self): """ @@ -229,9 +229,9 @@ def _validate_multipage(self): try: exif = self.workspace.resolve_image_exif(f.local_filename) if exif.n_frames > 1: - self.report.add_error("Image %s: More than 1 frame: %s" % (f.ID, exif.n_frames)) + self.report.add_error(f"Image '{f.ID}': More than 1 frame: {exif.n_frames}") except FileNotFoundError: - self.report.add_error("Image %s: Could not retrieve %s (local_filename=%s, url=%s)" % (f.ID, f.local_filename, f.url)) + self.report.add_error(f"Image '{f.ID}': Could not retrieve (local_filename='{f.local_filename}', url='{f.url}')") return def _validate_pixel_density(self): @@ -250,7 +250,7 @@ def _validate_pixel_density(self): for k in ['xResolution', 'yResolution']: v = exif.__dict__.get(k) if v is None or v <= 72: - self.report.add_notice("Image %s: %s (%s pixels per %s) is suspiciously low" % (f.ID, k, v, exif.resolutionUnit)) + self.report.add_notice(f"Image '{f.ID}': {k} ({v} pixels per {exif.resolutionUnit}) is suspiciously low") def _validate_mets_file_group_names(self): """ @@ -261,7 +261,7 @@ def _validate_mets_file_group_names(self): self.log.debug('_validate_mets_file_group_names') for fileGrp in self.mets.file_groups: if not fileGrp.startswith(FILE_GROUP_PREFIX): - self.report.add_notice("fileGrp USE does not begin with '%s': %s" % (FILE_GROUP_PREFIX, fileGrp)) + self.report.add_notice(f"fileGrp USE '{fileGrp}' does not begin with '{FILE_GROUP_PREFIX}'") else: # OCR-D-FOO-BAR -> ('FOO', 'BAR') # \____/\_/ \_/ @@ -273,9 +273,9 @@ def _validate_mets_file_group_names(self): if '-' in category: category, name = category.split('-', 1) if category not in FILE_GROUP_CATEGORIES: - self.report.add_notice("Unspecified USE category '%s' in fileGrp '%s'" % (category, fileGrp)) + self.report.add_notice(f"Unspecified USE category '{category}' in fileGrp '{fileGrp}'") if name is not None and not re.match(r'^[A-Z0-9-]{3,}$', name): - self.report.add_notice("Invalid USE name '%s' in fileGrp '%s'" % (name, fileGrp)) + self.report.add_notice(f"Invalid USE name '{name}' in fileGrp '{fileGrp}'") def _validate_mets_files(self): """ @@ -288,16 +288,16 @@ def _validate_mets_files(self): self.report.add_error("No files") for f in self.mets.find_files(**self.find_kwargs): if f._el.get('GROUPID'): # pylint: disable=protected-access - self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % 
f.ID) + self.report.add_notice(f"File '{f.ID}' has GROUPID attribute - document might need an update") if not (f.url or f.local_filename): - self.report.add_error("File '%s' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href" % f.ID) + self.report.add_error(f"File '{f.ID}' has neither mets:FLocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/@xlink:href") continue if f.url and 'url' not in self.skip: if re.match(r'^file:/[^/]', f.url): - self.report.add_error("File '%s' has an invalid (Java-specific) file URL '%s'" % (f.ID, f.url)) + self.report.add_error(f"File '{f.ID}' has an invalid (Java-specific) file URL '{f.url}'") scheme = f.url[0:f.url.index(':')] if scheme not in ('http', 'https', 'file'): - self.report.add_warning("File '%s' has non-HTTP, non-file URL '%s'" % (f.ID, f.url)) + self.report.add_warning(f"File '{f.ID}' has non-HTTP, non-file URL '{f.url}'") def _validate_page(self): """ @@ -323,15 +323,15 @@ def _validate_page(self): if 'dimension' in self.page_checks: _, _, exif = self.workspace.image_from_page(page, f.pageId) if page.imageHeight != exif.height: - self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (f.ID, page.imageHeight, exif.height)) + self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})") if page.imageWidth != exif.width: - self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (f.ID, page.imageWidth, exif.width)) + self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})") if 'imagefilename' in self.page_checks: imageFilename = page.imageFilename if not self.mets.find_files(url=imageFilename): - self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.url, imageFilename)) + self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS") if is_local_filename(imageFilename) and not Path(imageFilename).exists(): - self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.url, imageFilename)) + self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file") if 'mets_fileid_page_pcgtsid' in self.page_checks and pcgts.pcGtsId != f.ID: self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pcgts.pcGtsId or '', f.ID or '')) diff --git a/src/ocrd_validators/xsd_validator.py b/src/ocrd_validators/xsd_validator.py index 81b9457564..92e4502124 100644 --- a/src/ocrd_validators/xsd_validator.py +++ b/src/ocrd_validators/xsd_validator.py @@ -45,7 +45,7 @@ def __init__(self, schema_url): schema_url (str): URI of XML schema to validate against. 
""" if schema_url not in XSD_PATHS: - raise Exception('XML schema not bundled with OCR-D: %s' % schema_url) + raise ValueError('XML schema not bundled with OCR-D: %s' % schema_url) with open(XSD_PATHS[schema_url], 'r') as f: xmlschema_doc = ET.parse(f) self._xmlschema = ET.XMLSchema(xmlschema_doc) diff --git a/tests/base.py b/tests/base.py index 53f393e08d..9eb1f20db8 100644 --- a/tests/base.py +++ b/tests/base.py @@ -26,8 +26,6 @@ class TestCase(VanillaTestCase): def setUp(self): chdir(dirname(realpath(__file__)) + '/..') - disableLogging() - initLogging(builtin_only=True) class CapturingTestCase(TestCase): """ diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index ab52b6b1ba..ba7c283e40 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -1,4 +1,6 @@ from contextlib import contextmanager +import re +from typing import Tuple, Union from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory import os, sys @@ -20,6 +22,13 @@ from ocrd_utils import pushd_popd +def parse_version(v : str) -> Union[Tuple[int, int, int], Tuple[int, int, int, str]]: + tokens = re.split('((?:a|b|rc)[0-9]+)', v, 1) + version_wo_suffix = tokens[0] + prerelease_suffix = tokens[1] if len(tokens) > 1 else '' + (major, minor, patch) = map(int, version_wo_suffix.split('.')) + return (major, minor, patch, prerelease_suffix) + class TestBashlibCli(TestCase): def invoke_bash(self, script, *args, executable=None): @@ -50,7 +59,7 @@ def invoke_bash(self, script, *args, executable=None): return -1, "", str(e) finally: os.remove(scriptfile.name) - + def setUp(self): self.maxDiff = None super().setUp() @@ -89,7 +98,7 @@ def test_constants_fail(self): def test_input_files(self): with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: with pushd_popd(wsdir): - _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG']) + _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG', '-O', 'OUTPUT']) assert ("[url]='' [local_filename]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' " "[pageId]='PHYS_0017' [outputFileId]='OUTPUT_PHYS_0017'") in out @@ -101,15 +110,22 @@ def test_bashlib_defs(self): assert 'function' in out def test_bashlib_minversion(self): - exit_code, out, err = self.invoke_bash( - "source $(ocrd bashlib filename) && ocrd__minversion 2.29.0") + exit_code, out, err = self.invoke_bash("source $(ocrd bashlib filename) && ocrd__minversion 2.29.0") assert exit_code == 0 - (major, minor, patch) = map(int, str(VERSION).split('.')) + major, minor, patch, prerelease_suffix = parse_version(VERSION) + + # test normal version with impossible minimum minor version version = "%d.%d.%d" % (major, minor + 1, patch) - exit_code, out, err = self.invoke_bash( - "source $(ocrd bashlib filename) && ocrd__minversion " + version) + exit_code, out, err = self.invoke_bash("source $(ocrd bashlib filename) && ocrd__minversion " + version) + assert exit_code > 0 + assert f"ERROR: ocrd/core is too old ({VERSION} < {version})" in err + + # test non-matching prerelease (the 99th alpha pre-release here) + version = "%d.%d.%da99" % (major, minor, patch) + assert VERSION != version # assuming we will never have 99 alpha prereleases ^^ + exit_code, out, err = self.invoke_bash("source $(ocrd bashlib filename) && ocrd__minversion " + version) assert exit_code > 0 - assert "ERROR: ocrd/core is too old" in err + assert f"ERROR: ocrd/core is too old ({VERSION} < {version})" in err def 
test_bashlib_cp_processor(self): # script = (Path(__file__).parent.parent / 'data/bashlib_cp_processor.sh').read_text() diff --git a/tests/cli/test_log.py b/tests/cli/test_log.py index c63d78c318..3d81e8266b 100644 --- a/tests/cli/test_log.py +++ b/tests/cli/test_log.py @@ -6,8 +6,8 @@ from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory from ocrd.decorators import ocrd_loglevel -from ocrd_utils import setOverrideLogLevel, logging, disableLogging -import logging as python_logging +from ocrd_utils import disableLogging, initLogging +import logging @click.group() @ocrd_loglevel @@ -18,14 +18,19 @@ def mock_ocrd_cli(log_level): class TestLogCli(TestCase): def _get_log_output(self, *args): - disableLogging() code, out, err = self.invoke_cli(mock_ocrd_cli, args) print({'code': code, 'out': out, 'err': err}) return err + def setUp(self): + super().setUp() + initLogging() + def tearDown(self): if 'OCRD_TOOL_NAME' in ENV: del(ENV['OCRD_TOOL_NAME']) + super().tearDown() + disableLogging() def test_loglevel(self): assert 'DEBUG ocrd.log_cli - foo' not in self._get_log_output('log', 'debug', 'foo') diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index 36ee3e5995..12e87f4dc9 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -21,8 +21,8 @@ "ocrd-xyz": { "executable": "ocrd-xyz", "description": "bars all the foos", - "input_file_grp": ["OCR-D-FOO"], - "output_file_grp": ["OCR-D-BAR"], + "input_file_grp_cardinality": [1, 2], + "output_file_grp_cardinality": 1, "categories": ["Layout analysis"], "steps": ["layout/analysis"], "parameters": { @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, out + err) # relative path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, out + err) # default path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0) + code, out, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, out + err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0) + code, out, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, out + err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,19 +84,18 @@ def test_validate_page(self): def test_validate_tasks(self): # simple - code, _, _ = self.invoke_cli(validate_cli, ['tasks', + code, out, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0) + self.assertEqual(code, 0, out + err) # with workspace code, out, err = self.invoke_cli(validate_cli, ['tasks', 
'--workspace', assets.path_to('kant_aufklaerung_1784/data'), "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) - print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0) + self.assertEqual(code, 0, out + err) if __name__ == '__main__': diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 93a2ea49a9..56779a6119 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,12 +1,19 @@ +from functools import cached_property import json import os -from ocrd import Processor -from ocrd_utils import make_file_id +from time import sleep +from pytest import warns +from ocrd import Processor, OcrdPageResult +from ocrd_utils import make_file_id, config DUMMY_TOOL = { 'executable': 'ocrd-test', 'description': 'dolor sit', 'steps': ['recognition/post-correction'], + # as we bypass Processor.metadata with OcrdToolValidator + # we get no default expansion, so add default cardinalities here + 'input_file_grp_cardinality': 1, + 'output_file_grp_cardinality': 1, 'parameters': { 'baz': { 'type': 'string', @@ -17,34 +24,148 @@ } class DummyProcessor(Processor): + @property + def ocrd_tool(self): + return DUMMY_TOOL + + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = DUMMY_TOOL - kwargs['version'] = '0.0.1' - super(DummyProcessor, self).__init__(*args, **kwargs) + kwargs['download_files'] = False + super().__init__(*args, **kwargs) def process(self): print(json.dumps(self.parameter)) + # override to prevent iterating over empty files + def process_workspace(self, workspace): + with warns(DeprecationWarning, match='should be replaced with process_page'): + self.process() + class DummyProcessorWithRequiredParameters(Processor): - def process(self): pass - def __init__(self, *args, **kwargs): - kwargs['version'] = '0.0.1' - kwargs['ocrd_tool'] = { + @property + def ocrd_tool(self): + return { 'executable': 'ocrd-test', 'steps': ['recognition/post-correction'], 'parameters': { 'i-am-required': {'required': True} } } - super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs) + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + super().__init__(*args, **kwargs) + + def process(self): pass class DummyProcessorWithOutput(Processor): + @cached_property + def ocrd_tool(self): + return DUMMY_TOOL + + @cached_property + def version(self): + return '0.0.1' + + @cached_property + def executable(self): + return 'ocrd-test' def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + super().__init__(*args, **kwargs) + + def process(self): + # print([str(x) for x in self.input_files] + for input_file in self.input_files: + file_id = make_file_id(input_file, self.output_file_grp) + # print(input_file.ID, file_id) + self.workspace.add_file( + file_id=file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + mimetype=input_file.mimetype, + local_filename=os.path.join(self.output_file_grp, file_id), + content='CONTENT', + ) + +class DummyProcessorWithOutputSleep(Processor): + @property + def ocrd_tool(self): + # make deep copy + dummy_tool = json.loads(json.dumps(DUMMY_TOOL)) + dummy_tool['parameters']['sleep'] = {'type': 'number'} + return dummy_tool 
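
The `json.loads(json.dumps(...))` round-trip above deep-copies the shared DUMMY_TOOL dict, so the extra `sleep` parameter does not leak into the other dummy processors that reuse it. For a JSON-serializable dict like this, `copy.deepcopy` would be an equivalent alternative (a minimal sketch, not part of the patch):

    import copy

    dummy_tool = copy.deepcopy(DUMMY_TOOL)  # independent copy; DUMMY_TOOL stays untouched
    dummy_tool['parameters']['sleep'] = {'type': 'number'}
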
+ + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + super().__init__(*args, **kwargs) + + def process_page_pcgts(self, pcgts, page_id=None): + sleep(self.parameter['sleep']) + return OcrdPageResult(pcgts) + +class DummyProcessorWithOutputFailures(Processor): + @cached_property + def ocrd_tool(self): + return DUMMY_TOOL + + @cached_property + def version(self): + return '0.0.1' + + @cached_property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + super().__init__(*args, **kwargs) + + # no error handling with old process(), so override new API + def process_page_file(self, input_file): + n = self.workspace.mets.physical_pages.index(input_file.pageId) + 1 + if n % 2: + raise Exception(f"intermittent failure on page {input_file.pageId}") + output_file_id = make_file_id(input_file, self.output_file_grp) + self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id), + mimetype=input_file.mimetype, + content='CONTENT', + ) + +class DummyProcessorWithOutputLegacy(Processor): + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False kwargs['ocrd_tool'] = DUMMY_TOOL kwargs['version'] = '0.0.1' super().__init__(*args, **kwargs) + if hasattr(self, 'output_file_grp'): + self.setup() def process(self): # print([str(x) for x in self.input_files] @@ -52,14 +173,21 @@ def process(self): file_id = make_file_id(input_file, self.output_file_grp) # print(input_file.ID, file_id) self.workspace.add_file( - ID=file_id, + file_id=file_id, file_grp=self.output_file_grp, - pageId=input_file.pageId, + page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), - content='CONTENT') + content='CONTENT', + ) class IncompleteProcessor(Processor): - pass + @property + def executable(self): + return 'ocrd-foo' + + @property + def metadata_rawdict(self): + return {'tools': {self.executable: {}}} diff --git a/tests/data/ocrd-cp.ocrd-tool.json b/tests/data/ocrd-cp.ocrd-tool.json index 728c144c50..948695c06d 100755 --- a/tests/data/ocrd-cp.ocrd-tool.json +++ b/tests/data/ocrd-cp.ocrd-tool.json @@ -1,15 +1,18 @@ { - "version": "1.0", + "version": "1.0.0", "tools": { "ocrd-cp": { "executable": "ocrd-cp", "description": "dummy processor copying", "steps": ["preprocessing/optimization"], "categories": ["Image preprocessing"], + # we allow 1 or 2 input file grps + # the output cardinality gets expanded from default + "input_file_grp_cardinality": [1,2], "parameters": { "message": { "type": "string", - "default": "", + "default": "hello by default", "description": "message to print on stdout" } } diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 739db7625a..89742a507e 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -248,7 +248,7 @@ def test_file_pageid(sbb_sample_01): def test_agent(sbb_sample_01): beforelen = len(sbb_sample_01.agents) - sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL') + sbb_sample_01.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='YETOTHERSTILL') assert len(sbb_sample_01.agents) == beforelen + 1 def test_metshdr(): diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 
7dc130809f..97335775d6 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -460,7 +460,7 @@ def test_id(): # TODO: is this *really* desired? # I would expect for a single Page-Element the ID is like from the top-level-Pgts-Container, not like a fileName - assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif' + assert pcgts.get_Page().id == 'OCR-D-IMG_INPUT_0017.tif' if __name__ == '__main__': diff --git a/tests/network/test_integration_4_processing_worker.py b/tests/network/test_integration_4_processing_worker.py index e211bd2381..ae322b0978 100644 --- a/tests/network/test_integration_4_processing_worker.py +++ b/tests/network/test_integration_4_processing_worker.py @@ -1,6 +1,6 @@ from pathlib import Path from pika import BasicProperties -from src.ocrd.processor.builtin.dummy_processor import DummyProcessor, OCRD_TOOL +from src.ocrd.processor.builtin.dummy_processor import DummyProcessor from src.ocrd_network.constants import JobState from src.ocrd_network.database import sync_db_create_workspace, sync_db_create_processing_job from src.ocrd_network.logging_utils import get_processing_job_logging_file_path @@ -25,12 +25,13 @@ def test_processing_worker_process_message(): # wrong reads from the deployed dummy worker (part of the processing server integration test) processor_name = "ocrd-dummy-test" result_queue_name = f"{processor_name}-result" + ocrd_tool = DummyProcessor(None).metadata processing_worker = ProcessingWorker( rabbitmq_addr=test_config.RABBITMQ_URL, mongodb_addr=test_config.DB_URL, processor_name=processor_name, - ocrd_tool=OCRD_TOOL, + ocrd_tool=ocrd_tool, processor_class=DummyProcessor ) processing_worker.connect_publisher(enable_acks=True) diff --git a/tests/processor/test_ocrd_dummy.py b/tests/processor/test_ocrd_dummy.py index 41b585c6b9..b85379e47d 100644 --- a/tests/processor/test_ocrd_dummy.py +++ b/tests/processor/test_ocrd_dummy.py @@ -33,7 +33,7 @@ def test_copies_ok(self): output_files = workspace.mets.find_all_files(fileGrp='OUTPUT') output_files.sort(key=lambda x: x.url) assert output_files[0].local_filename == 'OUTPUT/OUTPUT_PHYS_0001.tif' - assert output_files[1].local_filename == 'OUTPUT/OUTPUT_PHYS_0001.xml' + assert output_files[1].local_filename == 'OUTPUT/OUTPUT_PHYS_0001_PAGE.xml' self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID) assert page_from_file(output_files[1]).get_Page().imageFilename == str(output_files[0].local_filename) self.assertEqual(len(output_files), 6) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 784f68fc3d..06c129c3ca 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -1,3 +1,4 @@ +from functools import cached_property import json from contextlib import ExitStack @@ -5,33 +6,50 @@ from pathlib import Path from os import environ from tests.base import CapturingTestCase as TestCase, assets, main, copy_of_directory # pylint: disable=import-error, no-name-in-module -from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor +from tests.data import ( + DummyProcessor, + DummyProcessorWithRequiredParameters, + DummyProcessorWithOutput, + DummyProcessorWithOutputLegacy, + DummyProcessorWithOutputSleep, + DummyProcessorWithOutputFailures, + IncompleteProcessor +) +from tests.test_mets_server import fixture_start_mets_server -from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging +from ocrd_utils import MIMETYPE_PAGE, 
pushd_popd, initLogging, disableLogging, config from ocrd.resolver import Resolver -from ocrd.processor.base import Processor, run_processor, run_cli +from ocrd.processor import Processor, run_processor, run_cli, NonUniqueInputFile +from ocrd.processor.helpers import get_processor from unittest import mock import pytest class TestProcessor(TestCase): + def run(self, result=None): + with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as workdir: + with pushd_popd(workdir): + self.resolver = Resolver() + self.workspace = self.resolver.workspace_from_url('mets.xml') + super().run(result=result) + def setUp(self): super().setUp() - # make sure we get an isolated temporary copy of the testdata each time - # as long as we are not using pytest but unittest, we need to manage contexts - # (enterContext is only supported starting with py311) - with ExitStack() as stack: - self.resolver = Resolver() - self.workdir = stack.enter_context(copy_of_directory(assets.path_to('SBB0000F29300010000/data'))) - stack.enter_context(pushd_popd(self.workdir)) - self.workspace = self.resolver.workspace_from_url('mets.xml') - self.addCleanup(stack.pop_all().close) + initLogging() + + def tearDown(self): + super().tearDown() + config.reset_defaults() + disableLogging() def test_incomplete_processor(self): proc = IncompleteProcessor(None) + proc.input_file_grp = 'OCR-D-IMG' + proc.output_file_grp = 'DUMMY' + proc.page_id = None with self.assertRaises(NotImplementedError): - proc.process() + proc.process_workspace(self.workspace) def test_no_resolver(self): with self.assertRaisesRegex(Exception, 'pass a resolver to create a workspace'): @@ -54,34 +72,101 @@ def test_with_mets_url_input_files(self): input_file_grp='OCR-D-SEG-PAGE', resolver=self.resolver, workspace=self.workspace) + processor.workspace = self.workspace assert len(processor.input_files) == 2 assert [f.mimetype for f in processor.input_files] == [MIMETYPE_PAGE, MIMETYPE_PAGE] def test_parameter(self): with TemporaryDirectory(): - jsonpath = Path('params.json').name + jsonpath = 'params.json' with open(jsonpath, 'w') as f: f.write('{"baz": "quux"}') with open(jsonpath, 'r') as f: + parameter = json.load(f) processor = run_processor( DummyProcessor, - parameter=json.load(f), + parameter=parameter, input_file_grp="OCR-D-IMG", resolver=self.resolver, workspace=self.workspace ) - self.assertEqual(len(processor.input_files), 3) + self.assertEqual(processor.parameter['baz'], 'quux') + processor = get_processor( + DummyProcessor, + parameter=parameter) + with self.assertRaises(TypeError): + processor.parameter['baz'] = 'xuuq' + processor.parameter = { **parameter, 'baz': 'xuuq' } + self.assertEqual(processor.parameter['baz'], 'xuuq') + + def test_instance_caching(self): + class DyingDummyProcessor(DummyProcessor): + max_instances = 10 + def shutdown(self): + # fixme: will only print _after_ pytest exits, so too late for assertions + #print(self.parameter['baz']) + pass + self.capture_out_err() + # customize (as processor implementors would) + firstp = None + for i in range(DyingDummyProcessor.max_instances + 2): + p = get_processor( + DyingDummyProcessor, + parameter={'baz': str(i)}, + instance_caching=True + ) + if i == 0: + firstp = p + lastp = p + p = get_processor(DyingDummyProcessor, + parameter={'baz': '0'}, + instance_caching=True + ) + # should not be cached anymore + self.assertNotEqual(firstp, p) + p = get_processor(DyingDummyProcessor, + parameter={'baz': str(i)}, + instance_caching=True + ) + # should still be cached + 
self.assertEqual(lastp, p) + from ocrd.processor.helpers import get_cached_processor + get_cached_processor.__wrapped__.cache_clear() + p = get_processor(DyingDummyProcessor, + parameter={'baz': str(i)}, + instance_caching=True + ) + # should not be cached anymore + self.assertNotEqual(lastp, p) + # fixme: will only print _after_ pytest exits, so too late for assertions + #out, err = self.capture_out_err() + #assert '0' in out.split('\n') def test_verify(self): - proc = DummyProcessor(self.workspace) + proc = DummyProcessor(None) + with self.assertRaises(AttributeError): + proc.verify() + proc.workspace = self.workspace + proc.input_file_grp = "OCR-D-IMG" + proc.output_file_grp = "DUMMY" self.assertEqual(proc.verify(), True) def test_json(self): - DummyProcessor(self.workspace, dump_json=True) + DummyProcessor(None).dump_json() def test_params_missing_required(self): - with self.assertRaisesRegex(Exception, 'is a required property'): - DummyProcessorWithRequiredParameters(workspace=self.workspace) + proc = DummyProcessorWithRequiredParameters(None) + assert proc.parameter is None + with self.assertRaisesRegex(ValueError, 'is a required property'): + proc.parameter = {} + with self.assertRaisesRegex(ValueError, 'is a required property'): + get_processor(DummyProcessorWithRequiredParameters) + with self.assertRaisesRegex(ValueError, 'is a required property'): + get_processor(DummyProcessorWithRequiredParameters, parameter={}) + with self.assertRaisesRegex(ValueError, 'is a required property'): + run_processor(DummyProcessorWithRequiredParameters, + workspace=self.workspace, input_file_grp="OCR-D-IMG") + proc.parameter = {'i-am-required': 'foo'} def test_params_preset_resolve(self): with pushd_popd(tempdir=True) as tempdir: @@ -107,12 +192,19 @@ def test_params_preset_resolve(self): overwrite=True) def test_params(self): - proc = Processor(workspace=self.workspace) + class ParamTestProcessor(Processor): + @cached_property + def ocrd_tool(self): + return {} + proc = ParamTestProcessor(None) + self.assertEqual(proc.parameter, None) + # get_processor will set to non-none and validate + proc = get_processor(ParamTestProcessor) self.assertEqual(proc.parameter, {}) def test_run_agent(self): no_agents_before = len(self.workspace.mets.agents) - run_processor(DummyProcessor, workspace=self.workspace) + run_processor(DummyProcessor, workspace=self.workspace, input_file_grp="OCR-D-IMG") self.assertEqual(len(self.workspace.mets.agents), no_agents_before + 1, 'one more agent') # print(self.workspace.mets.agents[no_agents_before]) @@ -125,27 +217,86 @@ def test_run_input(self): def test_run_output0(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 2 + def test_run_output_legacy(self): + ws = self.workspace + run_processor(DummyProcessorWithOutputLegacy, + workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + + def 
test_run_output_missing(self): + ws = self.workspace + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 + config.OCRD_MISSING_OUTPUT = 'SKIP' + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + # only half succeed + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) // 2 + config.OCRD_MISSING_OUTPUT = 'ABORT' + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert "intermittent" in str(exc.value) + config.OCRD_MISSING_OUTPUT = 'COPY' + config.OCRD_EXISTING_OUTPUT = 'SKIP' + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + # do raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = 0.4 + config.OCRD_MISSING_OUTPUT = 'SKIP' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert "too many failures" in str(exc.value) + + def test_run_output_timeout(self): + ws = self.workspace + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 + config.OCRD_MISSING_OUTPUT = 'ABORT' + config.OCRD_PROCESSING_PAGE_TIMEOUT = 3 + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 1}) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + config.OCRD_PROCESSING_PAGE_TIMEOUT = 1 + with pytest.raises(TimeoutError) as exc: + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 3}) + def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') - ws.overwrite_mode = True - ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, ID='OCR-D-OUT_phys_0001', pageId='phys_0001') - ws.overwrite_mode = False + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') + config.OCRD_EXISTING_OUTPUT = 'ABORT' with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") - assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists" - ws.overwrite_mode = True + assert "already exists" in str(exc.value) + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") @@ -153,7 +304,9 @@ def test_run_output_overwrite(self): def test_run_cli(self): with TemporaryDirectory() as tempdir: - run_processor(DummyProcessor, workspace=self.workspace) + 
run_processor(DummyProcessor, workspace=self.workspace, + input_file_grp='OCR-D-IMG', + output_file_grp='OUTPUT') run_cli( 'echo', mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'), @@ -173,7 +326,10 @@ def test_run_cli(self): ) def test_zip_input_files(self): - class ZipTestProcessor(Processor): pass + class ZipTestProcessor(Processor): + @property + def ocrd_tool(self): + return {} with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') @@ -182,7 +338,10 @@ class ZipTestProcessor(Processor): pass ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar4', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] assert ('foobar1', 'foobar2') in tuples assert ('foobar3', 'foobar4') in tuples @@ -193,7 +352,10 @@ class ZipTestProcessor(Processor): pass assert ('foobar3', 'foobar4') in tuples def test_zip_input_files_multi_mixed(self): - class ZipTestProcessor(Processor): pass + class ZipTestProcessor(Processor): + @property + def ocrd_tool(self): + return {} with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') @@ -204,7 +366,10 @@ class ZipTestProcessor(Processor): pass ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id print("unfiltered") tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] assert ('foobar1', 'foobar2') in tuples @@ -215,23 +380,32 @@ class ZipTestProcessor(Processor): pass ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4dup', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')] assert ('foobar1', 'foobar2') in tuples assert ('foobar3', 'foobar4') in tuples tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] assert ('foobar3', None) in tuples - with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): + with self.assertRaisesRegex(NonUniqueInputFile, "Could not determine unique input file"): tuples = proc.zip_input_files(on_error='abort') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) - with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + 
proc.page_id = page_id + with self.assertRaisesRegex(NonUniqueInputFile, "Could not determine unique input file"): tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): - class ZipTestProcessor(Processor): pass + class ZipTestProcessor(Processor): + @property + def ocrd_tool(self): + return {} self.capture_out_err() with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) @@ -239,10 +413,67 @@ class ZipTestProcessor(Processor): pass ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001') for page_id in [None, 'phys_0001']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no file for page phys_0001 in file group GRP1' in r.err + +def test_run_output_metsserver(start_mets_server): + mets_server_url, ws = start_mets_server + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0 + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 0}, + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 0}, + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.OCRD_EXISTING_OUTPUT = 'ABORT' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 0}, + mets_server_url=mets_server_url) + assert "already exists" in str(exc.value) + config.reset_defaults() + +# 2s (+ 2s tolerance) instead of 3*3s (+ 2s tolerance) +# fixme: pytest-timeout does not shut down / finalize the fixture properly +# (regardless of method or func_only), so the next test in the suite +# does not execute ("previous item was not torn down properly") +# so we must instead wait for completion and assert on the time spent... 
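
As the fixme above explains, pytest-timeout cannot interrupt these fixture-based tests cleanly, so the test below runs to completion and asserts on elapsed wall-clock time instead. The generic shape of that pattern (a sketch; `work` is a hypothetical callable, not part of the patch):

    import time

    def assert_completes_within(seconds, work):
        # run to completion, then check elapsed wall-clock time;
        # no signal- or thread-based interruption, so fixtures tear down normally
        start = time.time()
        work()
        elapsed = time.time() - start
        assert elapsed < seconds, f"took {elapsed:.1f}s, expected < {seconds}s"
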
+#@pytest.mark.timeout(timeout=4, func_only=True, method="signal") +def test_run_output_parallel(start_mets_server): + import time + mets_server_url, ws = start_mets_server + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0 + # do not raise for single-page timeout + config.OCRD_PROCESSING_PAGE_TIMEOUT = -1 + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 + config.OCRD_MAX_PARALLEL_PAGES = 3 + start_time = time.time() + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 2}, + mets_server_url=mets_server_url) + run_time = time.time() - start_time + assert run_time < 3, f"run_processor took {run_time}s" + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.reset_defaults() if __name__ == "__main__": main(__file__) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 5ab2880053..561fdc762d 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -15,7 +15,7 @@ ocrd_loglevel, ocrd_cli_wrap_processor, ) # pylint: disable=protected-access -from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files +from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files, config @click.command() @ocrd_cli_options @@ -41,18 +41,20 @@ def cli_dummy_processor(*args, **kwargs): class TestDecorators(TestCase): - def setUp(self): - super().setUp() + def tearDown(self): + super().tearDown() + config.reset_defaults() disableLogging() def test_minimal(self): - exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) - print(out, err) - assert not exit_code + initLogging() + code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) + assert not code, (out, err) def test_loglevel_invalid(self): - code, _, err = self.invoke_cli(cli_with_ocrd_loglevel, ['--log-level', 'foo']) - assert code + initLogging() + code, out, err = self.invoke_cli(cli_with_ocrd_loglevel, ['--log-level', 'foo']) + assert code, (out, err) import click if int(click.__version__[0]) < 8: assert 'invalid choice: foo' in err @@ -63,7 +65,7 @@ def test_loglevel_override(self): if get_logging_config_files(): pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test") import logging - disableLogging() + assert logging.getLogger('').getEffectiveLevel() == logging.WARNING assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING initLogging() assert logging.getLogger('ocrd').getEffectiveLevel() == logging.INFO diff --git a/tests/test_logging.py b/tests/test_logging.py index 2e4e0861b5..091fc25bee 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -26,16 +26,22 @@ class TestLogging(TestCase): def setUp(self): pass # do not chdir + def tearDown(self): + super().tearDown() + disableLogging() + def test_loglevel_inheritance(self): initLogging(builtin_only=True) ocrd_logger = logging.getLogger('ocrd') assert ocrd_logger.getEffectiveLevel() == logging.INFO some_logger = getLogger('ocrd.foo') + assert some_logger.level == logging.NOTSET assert some_logger.getEffectiveLevel() == logging.INFO setOverrideLogLevel('ERROR') assert ocrd_logger.getEffectiveLevel() == logging.ERROR assert some_logger.getEffectiveLevel() == logging.ERROR another_logger = getLogger('ocrd.bar') + assert another_logger.level == logging.NOTSET assert 
another_logger.getEffectiveLevel() == logging.ERROR def test_getLevelName(self): @@ -139,7 +145,7 @@ def testProcessorProfiling(self): getLogger('ocrd.process.profile').setLevel('DEBUG') getLogger('ocrd.process.profile').addHandler(ch) - run_processor(DummyProcessor, resolver=Resolver(), mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) + run_processor(DummyProcessor, input_file_grp='OCR-D-IMG', resolver=Resolver(), mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) log_contents = log_capture_string.getvalue() log_capture_string.close() diff --git a/tests/test_logging_conf.py b/tests/test_logging_conf.py index f8e0e9e894..0717674103 100644 --- a/tests/test_logging_conf.py +++ b/tests/test_logging_conf.py @@ -21,74 +21,67 @@ # sys.path.append(os.path.dirname(os.path.realpath(__file__)) + '/../ocrd') TEST_ROOT = pathlib.Path(os.path.dirname(os.path.abspath(__file__))).parent -def resetLogging(): - disableLogging() - initLogging() - - @pytest.fixture(name="logging_conf") -def _fixture_logging_conf(tmpdir): +def _fixture_logging_conf(tmpdir, capfd): path_logging_conf_orig = os.path.join( str(TEST_ROOT), 'src', 'ocrd_utils', 'ocrd_logging.conf') path_logging_conf_dest = os.path.join(str(tmpdir), 'ocrd_logging.conf') shutil.copy(path_logging_conf_orig, path_logging_conf_dest) - return str(tmpdir) + with pushd_popd(tmpdir): + with capfd.disabled(): + initLogging() + yield str(tmpdir) + disableLogging() -def test_configured_dateformat(logging_conf, capsys): +def test_configured_dateformat(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and produces desired record format""" # arrange - with pushd_popd(logging_conf): - resetLogging() - test_logger = getLogger('') + test_logger = getLogger('ocrd') - # act - test_logger.info("test logger initialized") + # act + test_logger.info("test logger initialized") - log_info_output = capsys.readouterr().err - must_not_match = r"^\d{4}-\d{2}-\d{2}.*" - assert not re.match(must_not_match, log_info_output) - match_pattern = r"^\d{2}:\d{2}:\d{2}.*" - assert re.match(match_pattern, log_info_output) + log_info_output = capfd.readouterr().err + must_not_match = r"^\d{4}-\d{2}-\d{2}.*" + assert not re.match(must_not_match, log_info_output) + match_pattern = r"^\d{2}:\d{2}:\d{2}.*" + assert re.match(match_pattern, log_info_output), log_info_output -def test_configured_tensorflow_logger_present(logging_conf, capsys): +def test_configured_tensorflow_logger_present(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and contains logger tensorflow""" # arrange - os.chdir(logging_conf) - resetLogging() logger_under_test = getLogger('tensorflow') # act info logger_under_test.info("tensorflow logger initialized") - log_info_output = capsys.readouterr().err + log_info_output = capfd.readouterr().err assert not log_info_output # act error logger_under_test.error("tensorflow has error") - log_error_output = capsys.readouterr().err + log_error_output = capfd.readouterr().err assert log_error_output -def test_configured_shapely_logger_present(logging_conf, capsys): +def test_configured_shapely_logger_present(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and contains logger shapely.geos""" # arrange - os.chdir(logging_conf) - resetLogging() logger_under_test = getLogger('shapely.geos') # act info logger_under_test.info("shapely.geos logger initialized") - log_info_output = capsys.readouterr().err + log_info_output = capfd.readouterr().err assert not log_info_output # act error 
logger_under_test.error("shapely alert") - log_error_output = capsys.readouterr().err + log_error_output = capfd.readouterr().err assert log_error_output if __name__ == '__main__': diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 58ff6e2a9b..3bb96535c0 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -22,50 +22,65 @@ from requests.exceptions import ConnectionError from ocrd import Resolver, OcrdMetsServer, Workspace -from ocrd_utils import pushd_popd, MIMETYPE_PAGE +from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel, disableLogging, getLogger -WORKSPACE_DIR = '/tmp/ocrd-mets-server' TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] @fixture(scope='function', name='start_mets_server', params=TRANSPORTS) -def fixture_start_mets_server(request) -> Iterable[Tuple[str, Workspace]]: - def _start_mets_server(*args, **kwargs): - mets_server = OcrdMetsServer(*args, **kwargs) - mets_server.startup() +def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]: + initLogging() + #setOverrideLogLevel(10) + logger = getLogger('ocrd') + tmpdir = str(tmpdir) mets_server_url = request.param if mets_server_url == TRANSPORTS[0]: if exists(mets_server_url): remove(mets_server_url) - if exists(WORKSPACE_DIR): - rmtree(WORKSPACE_DIR, ignore_errors=True) - - copytree(assets.path_to('SBB0000F29300010000/data'), WORKSPACE_DIR) - workspace = Workspace(Resolver(), WORKSPACE_DIR) - p = Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param}) + if exists(tmpdir): + rmtree(tmpdir, ignore_errors=True) + + copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir) + workspace = Workspace(Resolver(), tmpdir) + class MetsServerProcess(Process): + def __init__(self, *args, **kwargs): + self.server = OcrdMetsServer(*args, **kwargs) + super().__init__() + def run(self): + self.server.startup() + def terminate(self): + self.server.workspace.save_mets() + super().terminate() + p = MetsServerProcess(workspace=workspace, url=request.param) p.start() + logger.info("started METS Server") sleep(1) # sleep to start up server - yield mets_server_url, Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url) + yield mets_server_url, workspace_server p.terminate() - rmtree(WORKSPACE_DIR, ignore_errors=True) - -def add_file_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + p.join() + logger.info("terminated METS Server") + rmtree(tmpdir, ignore_errors=True) + disableLogging() + +def add_file_server(x, force=False): + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.add_file( + 'FOO', local_filename=f'local_filename{i}', mimetype=MIMETYPE_PAGE, page_id=f'page{i}', - file_grp='FOO', file_id=f'FOO_page{i}_foo{i}', # url=f'url{i}' + force=force ) def add_agent_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.mets.add_agent( name=f'proc{i}', _type='baz', @@ -82,7 +97,10 @@ def test_mets_server_add_file(start_mets_server): # add NO_FILES files in parallel 
with Pool() as pool: - pool.map(add_file_server, zip(repeat(mets_server_url), range(NO_FILES))) + pool.map(add_file_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + range(NO_FILES))) assert set(workspace_server.mets.file_groups) == set( [ 'OCR-D-IMG', @@ -107,7 +125,7 @@ def test_mets_server_add_file(start_mets_server): assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == NO_FILES # not yet synced - workspace_file = Workspace(Resolver(), WORKSPACE_DIR) + workspace_file = Workspace(Resolver(), workspace_server.directory) assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == 0 # sync @@ -116,6 +134,19 @@ def test_mets_server_add_file(start_mets_server): assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == NO_FILES +def test_mets_server_add_file_overwrite(start_mets_server): + mets_server_url, workspace_server = start_mets_server + + add_file_server((mets_server_url, workspace_server.directory, 5)) + + assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1 + + with raises(RuntimeError, match="already exists"): + add_file_server((mets_server_url, workspace_server.directory, 5)) + + add_file_server((mets_server_url, workspace_server.directory, 5), force=True) + assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1 + def test_mets_server_add_agents(start_mets_server): NO_AGENTS = 30 @@ -125,13 +156,16 @@ def test_mets_server_add_agents(start_mets_server): # add NO_AGENTS agents in parallel with Pool() as pool: - pool.map(add_agent_server, zip(repeat(mets_server_url), list(range(NO_AGENTS)))) + pool.map(add_agent_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + list(range(NO_AGENTS)))) assert len(workspace_server.mets.agents) == NO_AGENTS + no_agents_before # XXX not a tuple assert workspace_server.mets.agents[-1].notes[0][0] == {'{https://ocr-d.de}foo': 'bar'} - workspace_file = Workspace(Resolver(), WORKSPACE_DIR) + workspace_file = Workspace(Resolver(), workspace_server.directory) assert len(workspace_file.mets.agents) == no_agents_before # sync @@ -142,7 +176,7 @@ def test_mets_server_add_agents(start_mets_server): def test_mets_server_str(start_mets_server): mets_server_url, workspace_server = start_mets_server - workspace_server = Workspace(Resolver(), WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), workspace_server.directory, mets_server_url=mets_server_url) f = next(workspace_server.find_files()) assert str(f) == '' a = workspace_server.mets.agents[0] @@ -182,7 +216,7 @@ def test_mets_server_socket_stop(start_mets_server): assert True, 'No stop conditions to test for TCP server' else: assert Path(mets_server_url).exists() - assert workspace_server.mets.workspace_path == WORKSPACE_DIR + assert workspace_server.mets.workspace_path == workspace_server.directory workspace_server.mets.stop() with raises(ConnectionError): workspace_server.mets.file_groups @@ -236,7 +270,7 @@ def test_reload(start_mets_server : Tuple[str, Workspace]): assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert len(workspace_server_copy.mets.find_all_files()) == 35, '35 files total' - workspace_server_copy.add_file('FOO', ID='foo', mimetype='foo/bar', local_filename='mets.xml', pageId='foo') + workspace_server_copy.add_file('FOO', file_id='foo', mimetype='foo/bar', local_filename='mets.xml', page_id='foo') assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert 
len(workspace_server_copy.mets.find_all_files()) == 36, '36 files total' diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 16dfd03d56..c2575b6086 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -292,20 +292,21 @@ def test_resolve_mets_arguments(): https://github.com/OCR-D/core/issues/517 """ resolver = Resolver() - assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) - assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) - assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None) - with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): - resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) - with pytest.raises(ValueError, match="inconsistent with --directory"): - resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) - with pytest.warns(DeprecationWarning): - resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) - with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"): - resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning, match='--mets-basename'): + assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) + assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) + assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None) + with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): + resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) + with pytest.raises(ValueError, match="inconsistent with --directory"): + resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning): + resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) + with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"): + resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) if __name__ == '__main__': main(__file__) diff --git a/tests/test_utils.py b/tests/test_utils.py index 89ff6d90f3..dea7ad7942 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -242,12 +242,16 @@ def test_set_json_key_value_overrides(): def test_assert_file_grp_cardinality(): with raises(AssertionError, match="Expected exactly 5 output file groups, but '.'FOO', 'BAR'.' 
has 2"): - assert_file_grp_cardinality('FOO,BAR', 5) + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 5) with raises(AssertionError, match="Expected exactly 1 output file group, but '.'FOO', 'BAR'.' has 2"): - assert_file_grp_cardinality('FOO,BAR', 1) - assert_file_grp_cardinality('FOO,BAR', 2) + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 1) + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 2) with raises(AssertionError, match="Expected exactly 1 output file group .foo bar., but '.'FOO', 'BAR'.' has 2"): - assert_file_grp_cardinality('FOO,BAR', 1, 'foo bar') + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 1, 'foo bar') def test_make_file_id_simple(): f = create_ocrd_file('MAX', ID="MAX_0012") diff --git a/tests/test_workspace.py b/tests/test_workspace.py index c8df9b444b..ad9cd15575 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -87,10 +87,10 @@ def test_workspace_add_file_overwrite(plain_workspace): plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys1', local_filename=fpath) with pytest.raises(FileExistsError) as fn_exc: plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id=None, local_filename=fpath) - assert str(fn_exc.value) == "File with file_id='ID1' already exists" + assert "already exists" in str(fn_exc.value) with pytest.raises(FileExistsError) as fn_exc: plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys2', local_filename=fpath, force=True) - assert 'cannot mitigate' in str(fn_exc.value) + assert 'cannot mitigate' in str(fn_exc.value) plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT2', page_id='phys1', local_filename=fpath, force=True) f = plain_workspace.mets.find_all_files()[0] @@ -270,9 +270,9 @@ def test_remove_file_force(sbb_data_workspace): # TODO check semantics - can a non-existent thing be removed? 
     assert not sbb_data_workspace.remove_file('non-existing-id', force=True)
-    # should also succeed
-    sbb_data_workspace.overwrite_mode = True
-    assert not sbb_data_workspace.remove_file('non-existing-id', force=False)
+    with pytest.raises(FileNotFoundError) as not_found_exc:
+        sbb_data_workspace.remove_file('non-existing-id', force=False)
+    assert "not found in METS" in str(not_found_exc.value)
 
 
 def test_remove_file_remote_not_available_raises_exception(plain_workspace):
@@ -292,9 +292,9 @@ def test_remove_file_remote(plain_workspace):
     assert plain_workspace.remove_file('page1_img', force=True)
     # TODO check returned value
-    # should also "succeed", because overwrite_mode is set which also sets 'force' to 'True'
-    plain_workspace.overwrite_mode = True
-    assert not plain_workspace.remove_file('page1_img')
+    with pytest.raises(FileNotFoundError) as not_found_exc:
+        plain_workspace.remove_file('page1_img')
+    assert "not found in METS" in str(not_found_exc.value)
 
 
 def test_rename_file_group(tmp_path):
@@ -341,9 +341,6 @@ def test_remove_file_group_force(sbb_data_workspace):
     # check function and tests semantics
     # should succeed
     assert not sbb_data_workspace.remove_file_group('I DO NOT EXIST', force=True)
-    # should also succeed
-    sbb_data_workspace.overwrite_mode = True
-    assert not sbb_data_workspace.remove_file_group('I DO NOT EXIST', force=False)
 
 
 def test_remove_file_group_rmdir(sbb_data_tmp, sbb_data_workspace):
@@ -417,7 +414,7 @@ def test_save_image_file_invalid_mimetype_raises_exception(plain_workspace):
     # act raise
     with pytest.raises(KeyError) as key_exc:
-        plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'ceci/nest/pas/une/mimetype')
+        plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='ceci/nest/pas/une/mimetype')
     assert "'ceci/nest/pas/une/mimetype'" == str(key_exc.value)
 
 
@@ -428,13 +425,18 @@ def test_save_image_file(plain_workspace):
     img = Image.new('RGB', (1000, 1000))
 
     # act
-    assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg')
+    assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg')
     assert exists(join(plain_workspace.directory, 'IMG', 'page1_img.jpg'))
     # should succeed
-    assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg', force=True)
-    # should also succeed
-    plain_workspace.overwrite_mode = True
-    assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg')
+    assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg', force=True)
+    # should fail
+    with pytest.raises(FileExistsError) as exists_exc:
+        plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg')
+    assert "neither force nor ignore are set" in str(exists_exc.value)
+
+    # check file_path kwarg
+    assert plain_workspace.save_image_file(img, 'page1_img2', 'IMG', page_id='page1', file_path='IMG/page1_img2.png')
+    assert exists(join(plain_workspace.directory, 'IMG', 'page1_img2.png'))
 
 
 @pytest.fixture(name='workspace_kant_aufklaerung')
@@ -484,8 +486,10 @@ def test_image_from_page_basic(workspace_gutachten_data):
         pcgts = parseString(f.read().encode('utf8'), silence=True)
     # act + assert
-    _, info, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017', feature_selector='clipped', feature_filter='cropped')
-    assert info['features'] == 'binarized,clipped'
+    img, coords, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017', feature_selector='clipped', feature_filter='cropped')
+    assert coords['features'] == 'binarized,clipped'
+    assert isinstance(img.info.get('dpi', None), tuple)
+    assert img.info['dpi'][0] == coords['DPI']
     _, info, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017')
     assert info['features'] == 'binarized,clipped'
@@ -526,6 +530,7 @@ def test_deskewing(plain_workspace):
     skew = 4.625
     image = Image.new('L', size)
     image = polygon_mask(image, poly)
+    image.info['dpi'] = (300, 300)
     #image.show(title='image')
     pixels = np.count_nonzero(np.array(image) > 0)
     name = 'foo0'
@@ -536,9 +541,12 @@
         Coords=CoordsType(points=points_from_polygon(poly)), orientation=-skew)
     page.add_TextRegion(region)
-    page_image, page_coords, _ = plain_workspace.image_from_page(page, '')
+    page_image, page_coords, page_info = plain_workspace.image_from_page(page, '')
     #page_image.show(title='page_image')
     assert list(image.getdata()) == list(page_image.getdata())
+    assert 'dpi' in page_image.info
+    assert round(page_image.info['dpi'][0]) == 300
+    assert page_coords['DPI'] == 300
     assert np.all(page_coords['transform'] == np.eye(3))
     reg_image, reg_coords = plain_workspace.image_from_segment(region, page_image, page_coords, feature_filter='deskewed', fill=0)
@@ -547,6 +555,7 @@
     assert reg_image.height == xywh['h'] == 335
     assert reg_coords['transform'][0, 2] == -xywh['x']
     assert reg_coords['transform'][1, 2] == -xywh['y']
+    assert round(reg_image.info['dpi'][0]) == 300
     # same fg after cropping to minimal bbox
     reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
     assert pixels == reg_pixels
@@ -558,6 +567,7 @@
     assert reg_coords['transform'][0, 1] != 0
     assert reg_coords['transform'][1, 0] != 0
     assert 'deskewed' in reg_coords['features']
+    assert round(reg_image.info['dpi'][0]) == 300
     # same fg after cropping to minimal bbox (roughly - due to aliasing)
     reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
     assert np.abs(pixels - reg_pixels) / pixels < 0.005
@@ -579,6 +589,7 @@
     assert reg_image2.height == reg_image.height
     assert np.allclose(reg_coords2['transform'], reg_coords['transform'])
     assert reg_coords2['features'] == reg_coords['features']
+    assert round(reg_image2.info['dpi'][0]) == 300
     # same fg after cropping to minimal bbox (roughly - due to aliasing)
     reg_pixels2 = np.count_nonzero(np.array(reg_image) > 0)
     assert reg_pixels2 == reg_pixels
@@ -673,7 +684,7 @@ def test_merge_overwrite(tmp_path):
         ws1.add_file('X', page_id='X', mimetype='X', file_id='id123', local_filename='X/X', content='ws1')
         ws2.add_file('X', page_id='X', mimetype='X', file_id='id456', local_filename='X/X', content='ws2')
         ws1.merge(ws2)
-    assert "would overwrite" == str(exc.value)
+    assert "would overwrite" in str(exc.value)
 
 def test_merge_with_filter(plain_workspace, tmp_path):
     # arrange
@@ -734,7 +745,7 @@ def _fixture_metsDocumentID(tmp_path):
 def test_agent_before_metsDocumentID(workspace_metsDocumentID):
     report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target)
     assert report.is_valid
-    workspace_metsDocumentID.mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'OTHER')
+    workspace_metsDocumentID.mets.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='OTHER')
     workspace_metsDocumentID.save_mets()
     report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target)
     print(report.errors)
diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py
index 99595a864c..a94eb5d3cc 100644
--- a/tests/utils/test_config.py
+++ b/tests/utils/test_config.py
@@ -57,3 +57,11 @@ def test_OCRD_PROFILE():
     with temp_env_var('OCRD_PROFILE', 'some other value'):
         with raises(ValueError, match="'OCRD_PROFILE' set to invalid value 'some other value'"):
             config.OCRD_PROFILE
+
+def test_defaults():
+    default = config.OCRD_MAX_PROCESSOR_CACHE
+    print(type(default))
+    config.OCRD_MAX_PROCESSOR_CACHE = 2
+    assert config.OCRD_MAX_PROCESSOR_CACHE == 2
+    config.reset_defaults()
+    assert config.OCRD_MAX_PROCESSOR_CACHE == default
diff --git a/tests/validator/test_json_validator.py b/tests/validator/test_json_validator.py
index 8a8387d4b6..d81c894f97 100644
--- a/tests/validator/test_json_validator.py
+++ b/tests/validator/test_json_validator.py
@@ -1,5 +1,5 @@
 from tests.base import TestCase, main
-from ocrd_validators.json_validator import JsonValidator, DefaultValidatingDraft6Validator
+from ocrd_validators.json_validator import JsonValidator, DefaultValidatingDraft20199Validator
 
 class TestParameterValidator(TestCase):
 
@@ -15,23 +15,23 @@ def setUp(self):
                 }
             }
         }
-        self.defaults_validator = JsonValidator(self.schema, DefaultValidatingDraft6Validator)
+        self.defaults_validator = JsonValidator(self.schema, DefaultValidatingDraft20199Validator)
         super().setUp()
 
     def test_validate_string(self):
         report = JsonValidator.validate('{}', {})
-        self.assertTrue(report.is_valid)
+        self.assertTrue(report.is_valid, str(report.to_xml()))
 
     def test_defaults_set(self):
         obj = {'bar': 2000}
         report = self.defaults_validator._validate(obj)
-        self.assertTrue(report.is_valid)
+        self.assertTrue(report.is_valid, str(report.to_xml()))
         self.assertEqual(obj, {'foo': 3000, 'bar': 2000})
 
     def test_properr(self):
         obj = {'bar': 100, 'quux': {}}
         report = self.defaults_validator._validate(obj)
-        self.assertFalse(report.is_valid)
+        self.assertFalse(report.is_valid, str(report.to_xml()))
         self.assertEqual(len(report.errors), 1)
diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py
index 3ad40d8645..df19e8e64c 100644
--- a/tests/validator/test_ocrd_tool_validator.py
+++ b/tests/validator/test_ocrd_tool_validator.py
@@ -12,8 +12,8 @@
     "ocrd-xyz": {
         "executable": "ocrd-xyz",
         "description": "bars all the foos",
-        "input_file_grp": ["OCR-D-FOO"],
-        "output_file_grp": ["OCR-D-BAR"],
+        "input_file_grp_cardinality": 1,
+        "output_file_grp_cardinality": 1,
         "categories": ["Layout analysis"],
         "steps": ["layout/analysis"]
     }
@@ -29,7 +29,7 @@ def setUp(self):
 
     def test_smoke(self):
         report = OcrdToolValidator.validate(self.ocrd_tool)
-        self.assertEqual(report.is_valid, True)
+        self.assertTrue(report.is_valid, str(report.to_xml()))
 
     def test_additional_props(self):
         self.ocrd_tool['not-allowed'] = 'YUP'
@@ -48,7 +48,7 @@ def test_file_param_ok(self):
         ocrd_tool = json.loads(skeleton)
         ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}}
         report = OcrdToolValidator.validate(ocrd_tool)
-        self.assertEqual(report.is_valid, True)
+        self.assertTrue(report.is_valid, str(report.to_xml()))
 
     # Not restricted anymore since spec 3.3.0
     # def test_file_param_bad_content_types(self):
diff --git a/tests/validator/test_page_validator.py b/tests/validator/test_page_validator.py
index 79e92d90fa..e6aaff1523 100644
--- a/tests/validator/test_page_validator.py
+++ b/tests/validator/test_page_validator.py
@@ -16,9 +16,10 @@
     def test_validate_err(self):
         PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_strategy='best')
         # test with deprecated name
         with self.assertRaisesRegex(Exception, 'page_textequiv_strategy best not implemented'):
-            PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best')
+            with self.assertWarnsRegex(DeprecationWarning, r'use page_textequiv_strategy'):
+                PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best')
         with self.assertRaisesRegex(Exception, 'page_textequiv_consistency level superstrictest not implemented'):
-            PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', strategy='first')
+            PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', page_textequiv_strategy='first')
 
     def test_validate_filename(self):
         report = PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME)
@@ -44,7 +45,7 @@ def test_validate_lax(self):
         report = PageValidator.validate(ocrd_page=ocrd_page)
         self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 26, '26 textequiv consistency errors - strict')
-        report = PageValidator.validate(ocrd_page=ocrd_page, strictness='lax')
+        report = PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='lax')
         self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 1, '1 textequiv consistency errors - lax')
 
     def test_validate_multi_textequiv_first(self):
@@ -89,7 +90,7 @@ def test_fix(self):
         ocrd_page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True)
         report = PageValidator.validate(ocrd_page=ocrd_page)
         self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 17, '17 textequiv consistency errors')
-        PageValidator.validate(ocrd_page=ocrd_page, strictness='fix')
+        PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='fix')
         report = PageValidator.validate(ocrd_page=ocrd_page)
         self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 0, 'no more textequiv consistency errors')
diff --git a/tests/validator/test_parameter_validator.py b/tests/validator/test_parameter_validator.py
index f0d9d41d2c..297a149064 100644
--- a/tests/validator/test_parameter_validator.py
+++ b/tests/validator/test_parameter_validator.py
@@ -42,7 +42,7 @@ def test_default_assignment(self):
         })
         obj = {'baz': '23'}
         report = validator.validate(obj)
-        self.assertTrue(report.is_valid)
+        self.assertTrue(report.is_valid, str(report.to_xml()))
         self.assertEqual(obj, {'baz': '23', "num-param": 1})
 
 def test_min_max():
diff --git a/tests/validator/test_resource_list_validator.py b/tests/validator/test_resource_list_validator.py
index eb95d9b1ea..cc63c30ea7 100644
--- a/tests/validator/test_resource_list_validator.py
+++ b/tests/validator/test_resource_list_validator.py
@@ -22,8 +22,7 @@ def reslist():
 
 def test_resource_list_validator(reslist):
     report = OcrdResourceListValidator.validate(reslist)
-    print(report.errors)
-    assert report.is_valid == True
+    assert report.is_valid, str(report.to_xml())
 
 if __name__ == '__main__':
     main(__file__)
diff --git a/tests/validator/test_workspace_validator.py b/tests/validator/test_workspace_validator.py
index bc516d5a53..2e63bb5495 100644
--- a/tests/validator/test_workspace_validator.py
+++ b/tests/validator/test_workspace_validator.py
@@ -90,7 +90,7 @@ def test_validate_file_groups_non_ocrd(self):
         self.assertEqual(len(report.errors), 1)
         self.assertIn('No files', report.errors[0])
         self.assertEqual(len(report.notices), 1)
-        self.assertIn("USE does not begin with 'OCR-D-'", report.notices[0])
+        self.assertIn("fileGrp USE 'FOO' does not begin with 'OCR-D-'", report.notices[0])
 
     def test_validate_file_groups_unspecified(self):
         with TemporaryDirectory() as tempdir:
diff --git a/tests/validator/test_xsd_validator.py b/tests/validator/test_xsd_validator.py
index d0150338dd..50b3851ffc 100644
--- a/tests/validator/test_xsd_validator.py
+++ b/tests/validator/test_xsd_validator.py
@@ -37,22 +37,22 @@ def test_mets_empty(self):
 
     def test_validate_simple_protected_str(self):
         val = XsdValidator(XSD_METS_URL)
         report = val._validate(self.ws.mets.to_xml())
-        self.assertTrue(report.is_valid)
+        self.assertTrue(report.is_valid, str(report.to_xml()))
 
     def test_validate_simple_protected_doc(self):
         val = XsdValidator(XSD_METS_URL)
         report = val._validate(self.ws.mets._tree)
-        self.assertTrue(report.is_valid)
+        self.assertTrue(report.is_valid, str(report.to_xml()))
 
     def test_validate_simple_static_doc(self):
         report = XsdValidator.validate(XSD_METS_URL, self.ws.mets._tree)
-        self.assertTrue(report.is_valid)
+        self.assertTrue(report.is_valid, str(report.to_xml()))
 
 class TestXsdPageValidator(TestCase):
 
     def test_validate_page_simple_static_doc(self):
         report = XsdPageValidator.validate(simple_page)
-        self.assertTrue(report.is_valid)
+        self.assertTrue(report.is_valid, str(report.to_xml()))
 
 if __name__ == '__main__':
     main(__file__)
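
Note (not part of the patch above): the recurring pattern in these test changes is that calls using deprecated keyword arguments are wrapped in pytest.warns(DeprecationWarning, ...), nested inside an existing raises/assertRaisesRegex context wherever one call both warns and then raises. A minimal self-contained sketch of that nesting, using a hypothetical resolve() stand-in rather than any actual OCR-D API:

import warnings

import pytest


def resolve(directory, basename=None):
    # hypothetical stand-in: warns on the deprecated kwarg, then rejects bad values
    if basename is not None:
        warnings.warn('basename is deprecated, use directory only', DeprecationWarning)
        if basename != 'mets.xml':
            raise ValueError('non-default basename not supported')
    return directory


def test_deprecated_kwarg_still_warns():
    # deprecated but valid: assert the warning alone
    with pytest.warns(DeprecationWarning, match='deprecated'):
        assert resolve('/data', basename='mets.xml') == '/data'


def test_deprecated_kwarg_warns_then_raises():
    # deprecated and invalid: nest warns inside raises so both are asserted
    with pytest.raises(ValueError, match='not supported'):
        with pytest.warns(DeprecationWarning, match='deprecated'):
            resolve('/data', basename='other.xml')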