From 833dac713c1dae3f7c4a3d931abdef866f670213 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 10 Jun 2024 23:49:42 +0200 Subject: [PATCH 001/228] deprecate Processor.process() --- src/ocrd/__init__.py | 2 +- src/ocrd/processor/base.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/ocrd/__init__.py b/src/ocrd/__init__.py index 62b6ffbc0a..9aa507b2c0 100644 --- a/src/ocrd/__init__.py +++ b/src/ocrd/__init__.py @@ -14,7 +14,7 @@ """ -from ocrd.processor.base import run_processor, run_cli, Processor +from ocrd.processor.base import run_processor, run_cli, Processor, ResourceNotFoundError from ocrd_models import OcrdMets, OcrdExif, OcrdFile, OcrdAgent from ocrd.resolver import Resolver from ocrd_validators import * diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 931d945d45..6b10d61b06 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -18,8 +18,9 @@ import sys import tarfile import io -from ocrd.workspace import Workspace +from deprecated import deprecated +from ocrd.workspace import Workspace from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -175,6 +176,9 @@ def __init__( if not report.is_valid: raise Exception("Invalid parameters %s" % report.errors) self.parameter = parameter + # workaround for deprecated#72 (deprecation does not work for subclasses): + setattr(self, 'process', + deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) def show_help(self, subcommand=None): print(generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)) @@ -188,6 +192,7 @@ def verify(self): """ return True + @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') def process(self) -> None: """ Process the :py:attr:`workspace` From 3f4c7f99a70bcbb881c4eed43315eacf8117fbdc Mon Sep 17 00:00:00 2001 From: Robert 
Sachunsky Date: Tue, 11 Jun 2024 00:36:29 +0200 Subject: [PATCH 002/228] fix #274: no default -I / -O --- src/ocrd/decorators/__init__.py | 2 ++ src/ocrd/decorators/ocrd_cli_options.py | 7 ++----- src/ocrd/processor/base.py | 7 ++----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 811587a10d..cbeadc8d7b 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -71,6 +71,8 @@ def ocrd_cli_wrap_processor( initLogging() LOG = getLogger('ocrd.cli_wrap_processor') + assert kwargs['input_file_grp'] is not None + assert kwargs['output_file_grp'] is not None # LOG.info('kwargs=%s' % kwargs) if 'parameter' in kwargs: # Disambiguate parameter file/literal, and resolve file diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index f329558388..e640a20032 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -29,11 +29,8 @@ def cli(mets_url): option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME), option('-w', '--working-dir', help="Working Directory"), option('-U', '--mets-server-url', help="METS server URL. 
Starts with http:// then TCP, otherwise unix socket path"), - # TODO OCR-D/core#274 - # option('-I', '--input-file-grp', required=True), - # option('-O', '--output-file-grp', required=True), - option('-I', '--input-file-grp', default='INPUT'), - option('-O', '--output-file-grp', default='OUTPUT'), + option('-I', '--input-file-grp', default=None), + option('-O', '--output-file-grp', default=None), option('-g', '--page-id'), option('--overwrite', is_flag=True, default=False), option('--profile', is_flag=True, default=False), diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 6b10d61b06..b0cb1e26af 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -67,11 +67,8 @@ def __init__( workspace : Workspace, ocrd_tool=None, parameter=None, - # TODO OCR-D/core#274 - # input_file_grp=None, - # output_file_grp=None, - input_file_grp="INPUT", - output_file_grp="OUTPUT", + input_file_grp=None, + output_file_grp=None, page_id=None, resolve_resource=None, show_resource=None, From d2b5df3a0ad0293b258149dad242cb56964206c0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 18 Jun 2024 11:19:20 +0200 Subject: [PATCH 003/228] workspace.download: fix typo in exception --- src/ocrd/workspace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 41ea8e9005..8ce42a070d 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -208,7 +208,7 @@ def download_file(self, f, _recursion_count=0): self.baseurl, f.local_filename) url = '%s/%s' % (self.baseurl, f.local_filename) else: - raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file," + raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file, " "and no 'url' to download and no 'baseurl' set on workspace - nothing we can do.") file_path = Path(f.local_filename) self.resolver.download_to_directory(self.directory, url, subdir=file_path.parent, 
basename=file_path.name) @@ -219,7 +219,7 @@ def download_file(self, f, _recursion_count=0): f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename) return f # If neither f.local_filename nor f.url is set, fail - raise ValueError("OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded") + raise ValueError(f"OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded") def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False): """ From 9827c4d18d42f36a94c65621442be29a98e7254e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 15:00:33 +0200 Subject: [PATCH 004/228] Processor: factor-out show_resource(), delegate to resolve_resource() --- src/ocrd/processor/base.py | 41 +++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index b0cb1e26af..263f81d631 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -129,27 +129,22 @@ def __init__( for res in self.list_all_resources(): print(res) return - if resolve_resource or show_resource: - initLogging() + if resolve_resource: try: - res_fname = self.resolve_resource(resolve_resource or show_resource) + res = self.resolve_resource(resolve_resource) + print(res) + except ResourceNotFoundError as e: + log = getLogger('ocrd.processor.base') + log.critical(e.message) + sys.exit(1) + return + if show_resource: + try: + self.show_resource(show_resource) except ResourceNotFoundError as e: log = getLogger('ocrd.processor.base') log.critical(e.message) sys.exit(1) - if resolve_resource: - print(res_fname) - return - fpath = Path(res_fname) - if fpath.is_dir(): - with pushd_popd(fpath): - fileobj = io.BytesIO() - with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball: - tarball.add('.') - fileobj.seek(0) - copyfileobj(fileobj, 
sys.stdout.buffer) - else: - sys.stdout.buffer.write(fpath.read_bytes()) return if show_help: self.show_help(subcommand=subcommand) @@ -235,6 +230,7 @@ def resolve_resource(self, val): Args: val (string): resource value to resolve """ + initLogging() executable = self.ocrd_tool['executable'] log = getLogger('ocrd.processor.base') if exists(val): @@ -252,6 +248,19 @@ def resolve_resource(self, val): return ret[0] raise ResourceNotFoundError(val, executable) + def show_resource(self, val): + res_fname = self.resolve_resource(val) + fpath = Path(res_fname) + if fpath.is_dir(): + with pushd_popd(fpath): + fileobj = io.BytesIO() + with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball: + tarball.add('.') + fileobj.seek(0) + copyfileobj(fileobj, sys.stdout.buffer) + else: + sys.stdout.buffer.write(fpath.read_bytes()) + def list_all_resources(self): """ List all resources found in the filesystem and matching content-type by filename suffix From 38fd4aafdcafee803fce03a12aa4810cf4a2fba6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 15:00:59 +0200 Subject: [PATCH 005/228] Processor: add setup(), run once in get_processor() --- src/ocrd/processor/base.py | 11 ++++++++++- src/ocrd/processor/builtin/dummy_processor.py | 5 +++++ src/ocrd/processor/helpers.py | 4 +++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 263f81d631..5338f729c9 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -184,6 +184,16 @@ def verify(self): """ return True + def setup(self) -> None: + """ + Prepare the processor for actual data processing, + prior to changing to the workspace directory but + after parsing parameters. + + (Override this to load models into memory etc.) 
+ """ + pass + @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') def process(self) -> None: """ @@ -197,7 +207,6 @@ def process(self) -> None: """ raise NotImplementedError() - def add_metadata(self, pcgts): """ Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 774332a733..9223118c90 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -83,6 +83,11 @@ def __init__(self, *args, **kwargs): kwargs['version'] = '0.0.3' super(DummyProcessor, self).__init__(*args, **kwargs) + def setup(self): + super().setup() + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + @click.command() @ocrd_cli_options def cli(*args, **kwargs): diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index f5b6010636..9b74671ca3 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -398,11 +398,13 @@ def get_processor( cached_processor.input_file_grp = input_file_grp cached_processor.output_file_grp = output_file_grp return cached_processor - return processor_class( + processor = processor_class( workspace=workspace, page_id=page_id, input_file_grp=input_file_grp, output_file_grp=output_file_grp, parameter=parameter ) + processor.setup() + return processor raise ValueError("Processor class is not known") From 580988ad5c6422bbb7eaaa68c783b4ac156d30ee Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 13:50:36 +0200 Subject: [PATCH 006/228] ocrd_cli_wrap_processor: fix workspace arg (not a kwarg) --- src/ocrd/decorators/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index cbeadc8d7b..3d07957021 100644 
--- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -48,11 +48,11 @@ def ocrd_cli_wrap_processor( **kwargs ): if not sys.argv[1:]: - processorClass(workspace=None, show_help=True) + processorClass(None, show_help=True) sys.exit(1) if dump_json or dump_module_dir or help or version or show_resource or list_resources: processorClass( - workspace=None, + None, dump_json=dump_json, dump_module_dir=dump_module_dir, show_help=help, From 224dfc5098e9912b9c2bf87f851a52e79b51250b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:00:32 +0200 Subject: [PATCH 007/228] =?UTF-8?q?Processor:=20refactor=20processing=20AP?= =?UTF-8?q?I=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add method `process_workspace(workspace)` as a replacement for passing `workspace` in the constructor and then calling `process` (implemented by subclasses): implement in the superclass - loop over input files - delegate processing to new method `process_page_file()` if possible - otherwise fall back to old `process()` outside of loop - download input files when needed if `self.download` - add method `process_page_file()` as single-page processing procedure on OcrdFiles: implement in the superclass for the most frequent/default use-case of - (multi-) image/PAGE input files - (single) PAGE output files - delegate to new method `process_page_pcgts()` if available - add PAGE processing metadata - set PAGE PcGtsId - handle `make_file_id` and `workspace.add_file` - add method `process_page_pcgts()` as single-page processing function on OcrdPage: to be implemented only by subclasses - constructor: add kwarg `download_files` controlling `self.download` (see above) --- src/ocrd/processor/base.py | 117 ++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 8 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5338f729c9..78bc47c479 100644 --- 
a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -15,6 +15,7 @@ import os from os import getcwd from pathlib import Path +from typing import Optional import sys import tarfile import io @@ -32,9 +33,11 @@ list_all_resources, get_processor_resource_types, resource_filename, + make_file_id, ) from ocrd_validators import ParameterValidator -from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType +from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml +from ocrd_modelfactory import page_from_file # XXX imports must remain for backwards-compatibility from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import @@ -64,12 +67,15 @@ class Processor(): def __init__( self, - workspace : Workspace, + # FIXME: deprecate in favor of process_workspace(workspace) + workspace : Optional[Workspace], ocrd_tool=None, parameter=None, input_file_grp=None, output_file_grp=None, page_id=None, + download_files=True, + # FIXME: deprecate all the following in favor of respective methods resolve_resource=None, show_resource=None, list_resources=False, @@ -99,6 +105,7 @@ def __init__( output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. page_id (string): comma-separated list of METS physical ``page`` IDs to process \ (or empty for all pages). + download_files (boolean): Whether input files will be downloaded prior to processing. resolve_resource (string): If not ``None``, then instead of processing, resolve \ given resource by name and print its full path to stdout. show_resource (string): If not ``None``, then instead of processing, resolve \ @@ -154,15 +161,17 @@ def __init__( self.show_version() return self.workspace = workspace - # FIXME HACK would be better to use pushd_popd(self.workspace.directory) - # but there is no way to do that in process here since it's an - # overridden method. chdir is almost always an anti-pattern. 
if self.workspace: + # FIXME deprecate setting this and calling process() over using process_workspace() + # which uses pushd_popd(self.workspace.directory) + # (because there is no way to do that in process() since it's an + # overridden method. chdir is almost always an anti-pattern.) self.old_pwd = getcwd() os.chdir(self.workspace.directory) self.input_file_grp = input_file_grp self.output_file_grp = output_file_grp self.page_id = None if page_id == [] or page_id is None else page_id + self.download = download_files parameterValidator = ParameterValidator(ocrd_tool) report = parameterValidator.validate(parameter) if not report.is_valid: @@ -197,17 +206,109 @@ def setup(self) -> None: @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') def process(self) -> None: """ - Process the :py:attr:`workspace` + Process all files of the :py:attr:`workspace` from the given :py:attr:`input_file_grp` to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` + for the given :py:attr:`page_id` (or all pages) under the given :py:attr:`parameter`. (This contains the main functionality and needs to be overridden by subclasses.) """ raise NotImplementedError() - def add_metadata(self, pcgts): + def process_workspace(self, workspace: Workspace) -> None: + """ + Process all files of the given ``workspace``, + from the given :py:attr:`input_file_grp` + to the given :py:attr:`output_file_grp` + for the given :py:attr:`page_id` (or all pages) + under the given :py:attr:`parameter`. + + (This will iterate over pages and files, calling + :py:meth:`process_page`, handling exceptions.) 
+ """ + # assert self.input_file_grp is not None + # assert self.output_file_grp is not None + # input_file_grps = self.input_file_grp.split(',') + # for input_file_grp in input_file_grps: + # assert input_file_grp in workspace.mets.file_groups + log = getLogger('ocrd.processor.base') + with pushd_popd(workspace.directory): + self.workspace = workspace + try: + # FIXME: add page parallelization by running multiprocessing.Pool (#322) + for input_file_tuple in self.zip_input_files(on_error='abort'): + # FIXME: add error handling by catching exceptions in various ways (#579) + # for example: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + input_files = [None] * len(input_file_tuple) + for i, input_file in enumerate(input_file_tuple): + if i == 0: + log.info("processing page %s", input_file.pageId) + elif input_file is None: + # file/page not found in this file grp + continue + input_files[i] = input_file + if not self.download: + continue + try: + input_files[i] = self.workspace.download_file(input_file) + except ValueError as e: + log.error(repr(e)) + log.warning("skipping file %s for page %s", input_file, input_file.pageId) + self.process_page_file(*input_files) + except NotImplementedError: + # fall back to deprecated method + self.process() + + def process_page_file(self, *input_files) -> None: + """ + Process the given ``input_files`` of the :py:attr:`workspace`, + representing one physical page (passed as one opened + :py:class:`~ocrd_models.OcrdFile` per input fileGrp) + under the given :py:attr:`parameter`, and make sure the + results get added accordingly. + + (This uses process_page_pcgts, but can be overridden by subclasses + to handle cases like multiple fileGrps, non-PAGE input etc.) 
+ """ + log = getLogger('ocrd.processor.base') + input_pcgts = [None] * len(input_files) + for i, input_file in enumerate(input_files): + # FIXME: what about non-PAGE input like image or JSON ??? + log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) + try: + input_pcgts[i] = page_from_file(input_file) + except ValueError as e: + log.info("non-PAGE input for page %s: %s", input_file.pageId, e) + output_pcgts = self.process_page_pcgts(*input_pcgts) + output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_pcgts.set_pcGtsId(output_file_id) + self.add_metadata(output_pcgts) + # FIXME: what about save_image_file in process_page ??? + # FIXME: what about non-PAGE output like JSON ??? + self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_files[0].pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(output_pcgts)) + + def process_page_pcgts(self, *input_pcgts) -> OcrdPage: + """ + Process the given ``input_pcgts`` of the :py:attr:`workspace`, + representing one physical page (passed as one parsed + :py:class:`~ocrd_models.OcrdPage` per input fileGrp) + under the given :py:attr:`parameter`, and return the + resulting :py:class:`~ocrd_models.OcrdPage`. + + (This contains the main functionality and must be overridden by subclasses.) + """ + raise NotImplementedError() + + def add_metadata(self, pcgts: OcrdPage) -> None: """ Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``. 
From 9714aaba47f74d5023255b20f0d9136eaf6cc12e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:56:28 +0200 Subject: [PATCH 008/228] DummyProcessor: re-implement via new process_page_* - implement `process_page_pcgts` with behaviour for `copy_files=False` - override superclass `process_page_file` with behaviour for `copy_files=True` - remove old `process` implementation --- src/ocrd/processor/builtin/dummy_processor.py | 75 ++++++++----------- 1 file changed, 32 insertions(+), 43 deletions(-) diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 9223118c90..d16e182719 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -24,59 +24,48 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process(self) -> None: + def process_page_pcgts(self, *input_pcgts): + # nothing to do here + return input_pcgts[0] + + def process_page_file(self, *input_files): LOG = getLogger('ocrd.dummy') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - copy_files = self.parameter['copy_files'] - for input_file in self.input_files: - input_file = self.workspace.download_file(input_file) + input_file = input_files[0] + if self.parameter['copy_files'] and input_file.mimetype != MIMETYPE_PAGE: + # we need to mimic the actual copying in addition to the PAGE boilerplate file_id = make_file_id(input_file, self.output_file_grp) ext = MIME_TO_EXT.get(input_file.mimetype, '') local_filename = join(self.output_file_grp, file_id + ext) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - if input_file.mimetype == MIMETYPE_PAGE: - LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) - # Source file is PAGE-XML: 
Write out in-memory PcGtsType - self.workspace.add_file( + LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) + with open(input_file.local_filename, 'rb') as f: + content = f.read() + output_file = self.workspace.add_file( file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=local_filename, - content=to_xml(pcgts).encode('utf-8')) - else: - # Source file is not PAGE-XML: Copy byte-by-byte unless copy_files is False - if not copy_files: - LOG.info("Not copying %s because it is not a PAGE-XML file and copy_files was false" % input_file.local_filename) - else: - LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) - with open(input_file.local_filename, 'rb') as f: - content = f.read() - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=input_file.mimetype, - local_filename=local_filename, - content=content) - if input_file.mimetype.startswith('image/'): - # write out the PAGE-XML representation for this image - page_file_id = file_id + '_PAGE' - pcgts.set_pcGtsId(page_file_id) - pcgts.get_Page().set_imageFilename(local_filename if copy_files else input_file.local_filename) - page_filename = join(self.output_file_grp, file_id + '.xml') - LOG.info("Add PAGE-XML %s generated for %s at %s", page_file_id, file_id, page_filename) - self.workspace.add_file( - file_id=page_file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=page_filename, - content=to_xml(pcgts).encode('utf-8')) + content=content) + file_id = file_id + '_PAGE' + pcgts = page_from_file(output_file) + pcgts = self.process_page_pcgts(pcgts) + pcgts.set_pcGtsId(file_id) + self.add_metadata(pcgts) + LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file) + self.workspace.add_file(file_id=file_id, + file_grp=self.output_file_grp, + 
page_id=input_file.pageId, + local_filename=join(self.output_file_grp, file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(pcgts)) + else: + if self.parameter['copy_files']: + LOG.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) + else: + LOG.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", input_file.local_filename) + # we can rely on base implementation verbatim + super().process_page_file(input_file) def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dummy'] From e5d4736fd73f1e6a765141a7679a710de6009c7f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:18:06 +0200 Subject: [PATCH 009/228] =?UTF-8?q?run=5Fprocessor:=20adapt=20to=20process?= =?UTF-8?q?=E2=86=92process=5Fworkspace?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/ocrd/processor/helpers.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 9b74671ca3..b4b798706b 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -83,7 +83,6 @@ def run_processor( log = getLogger('ocrd.processor.helpers.run_processor') log.debug("Running processor %s", processorClass) - old_cwd = getcwd() processor = get_processor( processor_class=processorClass, parameter=parameter, @@ -93,8 +92,6 @@ def run_processor( output_file_grp=output_file_grp, instance_caching=instance_caching ) - processor.workspace = workspace - chdir(processor.workspace.directory) ocrd_tool = processor.ocrd_tool name = '%s v%s' % (ocrd_tool['executable'], processor.version) @@ -107,7 +104,7 @@ def run_processor( backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' from memory_profiler import memory_usage try: - mem_usage = memory_usage(proc=processor.process, + mem_usage = 
memory_usage(proc=processor.process_workspace(workspace), # only run process once max_iterations=1, interval=.1, timeout=None, timestamps=True, @@ -118,8 +115,6 @@ def run_processor( except Exception as err: log.exception("Failure in processor '%s'" % ocrd_tool['executable']) raise err - finally: - chdir(old_cwd) mem_usage_values = [mem for mem, _ in mem_usage] mem_output = 'memory consumption: ' mem_output += sparkline(mem_usage_values) @@ -127,12 +122,10 @@ def run_processor( logProfile.info(mem_output) else: try: - processor.process() + processor.process_workspace(workspace) except Exception as err: log.exception("Failure in processor '%s'" % ocrd_tool['executable']) raise err - finally: - chdir(old_cwd) t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu From 809a01b452069f6524c894f1cc0360e8dc5a1edf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:21:12 +0200 Subject: [PATCH 010/228] test DummyProcessor: adapt to new `download` default by setting `download_files=False` in tests (because they are not actually in the filesystem) --- tests/data/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 93a2ea49a9..113305e2b7 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -21,6 +21,7 @@ class DummyProcessor(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = DUMMY_TOOL kwargs['version'] = '0.0.1' + kwargs['download_files'] = False super(DummyProcessor, self).__init__(*args, **kwargs) def process(self): @@ -37,6 +38,7 @@ def __init__(self, *args, **kwargs): 'i-am-required': {'required': True} } } + kwargs['download_files'] = False super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs) class DummyProcessorWithOutput(Processor): @@ -44,6 +46,7 @@ class DummyProcessorWithOutput(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = DUMMY_TOOL kwargs['version'] = '0.0.1' + 
kwargs['download_files'] = False super().__init__(*args, **kwargs) def process(self): From dfe7f8ef223e8ebcb6baae35efce702b3166bd64 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:25:33 +0200 Subject: [PATCH 011/228] test DummyProcessor: override process_workspace() by delegating to process() directly --- tests/data/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 113305e2b7..d1edd2296e 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -27,6 +27,10 @@ def __init__(self, *args, **kwargs): def process(self): print(json.dumps(self.parameter)) + # override to prevent iterating over empty files + def process_workspace(self, workspace): + self.process() + class DummyProcessorWithRequiredParameters(Processor): def process(self): pass def __init__(self, *args, **kwargs): From 1550668518923203646de04bd8ffce8ec143a2ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:27:17 +0200 Subject: [PATCH 012/228] test builtin ocrd-dummy: adapt to consistent filename --- tests/processor/test_ocrd_dummy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/processor/test_ocrd_dummy.py b/tests/processor/test_ocrd_dummy.py index 41b585c6b9..b85379e47d 100644 --- a/tests/processor/test_ocrd_dummy.py +++ b/tests/processor/test_ocrd_dummy.py @@ -33,7 +33,7 @@ def test_copies_ok(self): output_files = workspace.mets.find_all_files(fileGrp='OUTPUT') output_files.sort(key=lambda x: x.url) assert output_files[0].local_filename == 'OUTPUT/OUTPUT_PHYS_0001.tif' - assert output_files[1].local_filename == 'OUTPUT/OUTPUT_PHYS_0001.xml' + assert output_files[1].local_filename == 'OUTPUT/OUTPUT_PHYS_0001_PAGE.xml' self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID) assert page_from_file(output_files[1]).get_Page().imageFilename == str(output_files[0].local_filename) self.assertEqual(len(output_files), 6) From 
75809b1949dfc5385a7c5156bbae2aace0b77c94 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:31:58 +0200 Subject: [PATCH 013/228] test processor: adapt to `input_file_grp` required --- tests/processor/test_processor.py | 6 ++++-- tests/test_logging.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 784f68fc3d..740846e895 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -112,7 +112,7 @@ def test_params(self): def test_run_agent(self): no_agents_before = len(self.workspace.mets.agents) - run_processor(DummyProcessor, workspace=self.workspace) + run_processor(DummyProcessor, workspace=self.workspace, input_file_grp="OCR-D-IMG") self.assertEqual(len(self.workspace.mets.agents), no_agents_before + 1, 'one more agent') # print(self.workspace.mets.agents[no_agents_before]) @@ -153,7 +153,9 @@ def test_run_output_overwrite(self): def test_run_cli(self): with TemporaryDirectory() as tempdir: - run_processor(DummyProcessor, workspace=self.workspace) + run_processor(DummyProcessor, workspace=self.workspace, + input_file_grp='OCR-D-IMG', + output_file_grp='OUTPUT') run_cli( 'echo', mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'), diff --git a/tests/test_logging.py b/tests/test_logging.py index 2e4e0861b5..c2b6913b10 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -139,7 +139,7 @@ def testProcessorProfiling(self): getLogger('ocrd.process.profile').setLevel('DEBUG') getLogger('ocrd.process.profile').addHandler(ch) - run_processor(DummyProcessor, resolver=Resolver(), mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) + run_processor(DummyProcessor, input_file_grp='OCR-D-IMG', resolver=Resolver(), mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) log_contents = log_capture_string.getvalue() log_capture_string.close() From c429da5deeddc7400e5de83fc897c70700cdfd4a Mon Sep 17 
00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 14:32:45 +0200 Subject: [PATCH 014/228] test processor: adapt to `self.workspace` only during run_processor --- tests/processor/test_processor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 740846e895..d65c5b3d41 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -54,12 +54,13 @@ def test_with_mets_url_input_files(self): input_file_grp='OCR-D-SEG-PAGE', resolver=self.resolver, workspace=self.workspace) + processor.workspace = self.workspace assert len(processor.input_files) == 2 assert [f.mimetype for f in processor.input_files] == [MIMETYPE_PAGE, MIMETYPE_PAGE] def test_parameter(self): with TemporaryDirectory(): - jsonpath = Path('params.json').name + jsonpath = 'params.json' with open(jsonpath, 'w') as f: f.write('{"baz": "quux"}') with open(jsonpath, 'r') as f: @@ -70,7 +71,7 @@ def test_parameter(self): resolver=self.resolver, workspace=self.workspace ) - self.assertEqual(len(processor.input_files), 3) + self.assertEqual(processor.parameter['baz'], 'quux') def test_verify(self): proc = DummyProcessor(self.workspace) From 295cdb63797bed56e2ae724ba9a8911454dca832 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Jun 2024 14:02:55 +0200 Subject: [PATCH 015/228] Workspace.save_image_file: add kwarg file_path for predetermined local_filename --- src/ocrd/workspace.py | 20 ++++++++++++-------- tests/test_workspace.py | 11 +++++++---- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 8ce42a070d..5b7db48c58 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1047,12 +1047,13 @@ def image_from_segment(self, segment, parent_image, parent_coords, return segment_image, segment_coords # pylint: disable=redefined-builtin - def save_image_file(self, image, - file_id, - file_grp, - 
page_id=None, - mimetype='image/png', - force=False): + def save_image_file(self, image : Image, + file_id : str, + file_grp : str, + file_path : Optional[str] = None, + page_id : Optional[str] = None, + mimetype : str = 'image/png', + force : bool = False) -> str: """Store an image in the filesystem and reference it as new file in the METS. Args: @@ -1060,12 +1061,14 @@ def save_image_file(self, image, file_id (string): `@ID` of the METS `file` to use file_grp (string): `@USE` of the METS `fileGrp` to use Keyword Args: + file_path (string): `@href` of the METS `file/FLocat` to use. page_id (string): `@ID` in the METS physical `structMap` to use mimetype (string): MIME type of the image format to serialize as force (boolean): whether to replace any existing `file` with that `@ID` Serialize the image into the filesystem, and add a `file` for it in the METS. - Use a filename extension based on ``mimetype``. + Use ``file_grp`` as directory and ``file_id`` concatenated with extension + based on ``mimetype`` as file name, unless directly passing ``file_path``. Returns: The (absolute) path of the created file. 
@@ -1075,7 +1078,8 @@ def save_image_file(self, image, force = True image_bytes = io.BytesIO() image.save(image_bytes, format=MIME_TO_PIL[mimetype]) - file_path = str(Path(file_grp, '%s%s' % (file_id, MIME_TO_EXT[mimetype]))) + if file_path is None: + file_path = str(Path(file_grp, '%s%s' % (file_id, MIME_TO_EXT[mimetype]))) out = self.add_file( file_grp, file_id=file_id, diff --git a/tests/test_workspace.py b/tests/test_workspace.py index c8df9b444b..0f325f5ba0 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -417,7 +417,7 @@ def test_save_image_file_invalid_mimetype_raises_exception(plain_workspace): # act raise with pytest.raises(KeyError) as key_exc: - plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'ceci/nest/pas/une/mimetype') + plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='ceci/nest/pas/une/mimetype') assert "'ceci/nest/pas/une/mimetype'" == str(key_exc.value) @@ -428,13 +428,16 @@ def test_save_image_file(plain_workspace): img = Image.new('RGB', (1000, 1000)) # act - assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg') + assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') assert exists(join(plain_workspace.directory, 'IMG', 'page1_img.jpg')) # should succeed - assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg', force=True) + assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg', force=True) # should also succeed plain_workspace.overwrite_mode = True - assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg') + assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') + # check file_path kwarg + assert plain_workspace.save_image_file(img, 'page1_img2', 'IMG', page_id='page1', file_path='IMG/page1_img2.png') + assert 
exists(join(plain_workspace.directory, 'IMG', 'page1_img2.png')) @pytest.fixture(name='workspace_kant_aufklaerung') From e2cbcb94eb5130bd2be937fa4d5fca119331e123 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Jun 2024 14:04:01 +0200 Subject: [PATCH 016/228] Processor.process_page_pcgts: add kwargs and allow returning derived images --- src/ocrd/processor/base.py | 25 +++++++++++++++---- src/ocrd/processor/builtin/dummy_processor.py | 2 +- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 78bc47c479..ddbf32b022 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -276,27 +276,37 @@ def process_page_file(self, *input_files) -> None: """ log = getLogger('ocrd.processor.base') input_pcgts = [None] * len(input_files) + page_id = input_files[0].pageId for i, input_file in enumerate(input_files): # FIXME: what about non-PAGE input like image or JSON ??? log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) try: input_pcgts[i] = page_from_file(input_file) except ValueError as e: - log.info("non-PAGE input for page %s: %s", input_file.pageId, e) - output_pcgts = self.process_page_pcgts(*input_pcgts) + log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) + if isinstance(output_pcgts, (list, tuple)): + output_images = output_pcgts[1:] + output_pcgts = output_pcgts[0] + for output_image_pil, output_image_id, output_image_path in output_images: + self.workspace.save_image_file( + output_image_pil, + output_image_id, + self.output_file_grp, + page_id=page_id, + file_path=output_image_path) output_pcgts.set_pcGtsId(output_file_id) self.add_metadata(output_pcgts) - # FIXME: what about save_image_file in process_page ??? # FIXME: what about non-PAGE output like JSON ??? 
self.workspace.add_file(file_id=output_file_id, file_grp=self.output_file_grp, - page_id=input_files[0].pageId, + page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(output_pcgts)) - def process_page_pcgts(self, *input_pcgts) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts, output_file_id : str = None, page_id : str = None) -> OcrdPage: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed @@ -304,6 +314,11 @@ def process_page_pcgts(self, *input_pcgts) -> OcrdPage: under the given :py:attr:`parameter`, and return the resulting :py:class:`~ocrd_models.OcrdPage`. + Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage` + and one or more lists or tuples of :py:class:`PIL.Image` (image data), + :py:class:str (file ID) and :py:class:str (file path) of derived images + to be annotated along with the resulting PAGE file. + (This contains the main functionality and must be overridden by subclasses.) 
""" raise NotImplementedError() diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index d16e182719..9916d70aea 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -24,7 +24,7 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process_page_pcgts(self, *input_pcgts): + def process_page_pcgts(self, *input_pcgts, output_file_id=None, page_id=None): # nothing to do here return input_pcgts[0] From 20a6a1cda0af286e7832595d6161bba13492bd4d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Jun 2024 23:44:37 +0200 Subject: [PATCH 017/228] Workspace.save_image_file: save DPI metadata, too --- src/ocrd/workspace.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 5b7db48c58..4a7eea432c 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1076,8 +1076,11 @@ def save_image_file(self, image : Image, log = getLogger('ocrd.workspace.save_image_file') if self.overwrite_mode: force = True + saveargs = dict() + if 'dpi' in image.info: + saveargs['dpi'] = image.info['dpi'] image_bytes = io.BytesIO() - image.save(image_bytes, format=MIME_TO_PIL[mimetype]) + image.save(image_bytes, format=MIME_TO_PIL[mimetype], **saveargs) if file_path is None: file_path = str(Path(file_grp, '%s%s' % (file_id, MIME_TO_EXT[mimetype]))) out = self.add_file( From 679ad85f6191e1529c4e739ddead15724be84134 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Jun 2024 23:46:31 +0200 Subject: [PATCH 018/228] Workspace.image_from_*: annotate 'DPI' in result dict and ensure it's used in meta-data of resulting image --- src/ocrd/workspace.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 4a7eea432c..bd9e4c5025 100644 --- 
a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -631,6 +631,7 @@ def image_from_page(self, page, page_id, i.e. after cropping to the page's border / bounding box (if any) and deskewing with the page's orientation angle (if any) - `"angle"`: the rotation/reflection angle applied to the image so far, + - `"DPI"`: the pixel density of the original image, - `"features"`: the `AlternativeImage` `@comments` for the image, i.e. names of all applied operations that lead up to this result, * an :py:class:`ocrd_models.ocrd_exif.OcrdExif` instance associated with @@ -672,6 +673,13 @@ def image_from_page(self, page, page_id, page_coords['angle'] = 0 # nothing applied yet (depends on filters) log.debug("page '%s' has %s orientation=%d skew=%.2f", page_id, "border," if border else "", orientation, skew) + if page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + dpi = int(dpi) + log.debug("page '%s' images will use %d DPI from image meta-data", page_id, dpi) + page_coords['DPI'] = dpi # initialize AlternativeImage@comments classes as empty: page_coords['features'] = '' @@ -790,6 +798,11 @@ def image_from_page(self, page, page_id, 'filter="%s" in page "%s"' % ( feature_filter, page_id)) page_image.format = 'PNG' # workaround for tesserocr#194 + # ensure DPI will be set in image meta-data again + if 'DPI' in page_coords: + dpi = page_coords['DPI'] + if 'dpi' not in page_image.info: + page_image.info['dpi'] = (dpi, dpi) return page_image, page_coords, page_image_info def image_from_segment(self, segment, parent_image, parent_coords, @@ -810,6 +823,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, converts from absolute coordinates to those relative to the image, i.e. 
after applying all operations (starting with the original image) - `"angle"`: the rotation/reflection angle applied to the image so far, + - `"DPI"`: the pixel density of the parent image, - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e. names of all operations that lead up to this result, and Keyword Args: @@ -875,6 +889,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, the segment's bounding box, and deskewing with the segment's orientation angle (if any) - `"angle"`: the rotation/reflection angle applied to the image so far, + - `"DPI"`: the pixel density of this image, - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e. names of all applied operations that lead up to this result. @@ -937,6 +952,8 @@ def image_from_segment(self, segment, parent_image, parent_coords, orientation = 0 skew = 0 segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters) + if 'DPI' in parent_coords: + segment_coords['DPI'] = parent_coords['DPI'] # not rescaled yet # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: @@ -1044,6 +1061,11 @@ def image_from_segment(self, segment, parent_image, parent_coords, 'filter="%s" in segment "%s"' % ( feature_filter, segment.id)) segment_image.format = 'PNG' # workaround for tesserocr#194 + # ensure DPI will be set in image meta-data again + if 'DPI' in segment_coords: + dpi = segment_coords['DPI'] + if 'dpi' not in segment_image.info: + segment_image.info['dpi'] = (dpi, dpi) return segment_image, segment_coords # pylint: disable=redefined-builtin From 565a3d9806793cede166c8dd2d342a35e294e1db Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Jun 2024 23:47:44 +0200 Subject: [PATCH 019/228] test_workspace: adapt to image_from_* DPI and add assertions --- tests/test_workspace.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git 
a/tests/test_workspace.py b/tests/test_workspace.py index 0f325f5ba0..2fe5f450a0 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -487,8 +487,10 @@ def test_image_from_page_basic(workspace_gutachten_data): pcgts = parseString(f.read().encode('utf8'), silence=True) # act + assert - _, info, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017', feature_selector='clipped', feature_filter='cropped') - assert info['features'] == 'binarized,clipped' + img, coords, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017', feature_selector='clipped', feature_filter='cropped') + assert coords['features'] == 'binarized,clipped' + assert isinstance(img.info.get('dpi', None), tuple) + assert img.info['dpi'][0] == coords['DPI'] _, info, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017') assert info['features'] == 'binarized,clipped' @@ -529,6 +531,7 @@ def test_deskewing(plain_workspace): skew = 4.625 image = Image.new('L', size) image = polygon_mask(image, poly) + image.info['dpi'] = (300, 300) #image.show(title='image') pixels = np.count_nonzero(np.array(image) > 0) name = 'foo0' @@ -539,9 +542,12 @@ def test_deskewing(plain_workspace): Coords=CoordsType(points=points_from_polygon(poly)), orientation=-skew) page.add_TextRegion(region) - page_image, page_coords, _ = plain_workspace.image_from_page(page, '') + page_image, page_coords, page_info = plain_workspace.image_from_page(page, '') #page_image.show(title='page_image') assert list(image.getdata()) == list(page_image.getdata()) + assert 'dpi' in page_image.info + assert round(page_image.info['dpi'][0]) == 300 + assert page_coords['DPI'] == 300 assert np.all(page_coords['transform'] == np.eye(3)) reg_image, reg_coords = plain_workspace.image_from_segment(region, page_image, page_coords, feature_filter='deskewed', fill=0) @@ -550,6 +556,7 @@ def test_deskewing(plain_workspace): assert reg_image.height == xywh['h'] == 
335 assert reg_coords['transform'][0, 2] == -xywh['x'] assert reg_coords['transform'][1, 2] == -xywh['y'] + assert round(reg_image.info['dpi'][0]) == 300 # same fg after cropping to minimal bbox reg_pixels = np.count_nonzero(np.array(reg_image) > 0) assert pixels == reg_pixels @@ -561,6 +568,7 @@ def test_deskewing(plain_workspace): assert reg_coords['transform'][0, 1] != 0 assert reg_coords['transform'][1, 0] != 0 assert 'deskewed' in reg_coords['features'] + assert round(reg_image.info['dpi'][0]) == 300 # same fg after cropping to minimal bbox (roughly - due to aliasing) reg_pixels = np.count_nonzero(np.array(reg_image) > 0) assert np.abs(pixels - reg_pixels) / pixels < 0.005 @@ -582,6 +590,7 @@ def test_deskewing(plain_workspace): assert reg_image2.height == reg_image.height assert np.allclose(reg_coords2['transform'], reg_coords['transform']) assert reg_coords2['features'] == reg_coords['features'] + assert round(reg_image2.info['dpi'][0]) == 300 # same fg after cropping to minimal bbox (roughly - due to aliasing) reg_pixels2 = np.count_nonzero(np.array(reg_image) > 0) assert reg_pixels2 == reg_pixels From 46f81aa75e42c692742b0e98de248e9ee44bfbfd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 6 Jul 2024 18:57:39 +0200 Subject: [PATCH 020/228] autoload ocrd-tool.json and version from dist, executable name from entry point in stack --- src/ocrd/processor/base.py | 56 ++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index ddbf32b022..a572b26cef 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -17,6 +17,7 @@ from pathlib import Path from typing import Optional import sys +import inspect import tarfile import io from deprecated import deprecated @@ -33,7 +34,9 @@ list_all_resources, get_processor_resource_types, resource_filename, + resource_string, make_file_id, + deprecation_warning ) from ocrd_validators import 
ParameterValidator from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml @@ -65,6 +68,38 @@ class Processor(): a number of optional or mandatory parameters. """ + @property + def metadata(self): + """the ocrd-tool.json dict of the package""" + if hasattr(self, '_metadata'): + return self._metadata + self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json')) + return self._metadata + + @property + def version(self): + """the version of the package""" + if hasattr(self, '_version'): + return self._version + self._version = self.metadata['version'] + return self._version + + @property + def executable(self): + """the executable name of this processor tool""" + if hasattr(self, '_executable'): + return self._executable + self._executable = os.path.basename(inspect.stack()[-1].filename) + return self._executable + + @property + def ocrd_tool(self): + """the ocrd-tool.json dict of this processor tool""" + if hasattr(self, '_ocrd_tool'): + return self._ocrd_tool + self._ocrd_tool = self.metadata['tools'][self.executable] + return self._ocrd_tool + def __init__( self, # FIXME: deprecate in favor of process_workspace(workspace) @@ -97,8 +132,6 @@ def __init__( Can be ``None`` even for processing (esp. on multiple workspaces), \ but then needs to be set before running. Keyword Args: - ocrd_tool (string): JSON of the ocrd-tool description for that processor. \ - Can be ``None`` for processing, but needs to be set before running. parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ Can be ``None`` even for processing, but then needs to be set before running. input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. @@ -123,11 +156,17 @@ def __init__( dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir` \ on stdout. 
""" - self.ocrd_tool = ocrd_tool - if parameter is None: - parameter = {} + if ocrd_tool is not None: + deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - " + "use or override metadata/executable/ocrd-tool properties instead") + self._ocrd_tool = ocrd_tool + self._executable = ocrd_tool['executable'] + if version is not None: + deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - " + "use or override metadata/version properties instead") + self._version = version if dump_json: - print(json.dumps(ocrd_tool, indent=True)) + print(json.dumps(self.ocrd_tool, indent=True)) return if dump_module_dir: print(self.moduledir) @@ -156,7 +195,6 @@ def __init__( if show_help: self.show_help(subcommand=subcommand) return - self.version = version if show_version: self.show_version() return @@ -172,7 +210,9 @@ def __init__( self.output_file_grp = output_file_grp self.page_id = None if page_id == [] or page_id is None else page_id self.download = download_files - parameterValidator = ParameterValidator(ocrd_tool) + if parameter is None: + parameter = {} + parameterValidator = ParameterValidator(self.ocrd_tool) report = parameterValidator.validate(parameter) if not report.is_valid: raise Exception("Invalid parameters %s" % report.errors) From 4dd83aaa25f3f13660700a258ab5abfa1887c2cf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 6 Jul 2024 19:00:06 +0200 Subject: [PATCH 021/228] adapt to new Processor init (override metadata/version/executable name) --- src/ocrd/cli/bashlib.py | 16 ++++++--- src/ocrd/cli/ocrd_tool.py | 36 ++++++++++++++----- src/ocrd/processor/builtin/dummy_processor.py | 15 +++++--- 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index 1def4638c7..8b79d82fbc 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -108,11 +108,17 @@ def bashlib_input_files(**kwargs): raise FileNotFoundError(msg) 
resolver = Resolver() workspace = resolver.workspace_from_url(mets, working_dir) - processor = Processor(workspace, - ocrd_tool=None, - page_id=kwargs['page_id'], - input_file_grp=kwargs['input_file_grp'], - output_file_grp=kwargs['output_file_grp']) + class BashlibProcessor(Processor): + @property + def ocrd_tool(self): + return {} + @property + def executable(self): + return '' + processor = BashlibProcessor(workspace, + page_id=kwargs['page_id'], + input_file_grp=kwargs['input_file_grp'], + output_file_grp=kwargs['output_file_grp']) for input_files in processor.zip_input_files(mimetype=None, on_error='abort'): # ensure all input files exist locally (without persisting them in the METS) # - this mimics the default behaviour of all Pythonic processors diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index 2a7fa99ec9..b9807b0d72 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -100,10 +100,15 @@ def ocrd_tool_tool_description(ctx): def ocrd_tool_tool_list_resources(ctx): class BashProcessor(Processor): @property + def metadata(self): + return ctx.json + @property + def executable(self): + return ctx.tool_name + @property def moduledir(self): return os.path.dirname(ctx.filename) - BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], - list_resources=True) + BashProcessor(None, list_resources=True) @ocrd_tool_tool.command('resolve-resource', help="Get a tool's file resource full path name") @click.argument('res_name') @@ -111,10 +116,15 @@ def moduledir(self): def ocrd_tool_tool_resolve_resource(ctx, res_name): class BashProcessor(Processor): @property + def metadata(self): + return ctx.json + @property + def executable(self): + return ctx.tool_name + @property def moduledir(self): return os.path.dirname(ctx.filename) - BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], - resolve_resource=res_name) + BashProcessor(None, resolve_resource=res_name) @ocrd_tool_tool.command('show-resource', help="Dump a 
tool's file resource") @click.argument('res_name') @@ -122,24 +132,34 @@ def moduledir(self): def ocrd_tool_tool_show_resource(ctx, res_name): class BashProcessor(Processor): @property + def metadata(self): + return ctx.json + @property + def executable(self): + return ctx.tool_name + @property def moduledir(self): return os.path.dirname(ctx.filename) - BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], - show_resource=res_name) + BashProcessor(None, show_resource=res_name) @ocrd_tool_tool.command('help', help="Generate help for processors") @click.argument('subcommand', required=False) @pass_ocrd_tool def ocrd_tool_tool_params_help(ctx, subcommand): class BashProcessor(Processor): + @property + def metadata(self): + return ctx.json + @property + def executable(self): + return ctx.tool_name # set docstrings to empty __doc__ = None # HACK: override the module-level docstring, too getmodule(OcrdToolCtx).__doc__ = None def process(self): return super() - BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], - show_help=True, subcommand=subcommand) + BashProcessor(None, show_help=True, subcommand=subcommand) # ---------------------------------------------------------------------- # ocrd ocrd-tool tool categories diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 9916d70aea..424c05772c 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -67,10 +67,17 @@ def process_page_file(self, *input_files): # we can rely on base implementation verbatim super().process_page_file(input_file) - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dummy'] - kwargs['version'] = '0.0.3' - super(DummyProcessor, self).__init__(*args, **kwargs) + @property + def metadata(self): + return OCRD_TOOL + + @property + def executable(self): + return 'ocrd-dummy' + + @property + def version(self): + return '0.0.3' def 
setup(self): super().setup() From 4cafbcc88f7ce1a1e89da3fd327746932632b687 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 6 Jul 2024 19:00:36 +0200 Subject: [PATCH 022/228] tests: adapt to new Processor init (override metadata/version/executable name) --- tests/data/__init__.py | 52 ++++++++++++++++++++++++------- tests/processor/test_processor.py | 21 ++++++++++--- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index d1edd2296e..ff403ebef6 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -17,12 +17,21 @@ } class DummyProcessor(Processor): + @property + def ocrd_tool(self): + return DUMMY_TOOL + + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = DUMMY_TOOL - kwargs['version'] = '0.0.1' kwargs['download_files'] = False - super(DummyProcessor, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) def process(self): print(json.dumps(self.parameter)) @@ -32,24 +41,43 @@ def process_workspace(self, workspace): self.process() class DummyProcessorWithRequiredParameters(Processor): - def process(self): pass - def __init__(self, *args, **kwargs): - kwargs['version'] = '0.0.1' - kwargs['ocrd_tool'] = { + @property + def ocrd_tool(self): + return { 'executable': 'ocrd-test', 'steps': ['recognition/post-correction'], 'parameters': { 'i-am-required': {'required': True} } } + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): kwargs['download_files'] = False - super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) + + def process(self): pass class DummyProcessorWithOutput(Processor): + @property + def ocrd_tool(self): + return DUMMY_TOOL + + @property + def version(self): + return '0.0.1' + + @property 
+ def executable(self): + return 'ocrd-test' def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = DUMMY_TOOL - kwargs['version'] = '0.0.1' kwargs['download_files'] = False super().__init__(*args, **kwargs) @@ -67,6 +95,8 @@ def process(self): content='CONTENT') class IncompleteProcessor(Processor): - pass + @property + def ocrd_tool(self): + return {} diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index d65c5b3d41..e0ebfbb1d5 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -108,7 +108,11 @@ def test_params_preset_resolve(self): overwrite=True) def test_params(self): - proc = Processor(workspace=self.workspace) + class ParamTestProcessor(Processor): + @property + def ocrd_tool(self): + return {} + proc = ParamTestProcessor(self.workspace) self.assertEqual(proc.parameter, {}) def test_run_agent(self): @@ -176,7 +180,10 @@ def test_run_cli(self): ) def test_zip_input_files(self): - class ZipTestProcessor(Processor): pass + class ZipTestProcessor(Processor): + @property + def ocrd_tool(self): + return {} with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') @@ -196,7 +203,10 @@ class ZipTestProcessor(Processor): pass assert ('foobar3', 'foobar4') in tuples def test_zip_input_files_multi_mixed(self): - class ZipTestProcessor(Processor): pass + class ZipTestProcessor(Processor): + @property + def ocrd_tool(self): + return {} with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') @@ -234,7 +244,10 @@ class ZipTestProcessor(Processor): pass tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): - class ZipTestProcessor(Processor): pass + class ZipTestProcessor(Processor): + @property + def ocrd_tool(self): 
+ return {} self.capture_out_err() with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) From 9c9a4c92258f76a146dc7d96fb262063649e2457 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 29 Jul 2024 13:18:36 +0200 Subject: [PATCH 023/228] generate_processor_help: include process_workspace docstring, too --- src/ocrd/processor/helpers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index b4b798706b..d94bec1247 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -230,6 +230,8 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n' if processor_instance.__doc__: doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n' + if processor_instance.process_workspace.__doc__: + doc_help += '\n' + inspect.cleandoc(processor_instance.process_workspace.__doc__) + '\n' if processor_instance.process.__doc__: doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n' if doc_help: From aa0bd68dc20e601e34f659c51b542308720b52c1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 8 Aug 2024 17:55:56 +0200 Subject: [PATCH 024/228] get_processor: also run setup if instance_caching --- src/ocrd/processor/helpers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index d94bec1247..e1de22770e 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -369,7 +369,9 @@ def get_cached_processor(parameter: dict, processor_class): """ if processor_class: dict_params = dict(parameter) if parameter else None - return processor_class(workspace=None, parameter=dict_params) + processor = processor_class(workspace=None, parameter=dict_params) + processor.setup() + return 
processor return None From 99d16281d63a3c641a26e01979b8a93841107bef Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 10:12:05 +0200 Subject: [PATCH 025/228] ocrd-tool CLI: pass class in context --- src/ocrd/cli/ocrd_tool.py | 70 +++++++++++++-------------------------- 1 file changed, 23 insertions(+), 47 deletions(-) diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index b9807b0d72..dacefab008 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -31,6 +31,25 @@ def __init__(self, filename): self.content = f.read() self.json = loads(self.content) + class BashProcessor(Processor): + @property + def metadata(inner_self): + return self.json + @property + def executable(inner_self): + return self.tool_name + @property + def moduledir(inner_self): + return os.path.dirname(self.filename) + # set docstrings to empty + __doc__ = None + # HACK: override the module-level docstring, too + getmodule(OcrdToolCtx).__doc__ = None + def process(inner_self): + return super() + + self.processor = BashProcessor + pass_ocrd_tool = click.make_pass_decorator(OcrdToolCtx) # ---------------------------------------------------------------------- @@ -98,68 +117,25 @@ def ocrd_tool_tool_description(ctx): @ocrd_tool_tool.command('list-resources', help="List tool's file resources") @pass_ocrd_tool def ocrd_tool_tool_list_resources(ctx): - class BashProcessor(Processor): - @property - def metadata(self): - return ctx.json - @property - def executable(self): - return ctx.tool_name - @property - def moduledir(self): - return os.path.dirname(ctx.filename) - BashProcessor(None, list_resources=True) + ctx.processor(None, list_resources=True) @ocrd_tool_tool.command('resolve-resource', help="Get a tool's file resource full path name") @click.argument('res_name') @pass_ocrd_tool def ocrd_tool_tool_resolve_resource(ctx, res_name): - class BashProcessor(Processor): - @property - def metadata(self): - return ctx.json - @property - def 
executable(self): - return ctx.tool_name - @property - def moduledir(self): - return os.path.dirname(ctx.filename) - BashProcessor(None, resolve_resource=res_name) + ctx.processor(None, resolve_resource=res_name) @ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource") @click.argument('res_name') @pass_ocrd_tool def ocrd_tool_tool_show_resource(ctx, res_name): - class BashProcessor(Processor): - @property - def metadata(self): - return ctx.json - @property - def executable(self): - return ctx.tool_name - @property - def moduledir(self): - return os.path.dirname(ctx.filename) - BashProcessor(None, show_resource=res_name) + ctx.processor(None, show_resource=res_name) @ocrd_tool_tool.command('help', help="Generate help for processors") @click.argument('subcommand', required=False) @pass_ocrd_tool def ocrd_tool_tool_params_help(ctx, subcommand): - class BashProcessor(Processor): - @property - def metadata(self): - return ctx.json - @property - def executable(self): - return ctx.tool_name - # set docstrings to empty - __doc__ = None - # HACK: override the module-level docstring, too - getmodule(OcrdToolCtx).__doc__ = None - def process(self): - return super() - BashProcessor(None, show_help=True, subcommand=subcommand) + ctx.processor(None, show_help=True, subcommand=subcommand) # ---------------------------------------------------------------------- # ocrd ocrd-tool tool categories From 12231b8ee8e581c071d2ffb7d10a3b261a4369ed Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:26:59 +0200 Subject: [PATCH 026/228] use more specific exception if parameters are invalid Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index a572b26cef..5860a28614 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -215,9 +215,9 @@ def 
__init__( parameterValidator = ParameterValidator(self.ocrd_tool) report = parameterValidator.validate(parameter) if not report.is_valid: - raise Exception("Invalid parameters %s" % report.errors) + raise ValueError("Invalid parameters %s" % report.errors) self.parameter = parameter - # workaround for deprecated#72 (deprecation does not work for subclasses): + # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) From d112f8ffb4885287cc35761805621a9c4eb0592a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:31:14 +0200 Subject: [PATCH 027/228] run_processor w/ mem_usage: pass as args tuple Co-authored-by: Konstantin Baierer --- src/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index e1de22770e..d9edaaa258 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -104,7 +104,7 @@ def run_processor( backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' from memory_profiler import memory_usage try: - mem_usage = memory_usage(proc=processor.process_workspace(workspace), + mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {})), # only run process once max_iterations=1, interval=.1, timeout=None, timestamps=True, From 319ceaa4e56c11c9e21f6b0e9c872d5a6a09e039 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 11:22:09 +0200 Subject: [PATCH 028/228] Processor.process_workspace: add fileGrp assertions --- src/ocrd/processor/base.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5860a28614..f1ecd8def0 100644 --- 
a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -246,12 +246,12 @@ def setup(self) -> None: @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') def process(self) -> None: """ - Process all files of the :py:attr:`workspace` + Process all files of the :py:attr:`workspace` from the given :py:attr:`input_file_grp` to the given :py:attr:`output_file_grp` for the given :py:attr:`page_id` (or all pages) under the given :py:attr:`parameter`. - + (This contains the main functionality and needs to be overridden by subclasses.) """ raise NotImplementedError() @@ -267,11 +267,11 @@ def process_workspace(self, workspace: Workspace) -> None: (This will iterate over pages and files, calling :py:meth:`process_page`, handling exceptions.) """ - # assert self.input_file_grp is not None - # assert self.output_file_grp is not None - # input_file_grps = self.input_file_grp.split(',') - # for input_file_grp in input_file_grps: - # assert input_file_grp in workspace.mets.file_groups + assert self.input_file_grp is not None + assert self.output_file_grp is not None + input_file_grps = self.input_file_grp.split(',') + for input_file_grp in input_file_grps: + assert input_file_grp in workspace.mets.file_groups log = getLogger('ocrd.processor.base') with pushd_popd(workspace.directory): self.workspace = workspace From 80590a9b8ce0804b0bc73a47f9424967dc3d39b8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 12 Aug 2024 12:52:40 +0200 Subject: [PATCH 029/228] process_page_pcgts: add (variadic) type checks Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index f1ecd8def0..170b1643a3 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -346,7 +346,7 @@ def process_page_file(self, *input_files) -> None: 
mimetype=MIMETYPE_PAGE, content=to_xml(output_pcgts)) - def process_page_pcgts(self, *input_pcgts, output_file_id : str = None, page_id : str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed From 68ae8ff382adf4bf6662f29829ac2d96989e628d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:34:39 +0200 Subject: [PATCH 030/228] run_processor: fix typo --- src/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index d9edaaa258..92846a6f0d 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -104,7 +104,7 @@ def run_processor( backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' from memory_profiler import memory_usage try: - mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {})), + mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}), # only run process once max_iterations=1, interval=.1, timeout=None, timestamps=True, From 2a18883d7883c8897eabb3242edceb282a4db673 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:36:30 +0200 Subject: [PATCH 031/228] Processor init: deprecate passing workspace --- src/ocrd/processor/base.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 170b1643a3..3cf132278b 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -129,8 +129,8 @@ def __init__( Args: workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \ - Can be ``None`` even for processing (esp. on multiple workspaces), \ - but then needs to be set before running. 
+ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. Keyword Args: parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ Can be ``None`` even for processing, but then needs to be set before running. @@ -200,10 +200,8 @@ def __init__( return self.workspace = workspace if self.workspace: - # FIXME deprecate setting this and calling process() over using process_workspace() - # which uses pushd_popd(self.workspace.directory) - # (because there is no way to do that in process() since it's an - # overridden method. chdir is almost always an anti-pattern.) + deprecation_warning("Passing a workspace argument other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") self.old_pwd = getcwd() os.chdir(self.workspace.directory) self.input_file_grp = input_file_grp From b9338b4c8418a0a572358fe265ada7e3cb7dcff1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:37:08 +0200 Subject: [PATCH 032/228] docs: fix relative VERSION path --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 3ab2e1826f..f1f8f5e555 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,7 @@ # import os # import sys # # sys.path.insert(0, os.path.abspath('..')) -with open('VERSION', encoding='utf-8') as f: +with open('../VERSION', encoding='utf-8') as f: VERSION = f.read() From 6ca6a4086b786fdd6bfbf60fb3b8ae5c21b398e3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:37:29 +0200 Subject: [PATCH 033/228] docs: do/not exclude tests/src --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index f1f8f5e555..917c5c62ca 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -72,7 +72,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
# This pattern also affects html_static_path and html_extra_path . -exclude_patterns = [u'build', 'Thumbs.db', '.DS_Store', 'src', 'venv'] +exclude_patterns = [u'build', 'Thumbs.db', '.DS_Store', 'tests', 'venv'] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' From bc9ec057639df7e85b4748735f451d64ce1dd836 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:37:44 +0200 Subject: [PATCH 034/228] docs: add ocrd_network module --- docs/index.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 96a4e98360..67bba66fe0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,9 +7,10 @@ OCR-D/core ocrd ocrd_utils + ocrd_modelfactory ocrd_models ocrd_validators - ocrd_modelfactory + ocrd_network Indices and tables From 54f1d88e1a233e7b93db5356332aac21389004d1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Aug 2024 17:43:21 +0200 Subject: [PATCH 035/228] docs:regenerated rST --- docs/api/ocrd_network/ocrd_network.deployer.rst | 7 ------- .../ocrd_network.deployment_utils.rst | 7 ------- docs/api/ocrd_network/ocrd_network.logging.rst | 7 ------- .../ocrd_network/ocrd_network.logging_utils.rst | 7 +++++++ .../ocrd_network.rabbitmq_utils.helpers.rst | 7 +++++++ .../ocrd_network.rabbitmq_utils.rst | 1 + docs/api/ocrd_network/ocrd_network.rst | 7 +++---- .../ocrd_network.runtime_data.config_parser.rst | 7 +++++++ ..._network.runtime_data.connection_clients.rst | 7 +++++++ .../ocrd_network.runtime_data.deployer.rst | 7 +++++++ .../ocrd_network.runtime_data.hosts.rst | 7 +++++++ ...ocrd_network.runtime_data.network_agents.rst | 7 +++++++ ...rd_network.runtime_data.network_services.rst | 7 +++++++ .../ocrd_network/ocrd_network.runtime_data.rst | 17 +++++++++++++++-- .../ocrd_network.tcp_to_uds_mets_proxy.rst | 7 +++++++ 15 files changed, 82 insertions(+), 27 deletions(-) delete mode 100644 docs/api/ocrd_network/ocrd_network.deployer.rst delete mode 
100644 docs/api/ocrd_network/ocrd_network.deployment_utils.rst delete mode 100644 docs/api/ocrd_network/ocrd_network.logging.rst create mode 100644 docs/api/ocrd_network/ocrd_network.logging_utils.rst create mode 100644 docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst create mode 100644 docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst diff --git a/docs/api/ocrd_network/ocrd_network.deployer.rst b/docs/api/ocrd_network/ocrd_network.deployer.rst deleted file mode 100644 index 205a331ba2..0000000000 --- a/docs/api/ocrd_network/ocrd_network.deployer.rst +++ /dev/null @@ -1,7 +0,0 @@ -ocrd\_network.deployer module -============================= - -.. automodule:: ocrd_network.deployer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.deployment_utils.rst b/docs/api/ocrd_network/ocrd_network.deployment_utils.rst deleted file mode 100644 index cc1f315ac5..0000000000 --- a/docs/api/ocrd_network/ocrd_network.deployment_utils.rst +++ /dev/null @@ -1,7 +0,0 @@ -ocrd\_network.deployment\_utils module -====================================== - -.. 
automodule:: ocrd_network.deployment_utils - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.logging.rst b/docs/api/ocrd_network/ocrd_network.logging.rst deleted file mode 100644 index d2ac721d14..0000000000 --- a/docs/api/ocrd_network/ocrd_network.logging.rst +++ /dev/null @@ -1,7 +0,0 @@ -ocrd\_network.logging module -============================ - -.. automodule:: ocrd_network.logging - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.logging_utils.rst b/docs/api/ocrd_network/ocrd_network.logging_utils.rst new file mode 100644 index 0000000000..561ce00193 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.logging_utils.rst @@ -0,0 +1,7 @@ +ocrd\_network.logging\_utils module +=================================== + +.. automodule:: ocrd_network.logging_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst new file mode 100644 index 0000000000..e13ff897a9 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst @@ -0,0 +1,7 @@ +ocrd\_network.rabbitmq\_utils.helpers module +============================================ + +.. 
automodule:: ocrd_network.rabbitmq_utils.helpers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst index 36b581a337..63fd6f89aa 100644 --- a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst +++ b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst @@ -15,5 +15,6 @@ Submodules ocrd_network.rabbitmq_utils.connector ocrd_network.rabbitmq_utils.constants ocrd_network.rabbitmq_utils.consumer + ocrd_network.rabbitmq_utils.helpers ocrd_network.rabbitmq_utils.ocrd_messages ocrd_network.rabbitmq_utils.publisher diff --git a/docs/api/ocrd_network/ocrd_network.rst b/docs/api/ocrd_network/ocrd_network.rst index ae12ae1f5d..4497702751 100644 --- a/docs/api/ocrd_network/ocrd_network.rst +++ b/docs/api/ocrd_network/ocrd_network.rst @@ -15,6 +15,7 @@ Subpackages ocrd_network.cli ocrd_network.models ocrd_network.rabbitmq_utils + ocrd_network.runtime_data Submodules ---------- @@ -25,15 +26,13 @@ Submodules ocrd_network.client ocrd_network.constants ocrd_network.database - ocrd_network.deployer - ocrd_network.deployment_utils - ocrd_network.logging + ocrd_network.logging_utils ocrd_network.param_validators ocrd_network.process_helpers ocrd_network.processing_server ocrd_network.processing_worker ocrd_network.processor_server - ocrd_network.runtime_data ocrd_network.server_cache ocrd_network.server_utils + ocrd_network.tcp_to_uds_mets_proxy ocrd_network.utils diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst new file mode 100644 index 0000000000..e56ad31f89 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.config\_parser module +================================================= + +.. 
automodule:: ocrd_network.runtime_data.config_parser + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst new file mode 100644 index 0000000000..2fd62e5ef2 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.connection\_clients module +====================================================== + +.. automodule:: ocrd_network.runtime_data.connection_clients + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst new file mode 100644 index 0000000000..62abe20db3 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.deployer module +=========================================== + +.. automodule:: ocrd_network.runtime_data.deployer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst new file mode 100644 index 0000000000..8f9001c381 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.hosts module +======================================== + +.. automodule:: ocrd_network.runtime_data.hosts + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst new file mode 100644 index 0000000000..1a597caad1 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.network\_agents module +================================================== + +.. 
automodule:: ocrd_network.runtime_data.network_agents + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst new file mode 100644 index 0000000000..d72e67c9d6 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.network\_services module +==================================================== + +.. automodule:: ocrd_network.runtime_data.network_services + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.rst index fefa00b492..cdf45f6b6e 100644 --- a/docs/api/ocrd_network/ocrd_network.runtime_data.rst +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.rst @@ -1,7 +1,20 @@ -ocrd\_network.runtime\_data module -================================== +ocrd\_network.runtime\_data package +=================================== .. automodule:: ocrd_network.runtime_data :members: :undoc-members: :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + ocrd_network.runtime_data.config_parser + ocrd_network.runtime_data.connection_clients + ocrd_network.runtime_data.deployer + ocrd_network.runtime_data.hosts + ocrd_network.runtime_data.network_agents + ocrd_network.runtime_data.network_services diff --git a/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst b/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst new file mode 100644 index 0000000000..fa6e607f94 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst @@ -0,0 +1,7 @@ +ocrd\_network.tcp\_to\_uds\_mets\_proxy module +============================================== + +.. 
automodule:: ocrd_network.tcp_to_uds_mets_proxy + :members: + :undoc-members: + :show-inheritance: From 67633f53f87725181ce14fbe5e97915b9d0faf2a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:12:49 +0200 Subject: [PATCH 036/228] test_mets_server: fix arg vs kwarg --- tests/test_mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index da0b958946..61752b6ed4 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -52,10 +52,10 @@ def add_file_server(x): mets_server_url, i = x workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) workspace_server.add_file( + 'FOO', local_filename=f'local_filename{i}', mimetype=MIMETYPE_PAGE, page_id=f'page{i}', - file_grp='FOO', file_id=f'FOO_page{i}_foo{i}', # url=f'url{i}' ) From 751a1fe1bead708b6b184ca68ed361feef0f4d42 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:13:46 +0200 Subject: [PATCH 037/228] mets_server: ClientSideOcrdMets needs OcrdMets-like kwargs (without deprecation) --- src/ocrd/mets_server.py | 19 +++++++++---------- tests/test_mets_server.py | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index d7edec5ec1..5131f3f05c 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -247,11 +247,9 @@ def add_agent(self, *args, **kwargs): ).json() return OcrdAgentModel.create(**kwargs) - @deprecated_alias(ID="file_id") - @deprecated_alias(pageId="page_id") - @deprecated_alias(fileGrp="file_grp") def find_files(self, **kwargs): self.log.debug("find_files(%s)", kwargs) + # translate from native OcrdMets kwargs to OcrdMetsServer REST params if "pageId" in kwargs: kwargs["page_id"] = kwargs.pop("pageId") if "ID" in kwargs: @@ -277,14 +275,14 @@ def find_files(self, **kwargs): def find_all_files(self, *args, **kwargs): return 
list(self.find_files(*args, **kwargs)) - @deprecated_alias(pageId="page_id") - @deprecated_alias(ID="file_id") def add_file( - self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs + self, file_grp, content=None, ID=None, url=None, local_filename=None, mimetype=None, pageId=None, **kwargs ): data = OcrdFileModel.create( - file_id=file_id, file_grp=file_grp, page_id=page_id, mimetype=mimetype, url=url, - local_filename=local_filename + file_grp=file_grp, + # translate from native OcrdMets kwargs to OcrdMetsServer REST params + file_id=ID, page_id=pageId, + mimetype=mimetype, url=url, local_filename=local_filename ) if not self.multiplexing_mode: @@ -297,8 +295,9 @@ def add_file( raise RuntimeError(f"Add file failed: Msg: {r['error']}") return ClientSideOcrdFile( - None, ID=file_id, fileGrp=file_grp, url=url, pageId=page_id, mimetype=mimetype, - local_filename=local_filename + None, fileGrp=file_grp, + ID=ID, pageId=pageId, + url=url, mimetype=mimetype, local_filename=local_filename ) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 61752b6ed4..b1350ed663 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -233,7 +233,7 @@ def test_reload(start_mets_server : Tuple[str, Workspace]): assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert len(workspace_server_copy.mets.find_all_files()) == 35, '35 files total' - workspace_server_copy.add_file('FOO', ID='foo', mimetype='foo/bar', local_filename='mets.xml', pageId='foo') + workspace_server_copy.add_file('FOO', file_id='foo', mimetype='foo/bar', local_filename='mets.xml', page_id='foo') assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert len(workspace_server_copy.mets.find_all_files()) == 36, '36 files total' From 86d956938068a2f5e9fdffc7d1fb81f9080b54ac Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:27:39 +0200 Subject: [PATCH 038/228] 
=?UTF-8?q?Processor/CLI=20decorator:=20:fire:=20s?= =?UTF-8?q?eparate=20kwargs=20and=20constructor=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `Processor.__init__`: remove non-processing kwargs - `Processor.__init__`: deprecate processing kwargs (passing file groups etc) - `Processor`: define members for all non-processing calls - `ocrd_cli_wrap_processor`: non-processing calls instead of init kwargs - `run_processor` and `get_processor` and `cli.bashlib` and `cli.ocrd_tool`: always set processing attributes _after_ init - `Processor.process_workspace`: delegate fileGrp checking to `verify` (still empty) - `DummyProcessor.setup`: no more fileGrp assertions here (too early!) (This is meant to ensure that existing processor implementations, i.e. subclasses of `Processor` do not call `setup` in the constructor anymore. That way, v3.0 will stay backwards compatible in more respects and thus adopting it along the way will become easier.) 
--- src/ocrd/cli/bashlib.py | 10 +- src/ocrd/cli/ocrd_tool.py | 8 +- src/ocrd/decorators/__init__.py | 65 +++++---- src/ocrd/processor/base.py | 134 ++++++++---------- src/ocrd/processor/builtin/dummy_processor.py | 5 - src/ocrd/processor/helpers.py | 27 ++-- 6 files changed, 122 insertions(+), 127 deletions(-) diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index 8b79d82fbc..2c57bb412a 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -115,10 +115,12 @@ def ocrd_tool(self): @property def executable(self): return '' - processor = BashlibProcessor(workspace, - page_id=kwargs['page_id'], - input_file_grp=kwargs['input_file_grp'], - output_file_grp=kwargs['output_file_grp']) + processor = BashlibProcessor(None) + # go half way of the normal run_processor / process_workspace call tree + processor.workspace = workspace + processor.page_id = kwargs['page_id'] + processor.input_file_grp = kwargs['input_file_grp'] + processor.output_file_grp = kwargs['output_file_grp'] for input_files in processor.zip_input_files(mimetype=None, on_error='abort'): # ensure all input files exist locally (without persisting them in the METS) # - this mimics the default behaviour of all Pythonic processors diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index dacefab008..929fe47cca 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -117,25 +117,25 @@ def ocrd_tool_tool_description(ctx): @ocrd_tool_tool.command('list-resources', help="List tool's file resources") @pass_ocrd_tool def ocrd_tool_tool_list_resources(ctx): - ctx.processor(None, list_resources=True) + ctx.processor(None).list_resources() @ocrd_tool_tool.command('resolve-resource', help="Get a tool's file resource full path name") @click.argument('res_name') @pass_ocrd_tool def ocrd_tool_tool_resolve_resource(ctx, res_name): - ctx.processor(None, resolve_resource=res_name) + ctx.processor(None).resolve_resource(res_name) 
@ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource") @click.argument('res_name') @pass_ocrd_tool def ocrd_tool_tool_show_resource(ctx, res_name): - ctx.processor(None, show_resource=res_name) + ctx.processor(None).show_resource(res_name) @ocrd_tool_tool.command('help', help="Generate help for processors") @click.argument('subcommand', required=False) @pass_ocrd_tool def ocrd_tool_tool_params_help(ctx, subcommand): - ctx.processor(None, show_help=True, subcommand=subcommand) + ctx.processor(None).show_help(subcommand=subcommand) # ---------------------------------------------------------------------- # ocrd ocrd-tool tool categories diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 3d07957021..d9d1fb69dd 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor( profile_file=None, version=False, overwrite=False, + resolve_resource=None, show_resource=None, list_resources=False, # ocrd_network params start # @@ -47,20 +48,42 @@ def ocrd_cli_wrap_processor( # ocrd_network params end # **kwargs ): + # FIXME: remove workspace arg entirely + processor = processorClass(None) if not sys.argv[1:]: - processorClass(None, show_help=True) + processor.show_help(subcommand=subcommand) sys.exit(1) - if dump_json or dump_module_dir or help or version or show_resource or list_resources: - processorClass( - None, - dump_json=dump_json, - dump_module_dir=dump_module_dir, - show_help=help, - subcommand=subcommand, - show_version=version, - show_resource=show_resource, - list_resources=list_resources - ) + if help: + processor.show_help(subcommand=subcommand) + sys.exit() + if version: + processor.show_version() + sys.exit() + if dump_json: + processor.dump_json() + sys.exit() + if dump_module_dir: + processor.dump_module_dir() + sys.exit() + if resolve_resource: + try: + res = processor.resolve_resource(resolve_resource) + print(res) + sys.exit() + except 
ResourceNotFoundError as e: + log = getLogger('ocrd.processor.base') + log.critical(e.message) + sys.exit(1) + if show_resource: + try: + processor.show_resource(show_resource) + sys.exit() + except ResourceNotFoundError as e: + log = getLogger('ocrd.processor.base') + log.critical(e.message) + sys.exit(1) + if list_resources: + processor.list_resources() sys.exit() if subcommand: # Used for checking/starting network agents for the WebAPI architecture @@ -68,18 +91,13 @@ def ocrd_cli_wrap_processor( elif address or queue or database: raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") + # from here: single-run processing context initLogging() - - LOG = getLogger('ocrd.cli_wrap_processor') - assert kwargs['input_file_grp'] is not None - assert kwargs['output_file_grp'] is not None - # LOG.info('kwargs=%s' % kwargs) if 'parameter' in kwargs: # Disambiguate parameter file/literal, and resolve file - disposable = processorClass(workspace=None) def resolve(name): try: - return disposable.resolve_resource(name) + return processor.resolve_resource(name) except ResourceNotFoundError: return None kwargs['parameter'] = parse_json_string_or_file(*kwargs['parameter'], @@ -89,12 +107,11 @@ def resolve(name): # Merge parameter overrides and parameters if 'parameter_override' in kwargs: set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) - # TODO OCR-D/core#274 # Assert -I / -O - # if not kwargs['input_file_grp']: - # raise ValueError('-I/--input-file-grp is required') - # if not kwargs['output_file_grp']: - # raise ValueError('-O/--output-file-grp is required') + if not kwargs['input_file_grp']: + raise ValueError('-I/--input-file-grp is required') + if not kwargs['output_file_grp']: + raise ValueError('-O/--output-file-grp is required') resolver = Resolver() working_dir, mets, _, mets_server_url = \ resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url) diff --git 
a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 3cf132278b..ff970b9a1a 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -110,15 +110,6 @@ def __init__( output_file_grp=None, page_id=None, download_files=True, - # FIXME: deprecate all the following in favor of respective methods - resolve_resource=None, - show_resource=None, - list_resources=False, - show_help=False, - subcommand=None, - show_version=False, - dump_json=False, - dump_module_dir=False, version=None ): """ @@ -134,27 +125,17 @@ def __init__( Keyword Args: parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ Can be ``None`` even for processing, but then needs to be set before running. - input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. - output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. + input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. \ + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. + output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. \ + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. page_id (string): comma-separated list of METS physical ``page`` IDs to process \ - (or empty for all pages). + (or empty for all pages). \ + Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ + before processing. download_files (boolean): Whether input files will be downloaded prior to processing. - resolve_resource (string): If not ``None``, then instead of processing, resolve \ - given resource by name and print its full path to stdout. - show_resource (string): If not ``None``, then instead of processing, resolve \ - given resource by name and print its contents to stdout. 
- list_resources (boolean): If true, then instead of processing, find all installed \ - resource files in the search paths and print their path names. - show_help (boolean): If true, then instead of processing, print a usage description \ - including the standard CLI and all of this processor's ocrd-tool parameters and \ - docstrings. - subcommand (string): 'worker' or 'server', only used here for the right --help output - show_version (boolean): If true, then instead of processing, print information on \ - this processor's version and OCR-D version. Exit afterwards. - dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \ - on stdout. - dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir` \ - on stdout. """ if ocrd_tool is not None: deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - " @@ -165,48 +146,24 @@ def __init__( deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - " "use or override metadata/version properties instead") self._version = version - if dump_json: - print(json.dumps(self.ocrd_tool, indent=True)) - return - if dump_module_dir: - print(self.moduledir) - return - if list_resources: - for res in self.list_all_resources(): - print(res) - return - if resolve_resource: - try: - res = self.resolve_resource(resolve_resource) - print(res) - except ResourceNotFoundError as e: - log = getLogger('ocrd.processor.base') - log.critical(e.message) - sys.exit(1) - return - if show_resource: - try: - self.show_resource(show_resource) - except ResourceNotFoundError as e: - log = getLogger('ocrd.processor.base') - log.critical(e.message) - sys.exit(1) - return - if show_help: - self.show_help(subcommand=subcommand) - return - if show_version: - self.show_version() - return - self.workspace = workspace - if self.workspace: + if workspace is not None: deprecation_warning("Passing a workspace argument other than 'None' to 
Processor " "is deprecated - pass as argument to process_workspace instead") + self.workspace = workspace self.old_pwd = getcwd() os.chdir(self.workspace.directory) - self.input_file_grp = input_file_grp - self.output_file_grp = output_file_grp - self.page_id = None if page_id == [] or page_id is None else page_id + if input_file_grp is not None: + deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") + self.input_file_grp = input_file_grp + if output_file_grp is not None: + deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") + self.output_file_grp = output_file_grp + if page_id is not None: + deprecation_warning("Passing a page_id kwarg other than 'None' to Processor " + "is deprecated - pass as argument to process_workspace instead") + self.page_id = page_id or None self.download = download_files if parameter is None: parameter = {} @@ -220,9 +177,16 @@ def __init__( deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) def show_help(self, subcommand=None): + """ + Print a usage description including the standard CLI and all of this processor's ocrd-tool + parameters and docstrings. + """ print(generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)) def show_version(self): + """ + Print information on this processor's version and OCR-D version. + """ print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION)) def verify(self): @@ -231,6 +195,28 @@ def verify(self): """ return True + def dump_json(self): + """ + Print :py:attr:`ocrd_tool` on stdout. + """ + print(json.dumps(self.ocrd_tool, indent=True)) + return + + def dump_module_dir(self): + """ + Print :py:attr:`moduledir` on stdout. 
+ """ + print(self.moduledir) + return + + def list_resources(self): + """ + Find all installed resource files in the search paths and print their path names. + """ + for res in self.list_all_resources(): + print(res) + return + def setup(self) -> None: """ Prepare the processor for actual data processing, @@ -265,14 +251,10 @@ def process_workspace(self, workspace: Workspace) -> None: (This will iterate over pages and files, calling :py:meth:`process_page`, handling exceptions.) """ - assert self.input_file_grp is not None - assert self.output_file_grp is not None - input_file_grps = self.input_file_grp.split(',') - for input_file_grp in input_file_grps: - assert input_file_grp in workspace.mets.file_groups log = getLogger('ocrd.processor.base') with pushd_popd(workspace.directory): self.workspace = workspace + self.verify() try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) for input_file_tuple in self.zip_input_files(on_error='abort'): @@ -412,6 +394,14 @@ def resolve_resource(self, val): raise ResourceNotFoundError(val, executable) def show_resource(self, val): + """ + Resolve a resource name to a file path with the algorithm in + https://ocr-d.de/en/spec/ocrd_tool#file-parameters, + then print its contents to stdout. + + Args: + val (string): resource value to show + """ res_fname = self.resolve_resource(val) fpath = Path(res_fname) if fpath.is_dir(): @@ -477,7 +467,7 @@ def input_files(self): - Otherwise raise an error (complaining that only PAGE-XML warrants having multiple images for a single page) Algorithm _ - + Returns: A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects. 
""" diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 424c05772c..b05ca9e6da 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -79,11 +79,6 @@ def executable(self): def version(self): return '0.0.3' - def setup(self): - super().setup() - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - @click.command() @ocrd_cli_options def cli(*args, **kwargs): diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 92846a6f0d..dff14cfca6 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -369,7 +369,7 @@ def get_cached_processor(parameter: dict, processor_class): """ if processor_class: dict_params = dict(parameter) if parameter else None - processor = processor_class(workspace=None, parameter=dict_params) + processor = processor_class(None, parameter=dict_params) processor.setup() return processor return None @@ -386,22 +386,13 @@ def get_processor( ): if processor_class: if instance_caching: - cached_processor = get_cached_processor( - parameter=parameter, - processor_class=processor_class - ) - cached_processor.workspace = workspace - cached_processor.page_id = page_id - cached_processor.input_file_grp = input_file_grp - cached_processor.output_file_grp = output_file_grp - return cached_processor - processor = processor_class( - workspace=workspace, - page_id=page_id, - input_file_grp=input_file_grp, - output_file_grp=output_file_grp, - parameter=parameter - ) - processor.setup() + processor = get_cached_processor(parameter, processor_class) + else: + processor = processor_class(None, parameter=parameter) + processor.setup() + processor.workspace = workspace + processor.page_id = page_id + processor.input_file_grp = input_file_grp + processor.output_file_grp = output_file_grp return processor raise ValueError("Processor class is not 
known") From 1f6f0c84f845a2928aaa17ce29eecd6d97fd50f1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:39:35 +0200 Subject: [PATCH 039/228] =?UTF-8?q?Processor=20/=20ocrd-tool.json:=20:fire?= =?UTF-8?q?:=20fileGrp=20cardinality=20checks=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `ocrd_tool.schema.yml`: - deprecate `input_file_grp` - deprecate `output_file_grp` - introduce+require `input_file_grp_cardinality`: number of min:max - introduce+require `output_file_grp_cardinality`: number of min:max - `ocrd_utils.assert_file_grp_cardinality`: deprecate - `Processor.verify`: check that - fileGrp attributes exist, - input fileGrp(s) exist in METS - input/output fileGrp(s) match the cardinality constraints, if specified in ocrd-tool.json: exact number, or minimum+maximum number (skipping negative or zero) (Processor implementors must now specify `input_file_grp_cardinality` and `output_file_grp_cardinality` in order to have a valid `ocrd-tool.json` again.) --- src/ocrd/processor/base.py | 29 ++++++++++++++- .../processor/builtin/dummy/ocrd-tool.json | 6 ++- src/ocrd_utils/str.py | 2 + src/ocrd_validators/ocrd_tool.schema.yml | 37 +++++++++++++++++-- 4 files changed, 67 insertions(+), 7 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index ff970b9a1a..5cde4d9fe2 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -191,8 +191,33 @@ def show_version(self): def verify(self): """ - Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements. - """ + Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements. 
+ """ + assert self.input_file_grp is not None + assert self.output_file_grp is not None + input_file_grps = self.input_file_grp.split(',') + output_file_grps = self.output_file_grp.split(',') + def assert_file_grp_cardinality(grps, spec, msg): + if isinstance(spec, int) and spec > 0: + assert len(grps) == spec, msg % (len(grps), str(spec)) + else: + minimum = spec[0] + maximum = spec[1] + if minimum > 0: + assert len(grps) >= minimum, msg % (len(grps), str(spec)) + if maximum > 0: + assert len(grps) <= maximum, msg % (len(grps), str(spec)) + # FIXME: maybe we should enforce the cardinality properties to be specified or apply default=1 here + # (but we already have ocrd-tool validation, and these first need to be adopted by implementors) + if 'input_file_grp_cardinality' in self.ocrd_tool: + assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'], + "Unexpected number of input file groups %d vs %s") + if 'output_file_grp_cardinality' in self.ocrd_tool: + assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], + "Unexpected number of output file groups %d vs %s") + for input_file_grp in input_file_grps: + assert input_file_grp in self.workspace.mets.file_groups + # keep this for backwards compatibility: return True def dump_json(self): diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json index 30a6d99fd9..ef4a4810fe 100644 --- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json +++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json @@ -1,12 +1,14 @@ { + "version": "1.0.0", + "git_url": "https://github.com/OCR-D/core", "tools": { "ocrd-dummy": { "executable": "ocrd-dummy", "description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group", "steps": ["preprocessing/optimization"], "categories": ["Image preprocessing"], - "input_file_grp": "DUMMY_INPUT", - "output_file_grp": "DUMMY_OUTPUT", + 
"input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "copy_files": { "type": "boolean", diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index 51cce4bf23..38839064f4 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -7,6 +7,7 @@ from typing import List, Union from .constants import REGEX_FILE_ID, SPARKLINE_CHARS from .deprecate import deprecation_warning +from deprecated import deprecated from warnings import warn from numpy import array_split @@ -26,6 +27,7 @@ ] +@deprecated(version='3.0', reason='specify input and output file_grp_cardinality in ocrd-tool.json instead') def assert_file_grp_cardinality(grps, n, msg=None): """ Assert that a string of comma-separated fileGrps contains exactly ``n`` entries. diff --git a/src/ocrd_validators/ocrd_tool.schema.yml b/src/ocrd_validators/ocrd_tool.schema.yml index 766fd892cc..db1b61458e 100644 --- a/src/ocrd_validators/ocrd_tool.schema.yml +++ b/src/ocrd_validators/ocrd_tool.schema.yml @@ -29,28 +29,59 @@ properties: - steps - executable - categories - - input_file_grp - # Not required because not all processors produce output files - # - output_file_grp + - input_file_grp_cardinality + - output_file_grp_cardinality properties: executable: description: The name of the CLI executable in $PATH type: string input_file_grp: + deprecated: true description: Input fileGrp@USE this tool expects by default type: array items: type: string # pattern: '^OCR-D-[A-Z0-9-]+$' output_file_grp: + deprecated: true description: Output fileGrp@USE this tool produces by default type: array items: type: string # pattern: '^OCR-D-[A-Z0-9-]+$' + input_file_grp_cardinality: + description: Number of (comma-separated) input fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) + oneOf: + - items: + type: number + multipleOf: 1 + - items: + type: array + items: + type: number + multipleOf: 1 + minItems: 2 + maxItems: 2 + default: 1 + 
additionalProperties: false + output_file_grp_cardinality: + description: Number of (comma-separated) output fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) + oneOf: + - items: + type: number + multipleOf: 1 + - items: + type: array + items: + type: number + multipleOf: 1 + minItems: 2 + maxItems: 2 + default: 1 parameters: description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas. type: object + default: {} patternProperties: ".*": type: object From 9b417d69ae0be9f51d6cc7dfbbc6a9c514738437 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:49:09 +0200 Subject: [PATCH 040/228] test_processor: adapt to Processor init changes --- tests/processor/test_processor.py | 43 ++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index e0ebfbb1d5..d4f0637f78 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -30,8 +30,11 @@ def setUp(self): def test_incomplete_processor(self): proc = IncompleteProcessor(None) + proc.input_file_grp = 'OCR-D-IMG' + proc.output_file_grp = 'DUMMY' + proc.page_id = None with self.assertRaises(NotImplementedError): - proc.process() + proc.process_workspace(self.workspace) def test_no_resolver(self): with self.assertRaisesRegex(Exception, 'pass a resolver to create a workspace'): @@ -74,15 +77,20 @@ def test_parameter(self): self.assertEqual(processor.parameter['baz'], 'quux') def test_verify(self): - proc = DummyProcessor(self.workspace) + proc = DummyProcessor(None) + with self.assertRaises(AttributeError): + proc.verify() + proc.workspace = self.workspace + proc.input_file_grp = "OCR-D-IMG" + proc.output_file_grp = "DUMMY" self.assertEqual(proc.verify(), True) def test_json(self): - DummyProcessor(self.workspace, dump_json=True) + DummyProcessor(None).dump_json() def 
test_params_missing_required(self): with self.assertRaisesRegex(Exception, 'is a required property'): - DummyProcessorWithRequiredParameters(workspace=self.workspace) + DummyProcessorWithRequiredParameters(None) def test_params_preset_resolve(self): with pushd_popd(tempdir=True) as tempdir: @@ -112,7 +120,7 @@ class ParamTestProcessor(Processor): @property def ocrd_tool(self): return {} - proc = ParamTestProcessor(self.workspace) + proc = ParamTestProcessor(None) self.assertEqual(proc.parameter, {}) def test_run_agent(self): @@ -192,7 +200,10 @@ def ocrd_tool(self): ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar4', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] assert ('foobar1', 'foobar2') in tuples assert ('foobar3', 'foobar4') in tuples @@ -217,7 +228,10 @@ def ocrd_tool(self): ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id print("unfiltered") tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] assert ('foobar1', 'foobar2') in tuples @@ -228,7 +242,10 @@ def ocrd_tool(self): ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4dup', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 
'GRP1,GRP2' + proc.page_id = page_id tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')] assert ('foobar1', 'foobar2') in tuples assert ('foobar3', 'foobar4') in tuples @@ -239,7 +256,10 @@ def ocrd_tool(self): ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): tuples = proc.zip_input_files() @@ -255,7 +275,10 @@ def ocrd_tool(self): ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001') for page_id in [None, 'phys_0001']: with self.subTest(page_id=page_id): - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + proc = ZipTestProcessor(None) + proc.workspace = ws + proc.input_file_grp = 'GRP1,GRP2' + proc.page_id = page_id assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err From fbe83c9e9ed186664a42f54638cd2976ebf98a7a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:50:42 +0200 Subject: [PATCH 041/228] adapt to ocrd-tool.json cardinality changes --- tests/cli/test_bashlib.py | 2 +- tests/cli/test_validate.py | 4 ++-- tests/validator/test_ocrd_tool_validator.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index ab52b6b1ba..1807b1f47d 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -50,7 +50,7 @@ def invoke_bash(self, script, *args, executable=None): return -1, "", str(e) finally: 
os.remove(scriptfile.name) - + def setUp(self): self.maxDiff = None super().setUp() diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index 36ee3e5995..ecfedc6794 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -21,8 +21,8 @@ "ocrd-xyz": { "executable": "ocrd-xyz", "description": "bars all the foos", - "input_file_grp": ["OCR-D-FOO"], - "output_file_grp": ["OCR-D-BAR"], + "input_file_grp_cardinality": [1, 2], + "output_file_grp_cardinality": 1, "categories": ["Layout analysis"], "steps": ["layout/analysis"], "parameters": { diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 3ad40d8645..5c89ecbf06 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -12,8 +12,8 @@ "ocrd-xyz": { "executable": "ocrd-xyz", "description": "bars all the foos", - "input_file_grp": ["OCR-D-FOO"], - "output_file_grp": ["OCR-D-BAR"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "categories": ["Layout analysis"], "steps": ["layout/analysis"] } From 09dd54bef98b03e7936ab3f19465e55e0bea70af Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:52:09 +0200 Subject: [PATCH 042/228] use up-to-date kwargs (avoiding old deprecations) --- tests/data/__init__.py | 4 ++-- tests/processor/test_processor.py | 10 +++++----- tests/validator/test_page_validator.py | 9 +++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index ff403ebef6..b299c512ec 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -87,9 +87,9 @@ def process(self): file_id = make_file_id(input_file, self.output_file_grp) # print(input_file.ID, file_id) self.workspace.add_file( - ID=file_id, + file_id=file_id, file_grp=self.output_file_grp, - pageId=input_file.pageId, + page_id=input_file.pageId, mimetype=input_file.mimetype, 
local_filename=os.path.join(self.output_file_grp, file_id), content='CONTENT') diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index d4f0637f78..2cf8a189b4 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -138,8 +138,8 @@ def test_run_input(self): def test_run_output0(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") @@ -148,10 +148,10 @@ def test_run_output0(self): def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') ws.overwrite_mode = True - ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, ID='OCR-D-OUT_phys_0001', pageId='phys_0001') + ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') ws.overwrite_mode = False with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutput, workspace=ws, diff --git a/tests/validator/test_page_validator.py b/tests/validator/test_page_validator.py index 79e92d90fa..e6aaff1523 100644 --- a/tests/validator/test_page_validator.py +++ b/tests/validator/test_page_validator.py @@ -16,9 +16,10 @@ def 
test_validate_err(self): PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_strategy='best') # test with deprecated name with self.assertRaisesRegex(Exception, 'page_textequiv_strategy best not implemented'): - PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best') + with self.assertWarnsRegex(DeprecationWarning, r'use page_textequiv_strategy'): + PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best') with self.assertRaisesRegex(Exception, 'page_textequiv_consistency level superstrictest not implemented'): - PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', strategy='first') + PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', page_textequiv_strategy='first') def test_validate_filename(self): report = PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME) @@ -44,7 +45,7 @@ def test_validate_lax(self): report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 26, '26 textequiv consistency errors - strict') - report = PageValidator.validate(ocrd_page=ocrd_page, strictness='lax') + report = PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='lax') self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 1, '1 textequiv consistency errors - lax') def test_validate_multi_textequiv_first(self): @@ -89,7 +90,7 @@ def test_fix(self): ocrd_page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True) report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 17, '17 textequiv consistency errors') - PageValidator.validate(ocrd_page=ocrd_page, strictness='fix') + PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='fix') report = PageValidator.validate(ocrd_page=ocrd_page) 
self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 0, 'no more textequiv consistency errors') From af880e4a302332d23a58e45c0f933351c67cc936 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:52:59 +0200 Subject: [PATCH 043/228] hide/test expected deprecation warnings --- tests/data/__init__.py | 4 +++- tests/test_resolver.py | 29 +++++++++++++++-------------- tests/test_utils.py | 12 ++++++++---- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index b299c512ec..e7ef30fc2b 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,5 +1,6 @@ import json import os +from pytest import warns from ocrd import Processor from ocrd_utils import make_file_id @@ -38,7 +39,8 @@ def process(self): # override to prevent iterating over empty files def process_workspace(self, workspace): - self.process() + with warns(DeprecationWarning, match='should be replaced with process_page'): + self.process() class DummyProcessorWithRequiredParameters(Processor): @property diff --git a/tests/test_resolver.py b/tests/test_resolver.py index abcf69257b..7e102612e1 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -287,20 +287,21 @@ def test_resolve_mets_arguments(): https://github.com/OCR-D/core/issues/517 """ resolver = Resolver() - assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) - assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) - assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert 
resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None) - with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): - resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) - with pytest.raises(ValueError, match="inconsistent with --directory"): - resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) - with pytest.warns(DeprecationWarning): - resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) - with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"): - resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning, match='--mets-basename'): + assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) + assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) + assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None) + with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): + resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) + with pytest.raises(ValueError, match="inconsistent with --directory"): + resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning): + resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) + with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but 
no --directory was given"): + resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) if __name__ == '__main__': main(__file__) diff --git a/tests/test_utils.py b/tests/test_utils.py index 89ff6d90f3..dea7ad7942 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -242,12 +242,16 @@ def test_set_json_key_value_overrides(): def test_assert_file_grp_cardinality(): with raises(AssertionError, match="Expected exactly 5 output file groups, but '.'FOO', 'BAR'.' has 2"): - assert_file_grp_cardinality('FOO,BAR', 5) + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 5) with raises(AssertionError, match="Expected exactly 1 output file group, but '.'FOO', 'BAR'.' has 2"): - assert_file_grp_cardinality('FOO,BAR', 1) - assert_file_grp_cardinality('FOO,BAR', 2) + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 1) + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 2) with raises(AssertionError, match="Expected exactly 1 output file group .foo bar., but '.'FOO', 'BAR'.' 
has 2"): - assert_file_grp_cardinality('FOO,BAR', 1, 'foo bar') + with warns(DeprecationWarning, match="file_grp_cardinality in ocrd-tool.json instead"): + assert_file_grp_cardinality('FOO,BAR', 1, 'foo bar') def test_make_file_id_simple(): f = create_ocrd_file('MAX', ID="MAX_0012") From e381a0fe94a14150e8004c50869462f870e5b591 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:53:25 +0200 Subject: [PATCH 044/228] improve output in case of assertion failures --- tests/cli/test_validate.py | 22 ++++++++++----------- tests/validator/test_ocrd_tool_validator.py | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index ecfedc6794..0682ea7a01 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, err) # relative path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, err) # default path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['parameters', 
'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,11 +84,11 @@ def test_validate_page(self): def test_validate_tasks(self): # simple - code, _, _ = self.invoke_cli(validate_cli, ['tasks', + code, _, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0) + self.assertEqual(code, 0, err) # with workspace code, out, err = self.invoke_cli(validate_cli, ['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'), @@ -96,7 +96,7 @@ def test_validate_tasks(self): "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0) + self.assertEqual(code, 0, err) if __name__ == '__main__': diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 5c89ecbf06..8612353890 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertEqual(report.is_valid, True) + self.assertTrue(report.is_valid, report) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertEqual(report.is_valid, True) + self.assertTrue(report.is_valid, report) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): From 
874b5061583342f82ef122ec0ab2718a84b20b45 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 11:03:37 +0200 Subject: [PATCH 045/228] Set VERSION to upcoming 3.0.0a1 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 3d6ac35b13..a6f4248b2f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.66.0 +3.0.0a1 From 5ffe3cb258ee2dc4dad8b095e0ac2ef914508933 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Aug 2024 11:44:53 +0200 Subject: [PATCH 046/228] CircleCI: use version 2.1 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c5ff83227..24c742aa68 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,4 +1,4 @@ -version: 2 +version: 2.1 orbs: python: circleci/python@2.0.3 From 93a742efffb31cf4890585c555bfd7e1e77c22bc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 14 Aug 2024 13:41:42 +0200 Subject: [PATCH 047/228] test_bashlib: use version verbatim --- repo/spec | 2 +- src/ocrd_utils/config.py | 2 +- tests/cli/test_bashlib.py | 4 +--- tests/validator/test_ocrd_tool_validator.py | 4 ++-- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/repo/spec b/repo/spec index 506b33936d..2bbd4dd916 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 +Subproject commit 2bbd4dd916519f567e5c648b24c0b5ca6fc8a183 diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index b3a3e9537d..d0955a8dcf 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -125,7 +125,7 @@ def raw_value(self, name): description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz") config.add("OCRD_DOWNLOAD_RETRIES", - description="Number of times to retry failed attempts for downloads of workspace files.", + description="Number of times to 
retry failed attempts for downloads of resource or workspace files.", validator=int, parser=int) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index 1807b1f47d..c4b2fd7da7 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -104,10 +104,8 @@ def test_bashlib_minversion(self): exit_code, out, err = self.invoke_bash( "source $(ocrd bashlib filename) && ocrd__minversion 2.29.0") assert exit_code == 0 - (major, minor, patch) = map(int, str(VERSION).split('.')) - version = "%d.%d.%d" % (major, minor + 1, patch) exit_code, out, err = self.invoke_bash( - "source $(ocrd bashlib filename) && ocrd__minversion " + version) + "source $(ocrd bashlib filename) && ocrd__minversion " + VERSION) assert exit_code > 0 assert "ERROR: ocrd/core is too old" in err diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 8612353890..2d035757ed 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertTrue(report.is_valid, report) + self.assertTrue(report.is_valid, str(report.errors)) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertTrue(report.is_valid, report) + self.assertTrue(report.is_valid, str(report.errors)) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): From 51176841d330d6e238d35fabbad7db3ecceecf9d Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 16:17:46 +0200 Subject: [PATCH 048/228] . 
--- repo/spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo/spec b/repo/spec index 506b33936d..2bbd4dd916 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 +Subproject commit 2bbd4dd916519f567e5c648b24c0b5ca6fc8a183 From 456cc6dd65a40ccb17006392eea2d4e1481884a5 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 17:45:29 +0200 Subject: [PATCH 049/228] fix make spec --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 1b4ef47bd2..886eed9630 100644 --- a/Makefile +++ b/Makefile @@ -238,9 +238,9 @@ repo/assets repo/spec: always-update .PHONY: spec # Copy JSON Schema, OpenAPI from OCR-D/spec -spec: repo/spec - cp repo/spec/ocrd_tool.schema.yml ocrd_validators/ocrd_validators/ocrd_tool.schema.yml - cp repo/spec/bagit-profile.yml ocrd_validators/ocrd_validators/bagit-profile.yml +spec: # repo/spec + cp repo/spec/ocrd_tool.schema.yml src/ocrd_validators/ocrd_tool.schema.yml + cp repo/spec/bagit-profile.yml src/ocrd_validators/bagit-profile.yml # # Assets From 7a9fc2778f774cef304706f33a0f8f68a71b4fe6 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 18:39:46 +0200 Subject: [PATCH 050/228] adapt lib.bash to handle prerelease suffixes like a1, b2, rc3 --- src/ocrd/lib.bash | 31 +++++++++++++++++++++++-------- tests/cli/test_bashlib.py | 28 +++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 1e3ecfc6eb..9e0460e6d8 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -27,12 +27,22 @@ ocrd__log () { ## Ensure minimum version # ht https://stackoverflow.com/posts/4025065 ocrd__minversion () { - local minversion="$1" - local version=$(ocrd --version|sed 's/ocrd, version //') - #echo "$minversion < $version?" - local IFS=. 
- version=($version) - minversion=($minversion) + local minversion_raw="$1" + set -e + local version_raw=$(ocrd --version|sed 's/ocrd, version //') + local version_mmp=$(echo "$version_raw" | grep -Eo '([0-9]+\.?){3}') + local version_prerelease_suffix="${version_raw#$version_mmp}" + if [[ -z $version_prerelease_suffix ]];then + version_prerelease_suffix=0 + fi + local minversion_mmp=$(echo "$minversion_raw" | grep -Eo '([0-9]+\.?){3}') + local minversion_prerelease_suffix="${minversion_raw#$minversion_mmp}" + if [[ -z $minversion_prerelease_suffix ]];then + minversion_prerelease_suffix=0 + fi + local IFS='.' + version=($version_mmp) + minversion=($minversion_mmp) # MAJOR > MAJOR if (( ${version[0]} > ${minversion[0]} ));then return @@ -44,12 +54,17 @@ ocrd__minversion () { # MINOR == MINOR elif (( ${version[1]} == ${minversion[1]} ));then # PATCH > PATCH - if (( ${version[2]} >= ${minversion[2]} ));then + if (( ${version[2]} > ${minversion[2]} ));then + return + elif (( ${version[2]} == ${minversion[2]}));then + # Match prerelease suffix like a1, b1 only literally + if [[ $version_prerelease_suffix == $minversion_prerelease_suffix ]];then return + fi fi fi fi - ocrd__raise "ocrd/core is too old (${version[*]} < ${minversion[*]}). Please update OCR-D/core" + ocrd__raise "ocrd/core is too old ($version_raw < $minversion_raw). 
Please update OCR-D/core" } ## ### `ocrd__dumpjson` diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index c4b2fd7da7..15af493502 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -1,4 +1,6 @@ from contextlib import contextmanager +import re +from typing import Tuple, Union from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory import os, sys @@ -20,6 +22,13 @@ from ocrd_utils import pushd_popd +def parse_version(v : str) -> Union[Tuple[int, int, int], Tuple[int, int, int, str]]: + tokens = re.split('((?:a|b|rc)[0-9]+)', v, 1) + version_wo_suffix = tokens[0] + prerelease_suffix = tokens[1] if len(tokens) > 1 else '' + (major, minor, patch) = map(int, version_wo_suffix.split('.')) + return (major, minor, patch, prerelease_suffix) + class TestBashlibCli(TestCase): def invoke_bash(self, script, *args, executable=None): @@ -101,13 +110,22 @@ def test_bashlib_defs(self): assert 'function' in out def test_bashlib_minversion(self): - exit_code, out, err = self.invoke_bash( - "source $(ocrd bashlib filename) && ocrd__minversion 2.29.0") + exit_code, out, err = self.invoke_bash("source $(ocrd bashlib filename) && ocrd__minversion 2.29.0") assert exit_code == 0 - exit_code, out, err = self.invoke_bash( - "source $(ocrd bashlib filename) && ocrd__minversion " + VERSION) + major, minor, patch, prerelease_suffix = parse_version(VERSION) + + # test normal version with impossible minimum minor version + version = "%d.%d.%d" % (major, minor + 1, patch) + exit_code, out, err = self.invoke_bash("source $(ocrd bashlib filename) && ocrd__minversion " + version) + assert exit_code > 0 + assert f"ERROR: ocrd/core is too old ({VERSION} < {version})" in err + + # test non-matching prerelease (the 99th alpha pre-release here) + version = "%d.%d.%da99" % (major, minor, patch) + assert VERSION != version # assuming we will never have 99 alpha prereleases ^^ + exit_code, out, err = self.invoke_bash("source $(ocrd bashlib 
filename) && ocrd__minversion " + version) assert exit_code > 0 - assert "ERROR: ocrd/core is too old" in err + assert f"ERROR: ocrd/core is too old ({VERSION} < {version})" in err def test_bashlib_cp_processor(self): # script = (Path(__file__).parent.parent / 'data/bashlib_cp_processor.sh').read_text() From 90afb8a7dccbde24e147f774bfb8929ae56854d9 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 19:43:36 +0200 Subject: [PATCH 051/228] process_page_pcgts must return OcrdProcessResult --- src/ocrd/processor/base.py | 39 ++++++++++--------- src/ocrd/processor/builtin/dummy_processor.py | 10 +++-- src/ocrd/workspace.py | 2 +- src/ocrd_modelfactory/__init__.py | 2 +- src/ocrd_models/__init__.py | 1 + 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 43aec4ace0..2a4679ed47 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -15,7 +15,7 @@ import os from os import getcwd from pathlib import Path -from typing import Optional +from typing import List, Optional import sys import inspect import tarfile @@ -23,6 +23,8 @@ from deprecated import deprecated from ocrd.workspace import Workspace +from ocrd_models.ocrd_file import OcrdFile +from ocrd_models.ocrd_process_result import OcrdProcessResult from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -309,7 +311,7 @@ def process_workspace(self, workspace: Workspace) -> None: # fall back to deprecated method self.process() - def process_page_file(self, *input_files) -> None: + def process_page_file(self, *input_files : OcrdFile) -> None: """ Process the given ``input_files`` of the :py:attr:`workspace`, representing one physical page (passed as one opened @@ -321,7 +323,7 @@ def process_page_file(self, *input_files) -> None: to handle cases like multiple fileGrps, non-PAGE input etc.) 
""" log = getLogger('ocrd.processor.base') - input_pcgts = [None] * len(input_files) + input_pcgts : List[OcrdPage] = [None] * len(input_files) page_id = input_files[0].pageId for i, input_file in enumerate(input_files): # FIXME: what about non-PAGE input like image or JSON ??? @@ -331,28 +333,25 @@ def process_page_file(self, *input_files) -> None: except ValueError as e: log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) - output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) - if isinstance(output_pcgts, (list, tuple)): - output_images = output_pcgts[1:] - output_pcgts = output_pcgts[0] - for output_image_pil, output_image_id, output_image_path in output_images: - self.workspace.save_image_file( - output_image_pil, - output_image_id, - self.output_file_grp, - page_id=page_id, - file_path=output_image_path) - output_pcgts.set_pcGtsId(output_file_id) - self.add_metadata(output_pcgts) + result = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) + for output_image_pil, output_image_id, output_image_path in result.images: + self.workspace.save_image_file( + output_image_pil, + output_image_id, + self.output_file_grp, + page_id=page_id, + file_path=output_image_path) + result.pcgts.set_pcGtsId(output_file_id) + self.add_metadata(result.pcgts) # FIXME: what about non-PAGE output like JSON ??? 
self.workspace.add_file(file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(output_pcgts)) + content=to_xml(result.pcgts)) - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdProcessResult: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed @@ -374,7 +373,9 @@ def add_metadata(self, pcgts: OcrdPage) -> None: Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``. """ - pcgts.get_Metadata().add_MetadataItem( + metadata_obj = pcgts.get_Metadata() + assert metadata_obj is not None + metadata_obj.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=self.ocrd_tool['executable'], diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index b05ca9e6da..e01f097d3e 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -1,11 +1,13 @@ # pylint: disable=missing-module-docstring,invalid-name from os.path import join, basename +from typing import Optional import click from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_models.ocrd_page import to_xml +from ocrd_models.ocrd_page import OcrdPage, to_xml +from ocrd_models.ocrd_process_result import OcrdProcessResult from ocrd_utils import ( getLogger, assert_file_grp_cardinality, @@ -24,9 +26,9 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML 
and optionally copies file from input group to output group """ - def process_page_pcgts(self, *input_pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult: # nothing to do here - return input_pcgts[0] + return OcrdProcessResult(input_pcgts[0]) def process_page_file(self, *input_files): LOG = getLogger('ocrd.dummy') @@ -48,7 +50,7 @@ def process_page_file(self, *input_files): content=content) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) - pcgts = self.process_page_pcgts(pcgts) + pcgts = self.process_page_pcgts(pcgts).pcgts pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index fc619b7d0b..eeaa6434fd 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1073,7 +1073,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, return segment_image, segment_coords # pylint: disable=redefined-builtin - def save_image_file(self, image : Image, + def save_image_file(self, image : Image.Image, file_id : str, file_grp : str, file_path : Optional[str] = None, diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index 7afc5b1765..a98499b2e2 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -79,7 +79,7 @@ def page_from_image(input_file, with_tree=False): revmap = dict(((node, element) for element, node in mapping.items())) return pcgts, etree, mapping, revmap -def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]: +def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET._Element, dict, dict]]: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path diff 
--git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index a89ee1dec8..19d80a0722 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -7,3 +7,4 @@ from .ocrd_mets import OcrdMets from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport +from .ocrd_process_result import OcrdProcessResult From 70ad19120f995fae79dbe37099411dd1df3c1554 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Aug 2024 11:29:28 +0200 Subject: [PATCH 052/228] bashlib ocrd__minversion: compare prerelease suffix alphabetically Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd/lib.bash | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 9e0460e6d8..65ef9c1ce4 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -57,8 +57,8 @@ ocrd__minversion () { if (( ${version[2]} > ${minversion[2]} ));then return elif (( ${version[2]} == ${minversion[2]}));then - # Match prerelease suffix like a1, b1 only literally - if [[ $version_prerelease_suffix == $minversion_prerelease_suffix ]];then + # Match prerelease suffix like a1, b1 alphabetically + if [[ $version_prerelease_suffix = $minversion_prerelease_suffix -o $version_prerelease_suffix > $minversion_prerelease_suffix ]]; then return fi fi From 228272b6a4ee94795e8266af4182eacae38e713c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Aug 2024 12:14:57 +0200 Subject: [PATCH 053/228] fix ocrd_tool.schema.yml cardinality oneOf syntax, update spec --- repo/spec | 2 +- src/ocrd_validators/ocrd_tool.schema.yml | 43 +++++++++++------------- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/repo/spec b/repo/spec index 2bbd4dd916..2948bca7bd 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 2bbd4dd916519f567e5c648b24c0b5ca6fc8a183 +Subproject commit 2948bca7bda274137221abfdc0765c52beeedc33 diff --git a/src/ocrd_validators/ocrd_tool.schema.yml 
b/src/ocrd_validators/ocrd_tool.schema.yml index db1b61458e..5de65a04ee 100644 --- a/src/ocrd_validators/ocrd_tool.schema.yml +++ b/src/ocrd_validators/ocrd_tool.schema.yml @@ -11,7 +11,7 @@ properties: type: string pattern: '^[0-9]+\.[0-9]+\.[0-9]+$' git_url: - description: Github/Gitlab URL + description: GitHub/GitLab URL type: string format: url dockerhub: @@ -37,14 +37,14 @@ properties: type: string input_file_grp: deprecated: true - description: Input fileGrp@USE this tool expects by default + description: (DEPRECATED) Input fileGrp@USE this tool expects by default type: array items: type: string # pattern: '^OCR-D-[A-Z0-9-]+$' output_file_grp: deprecated: true - description: Output fileGrp@USE this tool produces by default + description: (DEPRECATED) Output fileGrp@USE this tool produces by default type: array items: type: string @@ -52,31 +52,26 @@ properties: input_file_grp_cardinality: description: Number of (comma-separated) input fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) oneOf: - - items: + - type: number + multipleOf: 1 + - type: array + items: type: number multipleOf: 1 - - items: - type: array - items: - type: number - multipleOf: 1 - minItems: 2 - maxItems: 2 + minItems: 2 + maxItems: 2 default: 1 - additionalProperties: false output_file_grp_cardinality: description: Number of (comma-separated) output fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) oneOf: - - items: + - type: number + multipleOf: 1 + - type: array + items: type: number multipleOf: 1 - - items: - type: array - items: - type: number - multipleOf: 1 - minItems: 2 - maxItems: 2 + minItems: 2 + maxItems: 2 default: 1 parameters: description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas. @@ -152,9 +147,9 @@ properties: description: "If parameter is reference to file: Whether the file should be cached, e.g. 
because it is large and won't change." default: false description: - description: Concise description what the tool does + description: Concise description of what the tool does categories: - description: Tools belong to this categories, representing modules within the OCR-D project structure + description: Tools belong to these categories, representing modules within the OCR-D project structure type: array items: type: string @@ -229,7 +224,7 @@ properties: default: 'as-is' path_in_archive: type: string - description: if type is archive, the resource is at this location in the archive + description: If type is archive, the resource is at this location in the archive default: '.' version_range: type: string @@ -237,4 +232,4 @@ properties: default: '>= 0.0.1' size: type: number - description: Size of the resource in bytes + description: "Size of the resource in bytes to be retrieved (for archives: size of the archive)" From 5aba83b91ea2d37943f13dddc7ab3c7c444c9af5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Aug 2024 12:22:53 +0200 Subject: [PATCH 054/228] bashlib: fix ocrd__minversion test syntax --- src/ocrd/lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 65ef9c1ce4..82fa2005dc 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -58,7 +58,7 @@ ocrd__minversion () { return elif (( ${version[2]} == ${minversion[2]}));then # Match prerelease suffix like a1, b1 alphabetically - if [[ $version_prerelease_suffix = $minversion_prerelease_suffix -o $version_prerelease_suffix > $minversion_prerelease_suffix ]]; then + if [ "$version_prerelease_suffix" = "$minversion_prerelease_suffix" -o "$version_prerelease_suffix" \> "$minversion_prerelease_suffix" ]; then return fi fi From 3d094d6cac7cca62fec4555f95d35ccac828cc14 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 13:30:02 +0200 Subject: [PATCH 055/228] reimplement OcrdPageResult --- src/ocrd/processor/base.py | 4 ++-- 
src/ocrd/processor/builtin/dummy_processor.py | 7 +++---- src/ocrd/processor/ocrd_page_result.py | 15 +++++++++++++++ src/ocrd_models/__init__.py | 1 - 4 files changed, 20 insertions(+), 7 deletions(-) create mode 100644 src/ocrd/processor/ocrd_page_result.py diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 2a4679ed47..5e3b8a7fd0 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -24,7 +24,7 @@ from ocrd.workspace import Workspace from ocrd_models.ocrd_file import OcrdFile -from ocrd_models.ocrd_process_result import OcrdProcessResult +from ocrd.processor.ocrd_page_result import OcrdPageResult from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -351,7 +351,7 @@ def process_page_file(self, *input_files : OcrdFile) -> None: mimetype=MIMETYPE_PAGE, content=to_xml(result.pcgts)) - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdProcessResult: + def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPageResult: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index e01f097d3e..4ddb434f24 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -6,11 +6,10 @@ from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor +from ocrd.processor.ocrd_page_result import OcrdPageResult from ocrd_models.ocrd_page import OcrdPage, to_xml -from ocrd_models.ocrd_process_result import OcrdProcessResult from ocrd_utils import ( getLogger, - assert_file_grp_cardinality, make_file_id, MIME_TO_EXT, MIMETYPE_PAGE, @@ -26,9 +25,9 @@ class DummyProcessor(Processor): Bare-bones processor creates 
PAGE-XML and optionally copies file from input group to output group """ - def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdProcessResult: + def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult: # nothing to do here - return OcrdProcessResult(input_pcgts[0]) + return OcrdPageResult(input_pcgts[0]) def process_page_file(self, *input_files): LOG = getLogger('ocrd.dummy') diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py new file mode 100644 index 0000000000..6e00bd4e58 --- /dev/null +++ b/src/ocrd/processor/ocrd_page_result.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass, field +from typing import List +from ocrd_models.ocrd_page import OcrdPage +from PIL.Image import Image + +@dataclass +class OcrdPageResultImage(): + pil : Image + file_id : str + file_path : str + +@dataclass +class OcrdPageResult(): + pcgts : OcrdPage + images : List[OcrdPageResultImage] = field(default_factory=list) diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index 19d80a0722..a89ee1dec8 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -7,4 +7,3 @@ from .ocrd_mets import OcrdMets from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport -from .ocrd_process_result import OcrdProcessResult From f8b6896bf29f960cfdfea8941d2b5fbb2b2e81fa Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Aug 2024 13:44:13 +0200 Subject: [PATCH 056/228] update spec (with new ocrd_tool.schema) --- repo/spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo/spec b/repo/spec index 2948bca7bd..cb1ba2e72b 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 2948bca7bda274137221abfdc0765c52beeedc33 +Subproject commit cb1ba2e72bd176f1a1076eea38d6438c647e68e7 From 
72eb75b6509fdd3ba2e8d44fe4c6508b305110a6 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 13:44:48 +0200 Subject: [PATCH 057/228] update spec to v3.25.0, ocrd_tool.schema.yml --- repo/spec | 2 +- src/ocrd_validators/ocrd_tool.schema.yml | 43 +++++++++++------------- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/repo/spec b/repo/spec index 2bbd4dd916..cb1ba2e72b 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 2bbd4dd916519f567e5c648b24c0b5ca6fc8a183 +Subproject commit cb1ba2e72bd176f1a1076eea38d6438c647e68e7 diff --git a/src/ocrd_validators/ocrd_tool.schema.yml b/src/ocrd_validators/ocrd_tool.schema.yml index db1b61458e..5de65a04ee 100644 --- a/src/ocrd_validators/ocrd_tool.schema.yml +++ b/src/ocrd_validators/ocrd_tool.schema.yml @@ -11,7 +11,7 @@ properties: type: string pattern: '^[0-9]+\.[0-9]+\.[0-9]+$' git_url: - description: Github/Gitlab URL + description: GitHub/GitLab URL type: string format: url dockerhub: @@ -37,14 +37,14 @@ properties: type: string input_file_grp: deprecated: true - description: Input fileGrp@USE this tool expects by default + description: (DEPRECATED) Input fileGrp@USE this tool expects by default type: array items: type: string # pattern: '^OCR-D-[A-Z0-9-]+$' output_file_grp: deprecated: true - description: Output fileGrp@USE this tool produces by default + description: (DEPRECATED) Output fileGrp@USE this tool produces by default type: array items: type: string @@ -52,31 +52,26 @@ properties: input_file_grp_cardinality: description: Number of (comma-separated) input fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) oneOf: - - items: + - type: number + multipleOf: 1 + - type: array + items: type: number multipleOf: 1 - - items: - type: array - items: - type: number - multipleOf: 1 - minItems: 2 - maxItems: 2 + minItems: 2 + maxItems: 2 default: 1 - additionalProperties: false output_file_grp_cardinality: description: Number of 
(comma-separated) output fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) oneOf: - - items: + - type: number + multipleOf: 1 + - type: array + items: type: number multipleOf: 1 - - items: - type: array - items: - type: number - multipleOf: 1 - minItems: 2 - maxItems: 2 + minItems: 2 + maxItems: 2 default: 1 parameters: description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas. @@ -152,9 +147,9 @@ properties: description: "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change." default: false description: - description: Concise description what the tool does + description: Concise description of what the tool does categories: - description: Tools belong to this categories, representing modules within the OCR-D project structure + description: Tools belong to these categories, representing modules within the OCR-D project structure type: array items: type: string @@ -229,7 +224,7 @@ properties: default: 'as-is' path_in_archive: type: string - description: if type is archive, the resource is at this location in the archive + description: If type is archive, the resource is at this location in the archive default: '.' 
version_range: type: string @@ -237,4 +232,4 @@ properties: default: '>= 0.0.1' size: type: number - description: Size of the resource in bytes + description: "Size of the resource in bytes to be retrieved (for archives: size of the archive)" From 75cb20c36ef9f82f858a82b9dc393679d7b20f8a Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 14:24:47 +0200 Subject: [PATCH 058/228] process_page_file: fix handling of images --- src/ocrd/processor/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5e3b8a7fd0..626c3ca97f 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -334,13 +334,13 @@ def process_page_file(self, *input_files : OcrdFile) -> None: log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) result = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) - for output_image_pil, output_image_id, output_image_path in result.images: + for image in result.images: self.workspace.save_image_file( - output_image_pil, - output_image_id, + image.pil, + image.file_id, self.output_file_grp, page_id=page_id, - file_path=output_image_path) + file_path=image.file_path) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) # FIXME: what about non-PAGE output like JSON ??? 
From 9a1c7ad083f2266e0aec3fe4cde6b956e04d7567 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 14:57:13 +0200 Subject: [PATCH 059/228] process_page_pcgts: remove output_file_id, replace OcrdPageResult.file_id with OcrdPageResult.file_id_suffix --- src/ocrd/processor/base.py | 41 +++++++++++-------- src/ocrd/processor/builtin/dummy_processor.py | 12 ++++-- src/ocrd/processor/ocrd_page_result.py | 2 +- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 626c3ca97f..1985377562 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -15,7 +15,7 @@ import os from os import getcwd from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Union import sys import inspect import tarfile @@ -23,8 +23,9 @@ from deprecated import deprecated from ocrd.workspace import Workspace -from ocrd_models.ocrd_file import OcrdFile +from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -200,10 +201,11 @@ def verify(self): assert self.output_file_grp is not None input_file_grps = self.input_file_grp.split(',') output_file_grps = self.output_file_grp.split(',') - def assert_file_grp_cardinality(grps, spec, msg): + def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg): if isinstance(spec, int) and spec > 0: assert len(grps) == spec, msg % (len(grps), str(spec)) else: + assert isinstance(spec, list) minimum = spec[0] maximum = spec[1] if minimum > 0: @@ -291,7 +293,7 @@ def process_workspace(self, workspace: Workspace) -> None: # - ResourceNotFoundError → use ResourceManager to download (once), then retry # - transient (I/O or OOM) error → maybe sleep, retry # - persistent (data) error → skip / dummy / raise - input_files = [None] * 
len(input_file_tuple) + input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) for i, input_file in enumerate(input_file_tuple): if i == 0: log.info("processing page %s", input_file.pageId) @@ -311,7 +313,7 @@ def process_workspace(self, workspace: Workspace) -> None: # fall back to deprecated method self.process() - def process_page_file(self, *input_files : OcrdFile) -> None: + def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: """ Process the given ``input_files`` of the :py:attr:`workspace`, representing one physical page (passed as one opened @@ -323,21 +325,25 @@ def process_page_file(self, *input_files : OcrdFile) -> None: to handle cases like multiple fileGrps, non-PAGE input etc.) """ log = getLogger('ocrd.processor.base') - input_pcgts : List[OcrdPage] = [None] * len(input_files) + input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) + assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile)) page_id = input_files[0].pageId for i, input_file in enumerate(input_files): + assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) # FIXME: what about non-PAGE input like image or JSON ??? 
log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) try: - input_pcgts[i] = page_from_file(input_file) + page_ = page_from_file(input_file) + assert isinstance(page_, PcGtsType) + input_pcgts[i] = page_ except ValueError as e: log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) - result = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) + result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image in result.images: self.workspace.save_image_file( image.pil, - image.file_id, + f'{output_file_id}_{image.file_id_suffix}', self.output_file_grp, page_id=page_id, file_path=image.file_path) @@ -351,18 +357,21 @@ def process_page_file(self, *input_files : OcrdFile) -> None: mimetype=MIMETYPE_PAGE, content=to_xml(result.pcgts)) - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed :py:class:`~ocrd_models.OcrdPage` per input fileGrp) under the given :py:attr:`parameter`, and return the - resulting :py:class:`~ocrd_models.OcrdPage`. - - Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage` - and one or more lists or tuples of :py:class:`PIL.Image` (image data), - :py:class:str (file ID) and :py:class:str (file path) of derived images - to be annotated along with the resulting PAGE file. + resulting :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult`. 
+ + Optionally, add to the ``images`` attribute of the resulting + :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult` instances + of :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResultImage`, + which have required fields for ``pil`` (:py:class:`PIL.Image` image data), + ``file_id_suffix`` (used for generating IDs of saved images) and + ``file_path`` (the path used in the AlternativeImage and for saving the + file). (This contains the main functionality and must be overridden by subclasses.) """ diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 4ddb434f24..5ef76d2fa5 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -1,13 +1,15 @@ # pylint: disable=missing-module-docstring,invalid-name from os.path import join, basename -from typing import Optional +from typing import Optional, Union import click from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile from ocrd_models.ocrd_page import OcrdPage, to_xml +from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( getLogger, make_file_id, @@ -25,13 +27,16 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + assert input_pcgts[0] # nothing to do here return OcrdPageResult(input_pcgts[0]) - def process_page_file(self, *input_files): + def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: LOG = getLogger('ocrd.dummy') 
input_file = input_files[0] + assert input_file + assert input_file.local_filename if self.parameter['copy_files'] and input_file.mimetype != MIMETYPE_PAGE: # we need to mimic the actual copying in addition to the PAGE boilerplate file_id = make_file_id(input_file, self.output_file_grp) @@ -49,6 +54,7 @@ def process_page_file(self, *input_files): content=content) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) + assert isinstance(pcgts, PcGtsType) pcgts = self.process_page_pcgts(pcgts).pcgts pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py index 6e00bd4e58..92f926cb93 100644 --- a/src/ocrd/processor/ocrd_page_result.py +++ b/src/ocrd/processor/ocrd_page_result.py @@ -6,7 +6,7 @@ @dataclass class OcrdPageResultImage(): pil : Image - file_id : str + file_id_suffix : str file_path : str @dataclass From 60ad4247eab4d54846620431a2b23dd71499f8c4 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 15:07:48 +0200 Subject: [PATCH 060/228] OcrdPageResultImage requires passing alternative_image w/o filename set --- src/ocrd/processor/base.py | 13 ++++++++----- src/ocrd/processor/ocrd_page_result.py | 4 +++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 1985377562..8feedcb886 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -9,7 +9,7 @@ 'run_processor' ] -from os.path import exists +from os.path import exists, join from shutil import copyfileobj import json import os @@ -340,13 +340,16 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) - for image in result.images: + for image_result in result.images: + image_file_id = 
f'{output_file_id}_{image_result.file_id_suffix}' + image_file_path = join(self.output_file_grp, f'{image_file_id}.png') + image_result.alternative_image.set_filename(image_file_path) self.workspace.save_image_file( - image.pil, - f'{output_file_id}_{image.file_id_suffix}', + image_result.pil, + image_file_id, self.output_file_grp, page_id=page_id, - file_path=image.file_path) + file_path=image_file_path) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) # FIXME: what about non-PAGE output like JSON ??? diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py index 92f926cb93..c63330c734 100644 --- a/src/ocrd/processor/ocrd_page_result.py +++ b/src/ocrd/processor/ocrd_page_result.py @@ -3,11 +3,13 @@ from ocrd_models.ocrd_page import OcrdPage from PIL.Image import Image +from ocrd_models.ocrd_page_generateds import AlternativeImageType + @dataclass class OcrdPageResultImage(): pil : Image file_id_suffix : str - file_path : str + alternative_image : AlternativeImageType @dataclass class OcrdPageResult(): From 50dfdd6356395f9965505f915e4641d1dc553834 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Aug 2024 17:06:19 +0200 Subject: [PATCH 061/228] Processor.verify: handle -1 case Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd/processor/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 8feedcb886..230d1fdbab 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -202,8 +202,9 @@ def verify(self): input_file_grps = self.input_file_grp.split(',') output_file_grps = self.output_file_grp.split(',') def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg): - if isinstance(spec, int) and spec > 0: - assert len(grps) == spec, msg % (len(grps), str(spec)) + if isinstance(spec, int): + if spec > 0: + assert len(grps) == spec, 
msg % (len(grps), str(spec)) else: assert isinstance(spec, list) minimum = spec[0] From 53f2634280c437bc057e23bfc7e0992ae7930a82 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Aug 2024 17:21:32 +0200 Subject: [PATCH 062/228] processor.base: remove obsolete FIXME Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd/processor/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 230d1fdbab..17d9eac40a 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -331,7 +331,6 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc page_id = input_files[0].pageId for i, input_file in enumerate(input_files): assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) - # FIXME: what about non-PAGE input like image or JSON ??? log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) try: page_ = page_from_file(input_file) From d210afa527003c7f8ed4af5ea3853dc0db5ccd52 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Aug 2024 17:26:14 +0200 Subject: [PATCH 063/228] Processor.process_page_pcgts: update docstring for file_path/alternative_image Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd/processor/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 17d9eac40a..9daa23697e 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -372,9 +372,9 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult` instances of :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResultImage`, which have required fields for ``pil`` (:py:class:`PIL.Image` image data), - ``file_id_suffix`` (used for generating IDs of saved images) and - ``file_path`` (the path used 
in the AlternativeImage and for saving the - file). + ``file_id_suffix`` (used for generating IDs of the saved image) and + ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType` + for setting the filename of the saved image). (This contains the main functionality and must be overridden by subclasses.) """ From 5718cf92b7a1729f789992569de801660837cb76 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 17:28:44 +0200 Subject: [PATCH 064/228] export OcrdPageResult{Image} from ocrd.processor --- src/ocrd/processor/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py index 21b0c69eb2..0b3ce5a56e 100644 --- a/src/ocrd/processor/__init__.py +++ b/src/ocrd/processor/__init__.py @@ -2,6 +2,10 @@ Processor, ResourceNotFoundError ) +from .ocrd_page_result import ( + OcrdPageResult, + OcrdPageResultImage +) from .helpers import ( run_cli, run_processor, From f5f3145ef4dc902edf71e72ee4ca5fdaf6640361 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 15 Aug 2024 17:29:17 +0200 Subject: [PATCH 065/228] Processor.process.page_pcgts: simplify references in docstring --- src/ocrd/processor/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 8feedcb886..54f05d6d12 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -366,11 +366,11 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option representing one physical page (passed as one parsed :py:class:`~ocrd_models.OcrdPage` per input fileGrp) under the given :py:attr:`parameter`, and return the - resulting :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult`. + resulting :py:class:`~ocrd.processor.OcrdPageResult`. 
Optionally, add to the ``images`` attribute of the resulting - :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult` instances - of :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResultImage`, + :py:class:`~ocrd.processor.OcrdPageResult` instances + of :py:class:`~ocrd.processor.OcrdPageResultImage`, which have required fields for ``pil`` (:py:class:`PIL.Image` image data), ``file_id_suffix`` (used for generating IDs of saved images) and ``file_path`` (the path used in the AlternativeImage and for saving the From 7045318105e9e58328a6c12adb63e0fbae1e9a69 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 17:34:43 +0200 Subject: [PATCH 066/228] allow "from ocrd_models import OcrdPage --- src/ocrd_models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index a89ee1dec8..330fefe97d 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -5,5 +5,6 @@ from .ocrd_exif import OcrdExif from .ocrd_file import OcrdFile, ClientSideOcrdFile from .ocrd_mets import OcrdMets +from .ocrd_page import OcrdPage from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport From 3220e3f6d9805949e2e40f48a43f2509e0e25936 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 19:11:12 +0200 Subject: [PATCH 067/228] :memo: v3.0.0a1 --- CHANGELOG.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd816a3545..1b6a47d02c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,20 @@ Change Log Versioned according to [Semantic Versioning](http://semver.org/). 
+## [3.0.0a1] - 2024-08-15 + +Changed: + - :fire: Deprecate `Processor.process` + - update spec to v3.25.0, which allows annotating fileGrp cardinality in `ocrd-tool.json` + - `ocrd.processor`: Handle loading of bundled `ocrd-tool.json` generically + +Added: + - `Processor.process_workspace`: process a complete workspace, with default implementation + - `Processor.process_page_file`: process an OcrdFile, with default implementation + - `Processor.process_page_pcgts`: process a single OcrdPage, produce a single OcrdPage, required to implement + - `Processor.verify`: handle fileGrp cardinality verification, with default implementation + - `Processor.setup`: to set up processor before processing, optional + ## Unreleased Changed: From e1f5744746b29851aa2e2241f8ea3546be965cdc Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 16 Aug 2024 09:58:37 +0200 Subject: [PATCH 068/228] Update CHANGELOG.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- CHANGELOG.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b6a47d02c..b0e89bb8f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,12 @@ Versioned according to [Semantic Versioning](http://semver.org/). Changed: - :fire: Deprecate `Processor.process` - - update spec to v3.25.0, which allows annotating fileGrp cardinality in `ocrd-tool.json` + - update spec to v3.25.0, which requires annotating fileGrp cardinality in `ocrd-tool.json` + - :fire: Remove passing non-processing kwargs to `Processor` constructor, add as members + (i.e. `show_help`, `dump_json`, `dump_module_dir`, `list_resources`, `show_resource`, `resolve_resource`) + - :fire: Deprecate passing processing arg / kwargs to `Processor` constructor + (i.e. 
`workspace`, `page_id`, `input_file_grp`, `output_file_grp`; now all set by `run_processor`) + - :fire: Deprecate passing `ocrd-tool.json` metadata to `Processor` constructor - `ocrd.processor`: Handle loading of bundled `ocrd-tool.json` generically Added: From 80d42f1bb17a67d6b32e4edb8cb3e66ce42badd4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:00:02 +0200 Subject: [PATCH 069/228] ocrd: more convenience imports --- src/ocrd/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/__init__.py b/src/ocrd/__init__.py index 9aa507b2c0..e4c782685b 100644 --- a/src/ocrd/__init__.py +++ b/src/ocrd/__init__.py @@ -15,7 +15,8 @@ """ from ocrd.processor.base import run_processor, run_cli, Processor, ResourceNotFoundError -from ocrd_models import OcrdMets, OcrdExif, OcrdFile, OcrdAgent +from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage +from ocrd_models import OcrdMets, OcrdPage, OcrdExif, OcrdFile, OcrdAgent from ocrd.resolver import Resolver from ocrd_validators import * from ocrd.workspace import Workspace From 0e57b4b3897b2dc03a0c8480d146e2e403ee4a23 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:01:01 +0200 Subject: [PATCH 070/228] ocrd.cli: more fix module import order, export help cmd --- src/ocrd/cli/__init__.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 9b80abeb4d..198406afd9 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -10,7 +10,18 @@ from ocrd_utils import config -__all__ = ['cli'] +from ..decorators import ocrd_loglevel +from .ocrd_tool import ocrd_tool_cli +from .workspace import workspace_cli +from .process import process_cli +from .bashlib import bashlib_cli +from .validate import validate_cli +from .resmgr import resmgr_cli +from .zip import zip_cli +from .log import log_cli +from .network import network_cli + 
+__all__ = ['cli', 'command_with_replaced_help'] _epilog = f""" @@ -54,6 +65,7 @@ {config.describe('OCRD_LOGGING_DEBUG')} """ + def command_with_replaced_help(*replacements): class CommandWithReplacedHelp(click.Command): @@ -66,17 +78,6 @@ def get_help(self, ctx): return CommandWithReplacedHelp -from ocrd.cli.ocrd_tool import ocrd_tool_cli -from ocrd.cli.workspace import workspace_cli -from ocrd.cli.process import process_cli -from ocrd.cli.bashlib import bashlib_cli -from ocrd.cli.validate import validate_cli -from ocrd.cli.resmgr import resmgr_cli -from ocrd.decorators import ocrd_loglevel -from .zip import zip_cli -from .log import log_cli -from .network import network_cli - @click.group(epilog=_epilog) @click.version_option(package_name='ocrd') From 9cfd70cffcc71118293a391310ed7eb3eff7b7a4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:01:19 +0200 Subject: [PATCH 071/228] fix imports --- src/ocrd/decorators/parameter_option.py | 2 +- src/ocrd/workspace.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 0fbe3e0577..55abbc2a53 100644 --- a/src/ocrd/decorators/parameter_option.py +++ b/src/ocrd/decorators/parameter_option.py @@ -1,10 +1,10 @@ from click import option -#from ocrd_utils import parse_json_string_or_file __all__ = ['parameter_option', 'parameter_override_option'] def _handle_param_option(ctx, param, value): + from ocrd_utils import parse_json_string_or_file return parse_json_string_or_file(*list(value)) parameter_option = option('-p', '--parameter', diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index eeaa6434fd..509b8123b9 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -24,6 +24,7 @@ coordinates_of_segment, adjust_canvas_to_rotation, adjust_canvas_to_transposition, + scale_coordinates, shift_coordinates, rotate_coordinates, transform_coordinates, From 
95212b598f19ca4576ff2c22ec5573bf1cd5de7a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:04:19 +0200 Subject: [PATCH 072/228] fix type assertion --- src/ocrd/processor/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 46417ac2cc..79b52dde60 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -25,7 +25,6 @@ from ocrd.workspace import Workspace from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile from ocrd.processor.ocrd_page_result import OcrdPageResult -from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -334,9 +333,10 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) try: page_ = page_from_file(input_file) - assert isinstance(page_, PcGtsType) + assert isinstance(page_, OcrdPage) input_pcgts[i] = page_ except ValueError as e: + # not PAGE and not an image to generate PAGE for log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) From 4aa288a7ff57a2fd2255d5ed8606ea39102d42a5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:13:58 +0200 Subject: [PATCH 073/228] ocrd_utils: forgot to export scale_coordinates at toplvl --- src/ocrd_utils/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index 2055758a89..78400791aa 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -13,6 +13,7 @@ :py:meth:`ocrd.workspace.Workspace.image_from_segment`.) 
* :py:func:`rotate_coordinates`, + :py:func:`scale_coordinates`, :py:func:`shift_coordinates`, :py:func:`transpose_coordinates`, :py:func:`transform_coordinates` @@ -147,6 +148,7 @@ polygon_mask, rotate_coordinates, rotate_image, + scale_coordinates, shift_coordinates, transform_coordinates, transpose_coordinates, From 8044e60590655e19ec3a3127efb925e428d08e6a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:34:57 +0200 Subject: [PATCH 074/228] fix 9cfd70cffcc --- src/ocrd/cli/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 198406afd9..a79faabe96 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -21,7 +21,7 @@ from .log import log_cli from .network import network_cli -__all__ = ['cli', 'command_with_replaced_help'] +__all__ = ['cli'] _epilog = f""" From 21ff810f68c76311ce504d00bc9babde7d14b963 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:58:58 +0200 Subject: [PATCH 075/228] fix 9cfd70cffcc (revert to wrong import order to avoid circle) --- src/ocrd/cli/__init__.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index a79faabe96..322cbde19f 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -10,19 +10,6 @@ from ocrd_utils import config -from ..decorators import ocrd_loglevel -from .ocrd_tool import ocrd_tool_cli -from .workspace import workspace_cli -from .process import process_cli -from .bashlib import bashlib_cli -from .validate import validate_cli -from .resmgr import resmgr_cli -from .zip import zip_cli -from .log import log_cli -from .network import network_cli - -__all__ = ['cli'] - _epilog = f""" \b @@ -65,7 +52,6 @@ {config.describe('OCRD_LOGGING_DEBUG')} """ - def command_with_replaced_help(*replacements): class CommandWithReplacedHelp(click.Command): @@ -79,6 +65,17 @@ def 
get_help(self, ctx): return CommandWithReplacedHelp +from ..decorators import ocrd_loglevel +from .ocrd_tool import ocrd_tool_cli +from .workspace import workspace_cli +from .process import process_cli +from .bashlib import bashlib_cli +from .validate import validate_cli +from .resmgr import resmgr_cli +from .zip import zip_cli +from .log import log_cli +from .network import network_cli + @click.group(epilog=_epilog) @click.version_option(package_name='ocrd') @ocrd_loglevel @@ -96,3 +93,5 @@ def cli(**kwargs): # pylint: disable=unused-argument cli.add_command(log_cli) cli.add_command(resmgr_cli) cli.add_command(network_cli) + +__all__ = ['cli'] From 4077e8d8f8c306d524bdb0faf5faa9557999d556 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 16 Aug 2024 16:09:30 +0200 Subject: [PATCH 076/228] s,PcGtsType,OcrdPage, --- src/ocrd/processor/base.py | 2 +- src/ocrd/processor/builtin/dummy_processor.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 79b52dde60..344569677e 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -383,7 +383,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option def add_metadata(self, pcgts: OcrdPage) -> None: """ Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing - the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``. + the processing step and runtime parameters to :py:class:`~ocrd_models.OcrdPage` ``pcgts``. 
""" metadata_obj = pcgts.get_Metadata() assert metadata_obj is not None diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 5ef76d2fa5..1b3f7a5aa0 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -9,7 +9,6 @@ from ocrd.processor.ocrd_page_result import OcrdPageResult from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile from ocrd_models.ocrd_page import OcrdPage, to_xml -from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( getLogger, make_file_id, @@ -54,7 +53,7 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr content=content) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) - assert isinstance(pcgts, PcGtsType) + assert isinstance(pcgts, OcrdPage) pcgts = self.process_page_pcgts(pcgts).pcgts pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) From cd4c96c94c6424628de2ccf2eb503d6933eaad9d Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 14:17:14 +0200 Subject: [PATCH 077/228] add config.OCRD_DOWNLOAD_INPUT --- src/ocrd/cli/__init__.py | 2 ++ src/ocrd/processor/base.py | 6 ++++-- src/ocrd_utils/config.py | 18 +++++++++++++----- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 322cbde19f..bf262b0b96 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -29,6 +29,8 @@ \b {config.describe('OCRD_DOWNLOAD_TIMEOUT')} \b +{config.describe('OCRD_DOWNLOAD_INPUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677e..78f8b12374 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -29,6 +29,7 @@ VERSION as OCRD_VERSION, MIMETYPE_PAGE, MIME_TO_EXT, + config, getLogger, initLogging, list_resource_candidates, @@ -111,7 +112,7 @@ 
def __init__( input_file_grp=None, output_file_grp=None, page_id=None, - download_files=True, + download_files=config.OCRD_DOWNLOAD_INPUT, version=None ): """ @@ -137,7 +138,8 @@ def __init__( (or empty for all pages). \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. - download_files (boolean): Whether input files will be downloaded prior to processing. + download_files (boolean): Whether input files will be downloaded prior to processing, \ + defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT` which is ``True`` by default """ if ocrd_tool is not None: deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - " diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index d0955a8dcf..22a566e7bc 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -12,6 +12,8 @@ from tempfile import gettempdir from textwrap import fill, indent +_validator_boolean = lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1') +_parser_boolean = lambda val: bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1') class OcrdEnvVariable(): @@ -102,8 +104,8 @@ def raw_value(self, name): config.add('OCRD_METS_CACHING', description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.', - validator=lambda val: val in ('true', 'false', '0', '1'), - parser=lambda val: val in ('true', '1')) + validator=_validator_boolean, + parser=_parser_boolean) config.add('OCRD_MAX_PROCESSOR_CACHE', description="Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.", @@ -125,7 +127,7 @@ def raw_value(self, name): description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz") config.add("OCRD_DOWNLOAD_RETRIES", - description="Number of 
times to retry failed attempts for downloads of resource or workspace files.", + description="Number of times to retry failed attempts for downloads of resources or workspace files.", validator=int, parser=int) @@ -141,6 +143,12 @@ def _ocrd_download_timeout_parser(val): description="Timeout in seconds for connecting or reading (comma-separated) when downloading.", parser=_ocrd_download_timeout_parser) +config.add("OCRD_DOWNLOAD_INPUT", + description="Whether to download files not present locally during processing", + default=(True, True), + validator=_validator_boolean, + parser=_parser_boolean) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) @@ -190,5 +198,5 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_LOGGING_DEBUG", description="Print information about the logging setup to STDERR", default=(True, False), - validator=lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1'), - parser=lambda val: val if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')) + validator=_validator_boolean, + parser=_parser_boolean) From 312525517f9c01f25634d35bbef256b5c56372e7 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 14:51:39 +0200 Subject: [PATCH 078/228] define self.logger in processor base constructor --- src/ocrd/processor/base.py | 14 +++++++++----- src/ocrd/processor/builtin/dummy_processor.py | 3 +-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677e..5d9637b80a 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -47,6 +47,8 @@ # XXX imports must remain for backwards-compatibility from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import +_logger = getLogger('ocrd.processor.base') + class ResourceNotFoundError(FileNotFoundError): 
""" An exception signifying the requested processor resource @@ -175,6 +177,9 @@ def __init__( if not report.is_valid: raise ValueError("Invalid parameters %s" % report.errors) self.parameter = parameter + # NOTE: this is the logger to be used by processor implementations, + # `processor.base` default implementations should use :py:attr:`_logger` + self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) @@ -562,7 +567,6 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): if not self.input_file_grp: raise ValueError("Processor is missing input fileGrp") - LOG = getLogger('ocrd.processor.base') ifgs = self.input_file_grp.split(",") # Iterating over all files repeatedly may seem inefficient at first sight, # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering @@ -582,13 +586,13 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") if on_error == 'abort': raise ValueError(msg) - LOG.warning(msg) + _logger.warning(msg) for file_ in files_: if not file_.pageId: continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: - LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) + _logger.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) # fileGrp has multiple files for this page ID if mimetype: # filter was active, this must not happen @@ -627,14 +631,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: - LOG.debug("adding file %s for page %s to input file group %s", 
file_.ID, file_.pageId, ifg) + _logger.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? - LOG.error('found no page %s in file group %s', + _logger.error('found no page %s in file group %s', page, ifg) if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 1b3f7a5aa0..29082e72d7 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -32,7 +32,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional return OcrdPageResult(input_pcgts[0]) def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: - LOG = getLogger('ocrd.dummy') input_file = input_files[0] assert input_file assert input_file.local_filename @@ -57,7 +56,7 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr pcgts = self.process_page_pcgts(pcgts).pcgts pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) - LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file) + self.logger.info("Add PAGE-XML %s generated for %s", file_id, output_file) self.workspace.add_file(file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, From dcf7c52e0e3c6de3105ff6bab0633cbae2a24ae7 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 18:58:53 +0200 Subject: [PATCH 079/228] OcrdPage proxy object for PcGtsType, including etree and mappings --- src/ocrd_modelfactory/__init__.py | 27 ++++++++++++-------------- src/ocrd_models/ocrd_page.py | 28 +++++++++++++++++++++++---- src/ocrd_validators/page_validator.py | 4 ++-- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py 
index a98499b2e2..c0600e51f8 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -14,9 +14,10 @@ from ocrd_utils import VERSION, MIMETYPE_PAGE, guess_media_type from ocrd_models import OcrdExif, OcrdFile, ClientSideOcrdFile from ocrd_models.ocrd_page import ( - PcGtsType, PageType, MetadataType, + OcrdPage, PcGtsType, PageType, MetadataType, parse, parseEtree ) +from ocrd_utils.deprecate import deprecation_warning __all__ = [ 'exif_from_filename', @@ -39,7 +40,7 @@ def exif_from_filename(image_filename): ocrd_exif = OcrdExif(pil_img) return ocrd_exif -def page_from_image(input_file, with_tree=False): +def page_from_image(input_file : Union[OcrdFile, ClientSideOcrdFile], **kwargs) -> OcrdPage: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` @@ -48,10 +49,9 @@ def page_from_image(input_file, with_tree=False): Arguments: input_file (:py:class:`~ocrd_models.ocrd_file.OcrdFile`): file to open \ and produce a PAGE DOM for - Keyword arguments: - with_tree (boolean): whether to return XML node tree, element-node mapping \ - and reverse mapping, too (cf. 
:py:func:`ocrd_models.ocrd_page.parseEtree`) """ + if 'with_etree' in kwargs: + deprecation_warning('kwarg "with_etree" is obsolete now, we always return OcrdPage including etree') if not input_file.local_filename: raise ValueError("input_file must have 'local_filename' property") if not Path(input_file.local_filename).exists(): @@ -72,14 +72,12 @@ def page_from_image(input_file, with_tree=False): ), pcGtsId=input_file.ID ) - if not with_tree: - return pcgts mapping = dict() - etree = pcgts.to_etree(mapping_=mapping) + etree : ET._Element = pcgts.to_etree(mapping_=mapping) revmap = dict(((node, element) for element, node in mapping.items())) - return pcgts, etree, mapping, revmap + return OcrdPage(pcgts, etree, mapping, revmap) -def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET._Element, dict, dict]]: +def page_from_file(input_file, **kwargs) -> OcrdPage: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path @@ -88,10 +86,9 @@ def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsT Arguments: input_file (:py:class:`~ocrd_models.ocrd_file.OcrdFile` or `str`): file to open \ and produce a PAGE DOM for - Keyword arguments: - with_tree (boolean): whether to return XML node tree, element-node mapping \ - and reverse mapping, too (cf. 
:py:func:`ocrd_models.ocrd_page.parseEtree`) """ + if 'with_etree' in kwargs: + deprecation_warning('kwarg "with_etree" is obsolete now, we always return OcrdPage including etree') if not isinstance(input_file, (OcrdFile, ClientSideOcrdFile)): mimetype = guess_media_type(input_file, application_xml=MIMETYPE_PAGE) input_file = OcrdFile(ET.Element("dummy"), @@ -102,7 +99,7 @@ def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsT if not Path(input_file.local_filename).exists(): raise FileNotFoundError("File not found: '%s' (%s)" % (input_file.local_filename, input_file)) if input_file.mimetype.startswith('image'): - return page_from_image(input_file, with_tree=with_tree) + return page_from_image(input_file) if input_file.mimetype == MIMETYPE_PAGE: - return (parseEtree if with_tree else parse)(input_file.local_filename, silence=True) + return OcrdPage(*parseEtree(input_file.local_filename, silence=True)) raise ValueError("Unsupported mimetype '%s'" % input_file.mimetype) diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index b0cc2b3311..e649baace7 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -2,6 +2,8 @@ API to PAGE-XML, generated with generateDS from XML schema. """ from io import StringIO +from inspect import getmembers +from lxml import etree as ET __all__ = [ 'parse', @@ -174,10 +176,28 @@ """ ) -# add alias for DOM root -OcrdPage = PcGtsType - -def to_xml(el, skip_declaration=False): +class OcrdPage(): + """ + Proxy object for :py:class:`ocrd_models.PcGtsType` that also offers access + to the underlying etree, element-node mapping and reverse mapping, too (cf. 
+ :py:func:`ocrd_models.ocrd_page.parseEtree`) + """ + def __init__( + self, + pcgts : PcGtsType, + etree : ET._Element, + mapping : dict[str, ET._Element], + revmap : dict[ET._Element, str], + ): + self._pcgts = pcgts + self.etree = etree + self.mapping = mapping + self.revmap = revmap + + def __getattr__(self, name): + return getattr(self._pcgts, name) + +def to_xml(el, skip_declaration=False) -> str: """ Serialize ``pc:PcGts`` document as string. """ diff --git a/src/ocrd_validators/page_validator.py b/src/ocrd_validators/page_validator.py index 41ce0b9f94..d6d8a95b57 100644 --- a/src/ocrd_validators/page_validator.py +++ b/src/ocrd_validators/page_validator.py @@ -6,7 +6,7 @@ from shapely.validation import explain_validity from ocrd_utils import getLogger, polygon_from_points, deprecated_alias -from ocrd_models.ocrd_page import parse +from ocrd_models.ocrd_page import OcrdPage, parse from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -236,7 +236,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate and whether the coordinates of an element are fully within its parent element coordinates. 
""" log = getLogger('ocrd.page_validator.validate_consistency') - if isinstance(node, PcGtsType): + if isinstance(node, (PcGtsType, OcrdPage)): # top-level (start recursion) node_id = node.get_pcGtsId() node = node.get_Page() # has no .id From cf45d8b30047c68fa97bfb22de0622232205bc15 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 19:04:53 +0200 Subject: [PATCH 080/228] Processor.base: have a (hopefully) thread-safe logger for the base class --- src/ocrd/processor/base.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5d9637b80a..785a139ece 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -47,7 +47,6 @@ # XXX imports must remain for backwards-compatibility from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import -_logger = getLogger('ocrd.processor.base') class ResourceNotFoundError(FileNotFoundError): """ @@ -178,8 +177,10 @@ def __init__( raise ValueError("Invalid parameters %s" % report.errors) self.parameter = parameter # NOTE: this is the logger to be used by processor implementations, - # `processor.base` default implementations should use :py:attr:`_logger` + # `processor.base` default implementations should use + # :py:attr:`self._base_logger` self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') + self._base_logger = getLogger('ocrd.processor.base') # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) @@ -586,13 +587,13 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") if on_error == 'abort': raise ValueError(msg) - _logger.warning(msg) + self._base_logger.warning(msg) for 
file_ in files_: if not file_.pageId: continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: - _logger.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) + self._base_logger.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) # fileGrp has multiple files for this page ID if mimetype: # filter was active, this must not happen @@ -631,14 +632,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: - _logger.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) + self._base_logger.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? - _logger.error('found no page %s in file group %s', + self._base_logger.error('found no page %s in file group %s', page, ifg) if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) From 785d60736919590aaa6e2c84a6a487dc46d12468 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:05:24 +0200 Subject: [PATCH 081/228] Processor.zip_input_files: warning instead of exception for missing input files --- src/ocrd/processor/base.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677e..958661f79f 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -575,16 +575,9 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), # sort by MIME type so PAGE comes before images key=lambda file_: file_.mimetype) - # Warn if no files found but pageId was specified because that - # might be because of invalid page_id (range) - if 
self.page_id and not files_: - msg = (f"Could not find any files for --page-id {self.page_id} - " - f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") - if on_error == 'abort': - raise ValueError(msg) - LOG.warning(msg) for file_ in files_: if not file_.pageId: + # ignore document-global files continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: @@ -629,13 +622,15 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ + # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) + if self.page_id and not any(pages): + LOG.critical(f"Could not find any files for selected pageId {self.page_id}") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? - LOG.error('found no page %s in file group %s', - page, ifg) + LOG.error(f'Found no page {page} in file group {ifg}') if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts From b12849da6dd4a46dd0d9a121c50f9438cb61d6e1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:07:16 +0200 Subject: [PATCH 082/228] Processor.zip_input_files: introduce NonUniqueInputFile exception --- src/ocrd/processor/__init__.py | 4 ++- src/ocrd/processor/base.py | 46 ++++++++++++++++++++++--------- tests/processor/test_processor.py | 6 ++-- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py index 0b3ce5a56e..b6c1188def 100644 --- a/src/ocrd/processor/__init__.py +++ b/src/ocrd/processor/__init__.py @@ -1,6 +1,8 @@ from .base import ( Processor, - ResourceNotFoundError + ResourceNotFoundError, + NonUniqueInputFile, + MissingInputFile, ) from .ocrd_page_result import ( OcrdPageResult, diff --git a/src/ocrd/processor/base.py 
b/src/ocrd/processor/base.py index 958661f79f..516989ae28 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -55,9 +55,36 @@ class ResourceNotFoundError(FileNotFoundError): def __init__(self, name, executable): self.name = name self.executable = executable - self.message = "Could not find resource '%s' for executable '%s'. " \ - "Try 'ocrd resmgr download %s %s' to download this resource." \ - % (name, executable, executable, name) + self.message = (f"Could not find resource '{name}' for executable '{executable}'. " + f"Try 'ocrd resmgr download {executable} {name}' to download this resource.") + super().__init__(self.message) + +class NonUniqueInputFile(ValueError): + """ + An exception signifying the specified fileGrp / pageId / mimetype + selector yields multiple PAGE files, or no PAGE files but multiple images, + or multiple files of that mimetype. + """ + def __init__(self, fileGrp, pageId, mimetype): + self.fileGrp = fileGrp + self.pageId = pageId + self.mimetype = mimetype + self.message = (f"Could not determine unique input file for fileGrp {fileGrp} " + f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") + super().__init__(self.message) + +class MissingInputFile(ValueError): + """ + An exception signifying the specified fileGrp / pageId / mimetype + selector yields no PAGE files, or no PAGE and no image files, + or no files of that mimetype. + """ + def __init__(self, fileGrp, pageId, mimetype): + self.fileGrp = fileGrp + self.pageId = pageId + self.mimetype = mimetype + self.message = (f"Could not find input file for fileGrp {fileGrp} " + f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") super().__init__(self.message) class Processor(): @@ -352,7 +379,6 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc file_path=image_file_path) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) - # FIXME: what about non-PAGE output like JSON ??? 
self.workspace.add_file(file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, @@ -592,9 +618,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': - raise ValueError( - "Multiple '%s' matches for page '%s' in fileGrp '%s'." % ( - mimetype, file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, mimetype) else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) elif (ift[i].mimetype == MIMETYPE_PAGE and @@ -602,9 +626,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pass # keep PAGE match elif (ift[i].mimetype == MIMETYPE_PAGE and file_.mimetype == MIMETYPE_PAGE): - raise ValueError( - "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % ( - file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, None) else: # filter was inactive but no PAGE is in control, this must not happen if on_error == 'skip': @@ -614,9 +636,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': - raise ValueError( - "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." 
% ( - file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, None) else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 2cf8a189b4..5d565ea707 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -251,7 +251,7 @@ def ocrd_tool(self): assert ('foobar3', 'foobar4') in tuples tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] assert ('foobar3', None) in tuples - with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): + with self.assertRaisesRegex(Exception, "Could not determine unique input file"): tuples = proc.zip_input_files(on_error='abort') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: @@ -260,7 +260,7 @@ def ocrd_tool(self): proc.workspace = ws proc.input_file_grp = 'GRP1,GRP2' proc.page_id = page_id - with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): + with self.assertRaisesRegex(Exception, "Could not determine unique input file"): tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): @@ -281,7 +281,7 @@ def ocrd_tool(self): proc.page_id = page_id assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err if __name__ == "__main__": main(__file__) From 95d36585bf7d97193e56b9143fde0416cd7b799b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:08:15 +0200 Subject: [PATCH 083/228] Processor.process_workspace: zip_input_files w/o require_first --- src/ocrd/processor/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 
deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 516989ae28..5becbf8d81 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -314,17 +314,19 @@ def process_workspace(self, workspace: Workspace) -> None: self.verify() try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) - for input_file_tuple in self.zip_input_files(on_error='abort'): + for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): # FIXME: add error handling by catching exceptions in various ways (#579) # for example: # - ResourceNotFoundError → use ResourceManager to download (once), then retry # - transient (I/O or OOM) error → maybe sleep, retry # - persistent (data) error → skip / dummy / raise input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) + log.info("processing page %s", + next(input_file.pageId + for input_file in input_file_tuple + if input_file)) for i, input_file in enumerate(input_file_tuple): - if i == 0: - log.info("processing page %s", input_file.pageId) - elif input_file is None: + if input_file is None: # file/page not found in this file grp continue input_files[i] = input_file From c7298411bafe74287e15646eaa3b9d20b90c2e65 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 09:33:37 +0200 Subject: [PATCH 084/228] Processor.zip_input_files: introduce MissingInputFile exception and config.OCRD_MISSING_INPUT --- src/ocrd/cli/__init__.py | 2 ++ src/ocrd/processor/base.py | 20 +++++++++++++++----- src/ocrd_utils/config.py | 6 ++++++ tests/processor/test_processor.py | 2 +- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index bf262b0b96..418d7927a3 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -31,6 +31,8 @@ \b {config.describe('OCRD_DOWNLOAD_INPUT')} \b +{config.describe('OCRD_MISSING_INPUT')} +\b 
{config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 0ec0747428..fddf6383a7 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -338,7 +338,7 @@ def process_workspace(self, workspace: Workspace) -> None: input_files[i] = self.workspace.download_file(input_file) except ValueError as e: log.error(repr(e)) - log.warning("skipping file %s for page %s", input_file, input_file.pageId) + log.warning(f"failed downloading file {input_file} for page {input_file.pageId}") self.process_page_file(*input_files) except NotImplementedError: # fall back to deprecated method @@ -611,10 +611,12 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: - LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) + LOG.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}") # fileGrp has multiple files for this page ID if mimetype: # filter was active, this must not happen + LOG.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': @@ -633,6 +635,8 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): raise NonUniqueInputFile(ifg, file_.pageId, None) else: # filter was inactive but no PAGE is in control, this must not happen + LOG.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': @@ -644,7 +648,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: raise Exception("Unknown 
'on_error' strategy '%s'" % on_error) else: - LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) + LOG.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}") ift[i] = file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): @@ -653,8 +657,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: - # other fallback options? - LOG.error(f'Found no page {page} in file group {ifg}') + # could be from non-unique with on_error=skip or from true gap + LOG.error(f'Found no file for page {page} in file group {ifg}') + if config.OCRD_MISSING_INPUT == 'abort': + raise MissingInputFile(ifg, page, mimetype) + if not any(ifiles): + # must be from non-unique with on_error=skip + LOG.warning(f'Found no files for {page} - skipping') + continue if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 22a566e7bc..11af20249f 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -149,6 +149,12 @@ def _ocrd_download_timeout_parser(val): validator=_validator_boolean, parser=_parser_boolean) +config.add("OCRD_MISSING_INPUT", + description="How to deal with missing input files (for some fileGrp/pageId) during processing [SKIP|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'ABORT'], + parser=str) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 5d565ea707..aa2124001a 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -281,7 +281,7 @@ def 
ocrd_tool(self): proc.page_id = page_id assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no file for page phys_0001 in file group GRP1' in r.err if __name__ == "__main__": main(__file__) From 7df81af6a7a9b121569ac4288d079bd3dbd7f884 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 20 Aug 2024 11:24:48 +0200 Subject: [PATCH 085/228] OcrdPage: clearer docstring Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd_models/ocrd_page.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index e649baace7..b28777e72d 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -178,7 +178,8 @@ class OcrdPage(): """ - Proxy object for :py:class:`ocrd_models.PcGtsType` that also offers access + Proxy object for :py:class:`ocrd_models.PcGtsType` (i.e. PRImA PAGE-XML + for page content, rendered as object model by generateDS) that also offers access to the underlying etree, element-node mapping and reverse mapping, too (cf. 
:py:func:`ocrd_models.ocrd_page.parseEtree`) """ From 0ab694201c7c6fe2d55113ca9a5158c6f6834387 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 20 Aug 2024 12:19:36 +0200 Subject: [PATCH 086/228] jsonschema: switch from draft6 to draft2019-09 --- src/ocrd_validators/json_validator.py | 8 ++++---- src/ocrd_validators/parameter_validator.py | 4 ++-- src/ocrd_validators/resource_list_validator.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ocrd_validators/json_validator.py b/src/ocrd_validators/json_validator.py index c920fc7c2d..0edb73ed15 100644 --- a/src/ocrd_validators/json_validator.py +++ b/src/ocrd_validators/json_validator.py @@ -3,7 +3,7 @@ """ import json -from jsonschema import Draft6Validator, validators # pylint: disable=import-error +from jsonschema import Draft201909Validator, validators # pylint: disable=import-error from ocrd_models import ValidationReport @@ -28,7 +28,7 @@ def set_defaults(validator, properties, instance, schema): return validators.extend(validator_class, {"properties": set_defaults}) -DefaultValidatingDraft6Validator = extend_with_default(Draft6Validator) +DefaultValidatingDraft20199Validator = extend_with_default(Draft201909Validator) # # ------------------------------------------------- @@ -52,13 +52,13 @@ def validate(obj, schema): obj = json.loads(obj) return JsonValidator(schema)._validate(obj) # pylint: disable=protected-access - def __init__(self, schema, validator_class=Draft6Validator): + def __init__(self, schema, validator_class=Draft201909Validator): """ Construct a JsonValidator. 
Args: schema (dict): - validator_class (Draft6Validator|DefaultValidatingDraft6Validator): + validator_class (Draft20199Validator|DefaultValidatingDraft20199Validator): """ self.validator = validator_class(schema) diff --git a/src/ocrd_validators/parameter_validator.py b/src/ocrd_validators/parameter_validator.py index 20dd6ff2b7..26364f70fc 100644 --- a/src/ocrd_validators/parameter_validator.py +++ b/src/ocrd_validators/parameter_validator.py @@ -1,7 +1,7 @@ """ Validate parameters against ocrd-tool.json. """ -from .json_validator import JsonValidator, DefaultValidatingDraft6Validator +from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator # # ------------------------------------------------- @@ -45,4 +45,4 @@ def __init__(self, ocrd_tool): "required": required, "additionalProperties": False, "properties": p - }, DefaultValidatingDraft6Validator) + }, DefaultValidatingDraft20199Validator) diff --git a/src/ocrd_validators/resource_list_validator.py b/src/ocrd_validators/resource_list_validator.py index 72a11c34de..d1a77b59be 100644 --- a/src/ocrd_validators/resource_list_validator.py +++ b/src/ocrd_validators/resource_list_validator.py @@ -4,7 +4,7 @@ See `specs `_. """ from .constants import RESOURCE_LIST_SCHEMA -from .json_validator import JsonValidator, DefaultValidatingDraft6Validator +from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator # # ------------------------------------------------- @@ -20,5 +20,5 @@ def validate(obj, schema=RESOURCE_LIST_SCHEMA): """ Validate against ``resource_list.schema.yml`` schema. 
""" - return JsonValidator(schema, validator_class=DefaultValidatingDraft6Validator)._validate(obj) + return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) From 66c50b38eab5521d82361b86ad64b7d5f652f198 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 20 Aug 2024 14:27:31 +0200 Subject: [PATCH 087/228] require jsonschema>4 for draft 2019-09 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ed5fd56d59..3d053075a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ gdown httpx>=0.22.0 importlib_metadata ; python_version < '3.8' importlib_resources ; python_version < '3.10' -jsonschema +jsonschema > 4 lxml memory-profiler >= 0.58.0 # XXX explicitly do not restrict the numpy version because different From 94e2e60d933910c3088885ebb4ca006dc80c5246 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 20 Aug 2024 16:59:14 +0200 Subject: [PATCH 088/228] OcrdToolValidator: set defaults, handle deprecated --- src/ocrd_validators/json_validator.py | 15 +++++++++++---- src/ocrd_validators/ocrd_tool_validator.py | 7 +++++-- tests/validator/test_json_validator.py | 4 ++-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/ocrd_validators/json_validator.py b/src/ocrd_validators/json_validator.py index 0edb73ed15..ccd27b92a2 100644 --- a/src/ocrd_validators/json_validator.py +++ b/src/ocrd_validators/json_validator.py @@ -2,11 +2,15 @@ Validating JSON-Schema """ import json +from warnings import warn -from jsonschema import Draft201909Validator, validators # pylint: disable=import-error +from jsonschema import Draft201909Validator, ValidationError, validators # pylint: disable=import-error from ocrd_models import ValidationReport +class JsonSchemaDeprecationWarning(ValidationError): + pass + # http://python-jsonschema.readthedocs.io/en/latest/faq/ def extend_with_default(validator_class): """ @@ -14,18 +18,20 @@ def 
extend_with_default(validator_class): """ validate_properties = validator_class.VALIDATORS["properties"] - def set_defaults(validator, properties, instance, schema): + def set_defaults_and_handle_deprecate(validator, properties, instance, schema): """ Set defaults in subschemas """ for prop, subschema in properties.items(): if "default" in subschema: instance.setdefault(prop, subschema["default"]) + if subschema.get('deprecated', False): + yield JsonSchemaDeprecationWarning(f"Property {prop} has been deprecated, ocrd-tool.json should be updated.") for error in validate_properties(validator, properties, instance, schema): yield error - return validators.extend(validator_class, {"properties": set_defaults}) + return validators.extend(validator_class, {"properties": set_defaults_and_handle_deprecate}) DefaultValidatingDraft20199Validator = extend_with_default(Draft201909Validator) @@ -74,6 +80,7 @@ def _validate(self, obj): report = ValidationReport() if not self.validator.is_valid(obj): for v in self.validator.iter_errors(obj): + meth = f'add_{"warning" if isinstance(v, JsonSchemaDeprecationWarning) else "error"}' # print(">>>>>>>>> v='%s', obj='%s'" % (v, obj)) - report.add_error("[%s] %s" % ('.'.join(str(vv) for vv in v.path), v.message)) + getattr(report, meth)("[%s] %s" % ('.'.join(str(vv) for vv in v.path), v.message)) return report diff --git a/src/ocrd_validators/ocrd_tool_validator.py b/src/ocrd_validators/ocrd_tool_validator.py index b408bd86e9..827001ef72 100644 --- a/src/ocrd_validators/ocrd_tool_validator.py +++ b/src/ocrd_validators/ocrd_tool_validator.py @@ -4,7 +4,7 @@ See `specs `_. """ from .constants import OCRD_TOOL_SCHEMA -from .json_validator import JsonValidator +from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator # # ------------------------------------------------- @@ -20,4 +20,7 @@ def validate(obj, schema=OCRD_TOOL_SCHEMA): """ Validate against ``ocrd-tool.json`` schema. 
""" - return JsonValidator.validate(obj, schema) + return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access + + def __init__(self, schema, validator_class=...): + super().__init__(schema, DefaultValidatingDraft20199Validator) diff --git a/tests/validator/test_json_validator.py b/tests/validator/test_json_validator.py index 8a8387d4b6..25771b701f 100644 --- a/tests/validator/test_json_validator.py +++ b/tests/validator/test_json_validator.py @@ -1,5 +1,5 @@ from tests.base import TestCase, main -from ocrd_validators.json_validator import JsonValidator, DefaultValidatingDraft6Validator +from ocrd_validators.json_validator import JsonValidator, DefaultValidatingDraft20199Validator class TestParameterValidator(TestCase): @@ -15,7 +15,7 @@ def setUp(self): } } } - self.defaults_validator = JsonValidator(self.schema, DefaultValidatingDraft6Validator) + self.defaults_validator = JsonValidator(self.schema, DefaultValidatingDraft20199Validator) super().setUp() def test_validate_string(self): From 2e7bdc295dad859fbd1374db4f53bf097bc7d5ec Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 20 Aug 2024 17:00:27 +0200 Subject: [PATCH 089/228] processor.base: validate/setdefault ocrd-tool.json on first access --- src/ocrd/processor/base.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677e..8620881c7b 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -20,6 +20,7 @@ import inspect import tarfile import io +from warnings import warn from deprecated import deprecated from ocrd.workspace import Workspace @@ -43,6 +44,7 @@ from ocrd_validators import ParameterValidator from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml from ocrd_modelfactory import page_from_file +from ocrd_validators.ocrd_tool_validator import OcrdToolValidator # XXX imports must remain for backwards-compatibility from .helpers 
import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import @@ -71,15 +73,20 @@ class Processor(): """ @property - def metadata(self): + def metadata(self) -> dict: """the ocrd-tool.json dict of the package""" if hasattr(self, '_metadata'): return self._metadata self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json')) + report = OcrdToolValidator.validate(self._metadata) + if not report.is_valid: + # FIXME: remove when bertsky/core#10 is merged + self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') + self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n{report.to_xml()}.\nPlease open an issue at {self._metadata['git_url']}.") return self._metadata @property - def version(self): + def version(self) -> str: """the version of the package""" if hasattr(self, '_version'): return self._version @@ -87,7 +94,7 @@ def version(self): return self._version @property - def executable(self): + def executable(self) -> str: """the executable name of this processor tool""" if hasattr(self, '_executable'): return self._executable @@ -95,7 +102,7 @@ def executable(self): return self._executable @property - def ocrd_tool(self): + def ocrd_tool(self) -> dict: """the ocrd-tool.json dict of this processor tool""" if hasattr(self, '_ocrd_tool'): return self._ocrd_tool From 346f166737bf8d90aeeecccf0075101ee333752a Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 20 Aug 2024 17:00:48 +0200 Subject: [PATCH 090/228] update spec and ocrd_tool.schema.yml --- repo/spec | 2 +- requirements.txt | 2 +- src/ocrd_validators/ocrd_tool.schema.yml | 13 +++++++++++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/repo/spec b/repo/spec index cb1ba2e72b..df2a07e3fd 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit cb1ba2e72bd176f1a1076eea38d6438c647e68e7 +Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 diff --git 
a/requirements.txt b/requirements.txt index 3d053075a9..e78c186618 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ gdown httpx>=0.22.0 importlib_metadata ; python_version < '3.8' importlib_resources ; python_version < '3.10' -jsonschema > 4 +jsonschema>=4 lxml memory-profiler >= 0.58.0 # XXX explicitly do not restrict the numpy version because different diff --git a/src/ocrd_validators/ocrd_tool.schema.yml b/src/ocrd_validators/ocrd_tool.schema.yml index 5de65a04ee..bdf834b6a6 100644 --- a/src/ocrd_validators/ocrd_tool.schema.yml +++ b/src/ocrd_validators/ocrd_tool.schema.yml @@ -108,6 +108,12 @@ properties: maximum: type: number description: Maximum value for number parameters, including the maximum + minProperties: + type: number + description: Minimum number of properties of an object + maxProperties: + type: number + description: Maximum number of properties of an object exclusiveMinimum: type: number description: Minimum value for number parameters, excluding the minimum @@ -121,8 +127,11 @@ properties: type: object description: Describe the properties of an object value additionalProperties: - type: boolean - description: Whether an object value may contain properties not explicitly defined + oneOf: + - type: boolean + description: Whether an object value may contain properties not explicitly defined + - type: object + description: Schema any additional properties need to adhere to required: type: boolean description: Whether this parameter is required From 577baa529103de170f6b1259ae6b161b281f475c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 00:06:14 +0200 Subject: [PATCH 091/228] processor parameter decorator: no '{}' default (unnecessary) --- src/ocrd/decorators/parameter_option.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 55abbc2a53..2f8be3d868 100644 --- a/src/ocrd/decorators/parameter_option.py 
+++ b/src/ocrd/decorators/parameter_option.py @@ -10,7 +10,7 @@ def _handle_param_option(ctx, param, value): parameter_option = option('-p', '--parameter', help="Parameters, either JSON string or path to JSON file", multiple=True, - default=['{}'], + default=[], # now handled in ocrd_cli_wrap_processor to resolve processor preset files # callback=_handle_param_option callback=lambda ctx, param, kv: list(kv)) From f00ecda84717d274126a736b67e3ab29e5bae83d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 00:07:01 +0200 Subject: [PATCH 092/228] =?UTF-8?q?Processor:=20add=20error=20handling?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - introduce `config.OCRD_MISSING_OUTPUT` and catch exceptions during `process_page_file`: - `ABORT`: re-raise - `SKIP`: ignore and continue with next page - `COPY`: ignore and provide input PAGE-XML as output (with just `@PcGtsId` and `Metadata` added to simulate the processing step) - introduce `config.OCRD_EXISTING_OUTPUT`: - `ABORT`: re-raise FileExistsError - `SKIP`: ignore and continue with next page - `OVERWRITE`: force overwriting the exact output files (instead of removing output files indiscriminately) - :fire: remove `Workspace.overwrite_mode`, have `--overwrite` merely delegate to `config.OCRD_EXISTING_OUTPUT=OVERWRITE` - introduce `--debug`, just delegate to `config.OCRD_MISSING_OUTPUT=ABORT` - `cli.bashlib.input-files`: delegate everything to `ocrd_cli_wrap_processor` (for CLI handling) and `process_workspace` (for error handling), but override `process_page_file` to (never fail and) print bash-friendly strings for actual processing - update tests, add `test_processor.test_run_output_missing` covering all `OCRD_MISSING_OUTPUT` options and the newly `OCRD_EXISTING_OUTPUT=SKIP` --- src/ocrd/cli/__init__.py | 4 + src/ocrd/cli/bashlib.py | 66 +++++++------- src/ocrd/decorators/__init__.py | 22 +---- src/ocrd/decorators/ocrd_cli_options.py | 
1 + src/ocrd/processor/base.py | 89 +++++++++++++++---- src/ocrd/processor/builtin/dummy_processor.py | 13 +-- src/ocrd/processor/helpers.py | 8 +- src/ocrd/workspace.py | 14 +-- src/ocrd_utils/config.py | 12 +++ tests/cli/test_bashlib.py | 2 +- tests/data/__init__.py | 39 +++++++- tests/processor/test_processor.py | 37 +++++++- tests/test_workspace.py | 23 +++-- 13 files changed, 224 insertions(+), 106 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 418d7927a3..3722e3c21e 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -33,6 +33,10 @@ \b {config.describe('OCRD_MISSING_INPUT')} \b +{config.describe('OCRD_MISSING_OUTPUT')} +\b +{config.describe('OCRD_EXISTING_OUTPUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index 2c57bb412a..26139cb48f 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -20,13 +20,16 @@ from ocrd.decorators import ( parameter_option, parameter_override_option, - ocrd_loglevel + ocrd_loglevel, + ocrd_cli_wrap_processor ) from ocrd_utils import ( is_local_filename, get_local_filename, initLogging, - make_file_id + getLogger, + make_file_id, + config ) from ocrd.resolver import Resolver from ocrd.processor import Processor @@ -81,11 +84,15 @@ def bashlib_constants(name): @bashlib_cli.command('input-files') @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME) @click.option('-w', '--working-dir', help="Working Directory") -@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT') -@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT') +@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None) +@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None) # repeat some other processor options 
for convenience (will be ignored here) @click.option('-g', '--page-id', help="ID(s) of the pages to process") -@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist") +@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist\n" + "(with '--page-id', remove only those).\n" + "Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE") +@click.option('--debug', is_flag=True, default=False, help="Abort on any errors with full stack trace.\n" + "Short-hand for OCRD_MISSING_OUTPUT=ABORT") @parameter_option @parameter_override_option @ocrd_loglevel @@ -100,37 +107,26 @@ def bashlib_input_files(**kwargs): (The printing format is one associative array initializer per line.) """ - initLogging() - mets = kwargs.pop('mets') - working_dir = kwargs.pop('working_dir') - if is_local_filename(mets) and not isfile(get_local_filename(mets)): - msg = "File does not exist: %s" % mets - raise FileNotFoundError(msg) - resolver = Resolver() - workspace = resolver.workspace_from_url(mets, working_dir) class BashlibProcessor(Processor): @property def ocrd_tool(self): - return {} + return {'executable': '', 'steps': ['']} @property - def executable(self): - return '' - processor = BashlibProcessor(None) - # go half way of the normal run_processor / process_workspace call tree - processor.workspace = workspace - processor.page_id = kwargs['page_id'] - processor.input_file_grp = kwargs['input_file_grp'] - processor.output_file_grp = kwargs['output_file_grp'] - for input_files in processor.zip_input_files(mimetype=None, on_error='abort'): - # ensure all input files exist locally (without persisting them in the METS) - # - this mimics the default behaviour of all Pythonic processors - input_files = [workspace.download_file(input_file) if input_file else None - for input_file in input_files] - for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']: - # make this bash-friendly (show 
initialization for associative array) - if len(input_files) > 1: - # single quotes allow us to preserve the list value inside the alist - print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ') - else: - print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ') - print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp'])) + def version(self): + return '1.0' + # go half way of the normal run_processor / process_workspace call tree + # by just delegating to process_workspace, overriding process_page_file + # to ensure all input files exist locally (without persisting them in the METS) + # and print what needs to be acted on in bash-friendly way + def process_page_file(self, *input_files): + for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']: + # make this bash-friendly (show initialization for associative array) + if len(input_files) > 1: + # single quotes allow us to preserve the list value inside the alist + value = ' '.join(str(getattr(res, field)) for res in input_files) + else: + value = str(getattr(input_files[0], field)) + print(f"[{field}]='{value}'", end=' ') + output_file_id = make_file_id(input_files[0], kwargs['output_file_grp']) + print(f"[outputFileId]='{output_file_id}'") + ocrd_cli_wrap_processor(BashlibProcessor, **kwargs) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index d9d1fb69dd..364ef4c847 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor( profile_file=None, version=False, overwrite=False, + debug=False, resolve_resource=None, show_resource=None, list_resources=False, @@ -117,25 +118,10 @@ def resolve(name): resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url) workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url) page_id = kwargs.get('page_id') - # XXX not possible while 
processors do not adhere to # https://github.com/OCR-D/core/issues/505 - # if overwrite - # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: - # raise Exception("--overwrite requires --output-file-grp") - # LOG.info("Removing files because of --overwrite") - # for grp in kwargs['output_file_grp'].split(','): - # if page_id: - # for one_page_id in kwargs['page_id'].split(','): - # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) - # for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): - # workspace.remove_file(file, force=True, keep_file=False, page_recursive=True) - # else: - # LOG.debug("Removing all files in output file group %s ", grp) - # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) - # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) - # workspace.save_mets() - # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace if overwrite: - workspace.overwrite_mode = True + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + if debug: + config.OCRD_MISSING_OUTPUT = 'ABORT' report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) if not report.is_valid: raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index e640a20032..e069b3ea81 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -33,6 +33,7 @@ def cli(mets_url): option('-O', '--output-file-grp', default=None), option('-g', '--page-id'), option('--overwrite', is_flag=True, default=False), + option('--debug', is_flag=True, default=False), option('--profile', is_flag=True, default=False), option('--profile-file', 
type=Path(dir_okay=False, writable=True)), parameter_option, diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index fddf6383a7..0ec2711f61 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -21,6 +21,7 @@ import tarfile import io from deprecated import deprecated +from requests import HTTPError from ocrd.workspace import Workspace from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile @@ -317,16 +318,11 @@ def process_workspace(self, workspace: Workspace) -> None: try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - # FIXME: add error handling by catching exceptions in various ways (#579) - # for example: - # - ResourceNotFoundError → use ResourceManager to download (once), then retry - # - transient (I/O or OOM) error → maybe sleep, retry - # - persistent (data) error → skip / dummy / raise input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) - log.info("processing page %s", - next(input_file.pageId - for input_file in input_file_tuple - if input_file)) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if input_file) + log.info(f"processing page {page_id}") for i, input_file in enumerate(input_file_tuple): if input_file is None: # file/page not found in this file grp @@ -336,14 +332,71 @@ def process_workspace(self, workspace: Workspace) -> None: continue try: input_files[i] = self.workspace.download_file(input_file) - except ValueError as e: + except (ValueError, FileNotFoundError, HTTPError) as e: log.error(repr(e)) - log.warning(f"failed downloading file {input_file} for page {input_file.pageId}") - self.process_page_file(*input_files) + log.warning(f"failed downloading file {input_file} for page {page_id}") + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), 
then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self.process_page_file(*input_files) + except Exception as err: + # we have to be broad here, but want to exclude NotImplementedError + if isinstance(err, NotImplementedError): + raise err + if isinstance(err, FileExistsError): + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + continue + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # FIXME: re-usable/actionable logging + log.exception(f"Failure on page {page_id}: {err}") + if config.OCRD_MISSING_OUTPUT == 'ABORT': + raise err + if config.OCRD_MISSING_OUTPUT == 'SKIP': + continue + if config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") except NotImplementedError: # fall back to deprecated method self.process() + def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> None: + """ + Copy the given ``input_file`` of the :py:attr:`workspace`, + representing one physical page (passed as one opened + :py:class:`~ocrd_models.OcrdFile` per input fileGrp) + and add it as if it was a processing result. 
+ """ + log = getLogger('ocrd.processor.base') + input_pcgts : OcrdPage + assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) + log.debug(f"parsing file {input_file.ID} for page {input_file.pageId}") + try: + input_pcgts = page_from_file(input_file) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + log.error(f"non-PAGE input for page {input_file.pageId}: {err}") + return + output_file_id = make_file_id(input_file, self.output_file_grp) + input_pcgts.set_pcGtsId(output_file_id) + self.add_metadata(input_pcgts) + self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(input_pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: """ Process the given ``input_files`` of the :py:attr:`workspace`, @@ -366,9 +419,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc page_ = page_from_file(input_file) assert isinstance(page_, OcrdPage) input_pcgts[i] = page_ - except ValueError as e: + except ValueError as err: # not PAGE and not an image to generate PAGE for - log.info("non-PAGE input for page %s: %s", page_id, e) + log.error("non-PAGE input for page %s: %s", page_id, err) output_file_id = make_file_id(input_files[0], self.output_file_grp) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image_result in result.images: @@ -380,7 +433,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc image_file_id, self.output_file_grp, page_id=page_id, - file_path=image_file_path) + file_path=image_file_path, + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) self.workspace.add_file(file_id=output_file_id, 
@@ -388,7 +443,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(result.pcgts)) + content=to_xml(result.pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """ diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 1b3f7a5aa0..f8890274ae 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -15,7 +15,8 @@ MIME_TO_EXT, MIMETYPE_PAGE, parse_json_string_with_comments, - resource_string + resource_string, + config ) from ocrd_modelfactory import page_from_file @@ -43,14 +44,15 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr local_filename = join(self.output_file_grp, file_id + ext) LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) with open(input_file.local_filename, 'rb') as f: - content = f.read() output_file = self.workspace.add_file( file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=local_filename, - content=content) + content=f.read(), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) assert isinstance(pcgts, OcrdPage) @@ -63,8 +65,9 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr page_id=input_file.pageId, local_filename=join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - + content=to_xml(pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) else: if self.parameter['copy_files']: LOG.info("Not copying %s because it is a PAGE-XML file, which gets 
identity-transformed", input_file.local_filename) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index dff14cfca6..08ca0a4683 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -160,6 +160,7 @@ def run_cli( workspace=None, page_id=None, overwrite=None, + debug=None, log_level=None, log_filename=None, input_file_grp=None, @@ -202,6 +203,8 @@ def run_cli( args += ['--parameter', parameter] if overwrite: args += ['--overwrite'] + if debug: + args += ['--debug'] if mets_server_url: args += ['--mets-server-url', mets_server_url] log = getLogger('ocrd.processor.helpers.run_cli') @@ -270,7 +273,10 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) -O, --output-file-grp USE File group(s) used as output -g, --page-id ID Physical page ID(s) to process instead of full document [] --overwrite Remove existing output pages/images - (with "--page-id", remove only those) + (with "--page-id", remove only those). + Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE + --debug Abort on any errors with full stack trace. + Short-hand for OCRD_MISSING_OUTPUT=ABORT --profile Enable profiling --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile" -p, --parameter JSON-PATH Parameters, either verbatim JSON string diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 509b8123b9..2f94913ed7 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -42,7 +42,8 @@ MIME_TO_EXT, MIME_TO_PIL, MIMETYPE_PAGE, - REGEX_PREFIX + REGEX_PREFIX, + config ) from .workspace_backup import WorkspaceBackupManager @@ -75,7 +76,6 @@ class Workspace(): `OcrdMets` of this workspace. If `None`, then the METS will be read from and written to the filesystem directly. baseurl (string, None) : Base URL to prefix to relative URL. 
- overwrite_mode (boolean, False) : Whether to force add operations on this workspace globally """ def __init__( @@ -91,7 +91,6 @@ def __init__( self.resolver = resolver self.directory = directory self.mets_target = str(Path(directory, mets_basename)) - self.overwrite_mode = False self.is_remote = bool(mets_server_url) if mets is None: if self.is_remote: @@ -243,8 +242,6 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals """ log = getLogger('ocrd.workspace.remove_file') log.debug('Deleting mets:file %s', file_id) - if self.overwrite_mode: - force = True if isinstance(file_id, OcrdFile): file_id = file_id.ID try: @@ -296,9 +293,6 @@ def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_same_group (boolean): Remove only images in the same file group as the PAGE-XML. Has no effect unless ``page_recursive`` is `True`. """ - if not force and self.overwrite_mode: - force = True - if (not USE.startswith(REGEX_PREFIX)) and (USE not in self.mets.file_groups) and (not force): raise Exception("No such fileGrp: %s" % USE) @@ -419,8 +413,6 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.") if content is not None and not kwargs.get('local_filename'): raise Exception("'content' was set but no 'local_filename'") - if self.overwrite_mode: - kwargs['force'] = True with pushd_popd(self.directory): if kwargs.get('local_filename'): @@ -1101,8 +1093,6 @@ def save_image_file(self, image : Image.Image, The (absolute) path of the created file. 
""" log = getLogger('ocrd.workspace.save_image_file') - if self.overwrite_mode: - force = True saveargs = dict() if 'dpi' in image.info: saveargs['dpi'] = image.info['dpi'] diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 11af20249f..fa4c34d63b 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -155,6 +155,18 @@ def _ocrd_download_timeout_parser(val): validator=lambda val: val in ['SKIP', 'ABORT'], parser=str) +config.add("OCRD_MISSING_OUTPUT", + description="How to deal with missing output files (for some fileGrp/pageId) during processing [SKIP|COPY|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], + parser=str) + +config.add("OCRD_EXISTING_OUTPUT", + description="How to deal with already existing output files (for some fileGrp/pageId) during processing [SKIP|OVERWRITE|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], + parser=str) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index 15af493502..b1ab68c7fc 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -98,7 +98,7 @@ def test_constants_fail(self): def test_input_files(self): with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: with pushd_popd(wsdir): - _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG']) + _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG', '-O', 'OUTPUT']) assert ("[url]='' [local_filename]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' " "[pageId]='PHYS_0017' [outputFileId]='OUTPUT_PHYS_0017'") in out diff --git a/tests/data/__init__.py b/tests/data/__init__.py index e7ef30fc2b..53fa227d01 100644 --- a/tests/data/__init__.py +++ 
b/tests/data/__init__.py @@ -1,8 +1,9 @@ import json import os +import re from pytest import warns from ocrd import Processor -from ocrd_utils import make_file_id +from ocrd_utils import make_file_id, config DUMMY_TOOL = { 'executable': 'ocrd-test', @@ -94,7 +95,41 @@ def process(self): page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), - content='CONTENT') + content='CONTENT', + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + +class DummyProcessorWithOutputFailures(Processor): + @property + def ocrd_tool(self): + return DUMMY_TOOL + + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + super().__init__(*args, **kwargs) + + # no error handling with old process(), so override new API + def process_page_file(self, input_file): + n = int(re.findall(r'\d+', input_file.pageId)[-1]) + if n % 2: + raise Exception(f"intermittent failure on page {input_file.pageId}") + output_file_id = make_file_id(input_file, self.output_file_grp) + self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id), + mimetype=input_file.mimetype, + content='CONTENT', + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) class IncompleteProcessor(Processor): @property diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index aa2124001a..064142574e 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -5,7 +5,13 @@ from pathlib import Path from os import environ from tests.base import CapturingTestCase as TestCase, assets, main, copy_of_directory # pylint: disable=import-error, no-name-in-module -from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, 
IncompleteProcessor +from tests.data import ( + DummyProcessor, + DummyProcessorWithRequiredParameters, + DummyProcessorWithOutput, + DummyProcessorWithOutputFailures, + IncompleteProcessor +) from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver @@ -145,20 +151,43 @@ def test_run_output0(self): output_file_grp="OCR-D-OUT") assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 2 + def test_run_output_missing(self): + ws = self.workspace + from ocrd_utils import config + config.OCRD_MISSING_OUTPUT = 'SKIP' + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + # only half succeed + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) // 2 + config.OCRD_MISSING_OUTPUT = 'ABORT' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert "intermittent" in str(exc.value) + config.OCRD_MISSING_OUTPUT = 'COPY' + config.OCRD_EXISTING_OUTPUT = 'SKIP' + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') - ws.overwrite_mode = True + from ocrd_utils import config + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') - ws.overwrite_mode = False + config.OCRD_EXISTING_OUTPUT = 'ABORT' with 
pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists" - ws.overwrite_mode = True + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 2fe5f450a0..1ae007ae52 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -270,9 +270,9 @@ def test_remove_file_force(sbb_data_workspace): # TODO check semantics - can a non-existent thing be removed? assert not sbb_data_workspace.remove_file('non-existing-id', force=True) - # should also succeed - sbb_data_workspace.overwrite_mode = True - assert not sbb_data_workspace.remove_file('non-existing-id', force=False) + with pytest.raises(FileNotFoundError) as not_found_exc: + sbb_data_workspace.remove_file('non-existing-id', force=False) + assert "not found in METS" in str(not_found_exc.value) def test_remove_file_remote_not_available_raises_exception(plain_workspace): @@ -292,9 +292,9 @@ def test_remove_file_remote(plain_workspace): assert plain_workspace.remove_file('page1_img', force=True) # TODO check returned value - # should also "succeed", because overwrite_mode is set which also sets 'force' to 'True' - plain_workspace.overwrite_mode = True - assert not plain_workspace.remove_file('page1_img') + with pytest.raises(FileNotFoundError) as not_found_exc: + plain_workspace.remove_file('page1_img') + assert "not found in METS" in str(not_found_exc.value) def test_rename_file_group(tmp_path): @@ -341,9 +341,6 @@ def test_remove_file_group_force(sbb_data_workspace): # check function and tests semantics # should succeed assert not sbb_data_workspace.remove_file_group('I DO NOT EXIST', force=True) - # should also succeed - sbb_data_workspace.overwrite_mode = True - assert not 
sbb_data_workspace.remove_file_group('I DO NOT EXIST', force=False) def test_remove_file_group_rmdir(sbb_data_tmp, sbb_data_workspace): @@ -432,9 +429,11 @@ def test_save_image_file(plain_workspace): assert exists(join(plain_workspace.directory, 'IMG', 'page1_img.jpg')) # should succeed assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg', force=True) - # should also succeed - plain_workspace.overwrite_mode = True - assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') + # should fail + with pytest.raises(FileExistsError) as exists_exc: + plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') + assert "neither force nor ignore are set" in str(exists_exc.value) + # check file_path kwarg assert plain_workspace.save_image_file(img, 'page1_img2', 'IMG', page_id='page1', file_path='IMG/page1_img2.png') assert exists(join(plain_workspace.directory, 'IMG', 'page1_img2.png')) From fdd5d168d0753caad8a19efd49884c52d7934183 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 18:01:31 +0200 Subject: [PATCH 093/228] ocrd_utils.config: add variables to module docstring --- src/ocrd_utils/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index fa4c34d63b..28f95b2162 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -62,7 +62,11 @@ def __init__(self): self._variables = {} def add(self, name, *args, **kwargs): - self._variables[name] = OcrdEnvVariable(name, *args, **kwargs) + var = OcrdEnvVariable(name, *args, **kwargs) + # make visible in ocrd_utils.config docstring (apidoc) + txt = var.describe(wrap_text=False, indent_text=True) + globals()['__doc__'] += "\n\n - " + txt + "\n\n" + self._variables[name] = var return self._variables[name] def has_default(self, name): From 6d87f9e6494a0768541e5b18ae557fd594d8319b Mon Sep 
17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 18:02:36 +0200 Subject: [PATCH 094/228] improve docstrings, re-generate docs --- .../ocrd/ocrd.processor.ocrd_page_result.rst | 7 ++ docs/api/ocrd/ocrd.processor.rst | 1 + src/ocrd/cli/validate.py | 4 +- src/ocrd/cli/workspace.py | 1 + src/ocrd/processor/base.py | 75 ++++++++++--------- src/ocrd/workspace.py | 2 - src/ocrd_models/ocrd_exif.py | 1 + src/ocrd_models/ocrd_mets.py | 6 +- src/ocrd_utils/config.py | 28 ++++++- 9 files changed, 80 insertions(+), 45 deletions(-) create mode 100644 docs/api/ocrd/ocrd.processor.ocrd_page_result.rst diff --git a/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst b/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst new file mode 100644 index 0000000000..e13d50e155 --- /dev/null +++ b/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst @@ -0,0 +1,7 @@ +ocrd.processor.ocrd\_page\_result module +======================================== + +.. automodule:: ocrd.processor.ocrd_page_result + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd/ocrd.processor.rst b/docs/api/ocrd/ocrd.processor.rst index 801114d2a3..7507d8439b 100644 --- a/docs/api/ocrd/ocrd.processor.rst +++ b/docs/api/ocrd/ocrd.processor.rst @@ -22,3 +22,4 @@ Submodules ocrd.processor.base ocrd.processor.helpers + ocrd.processor.ocrd_page_result diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py index b26803d053..61d26988a4 100644 --- a/src/ocrd/cli/validate.py +++ b/src/ocrd/cli/validate.py @@ -40,7 +40,7 @@ def validate_cli(): @click.argument('ocrd_tool', required=False, nargs=1) def validate_ocrd_tool(ocrd_tool): ''' - Validate OCRD_TOOL as an ocrd-tool.json file. + Validate OCRD_TOOL as an `ocrd-tool.json` file. 
''' if not ocrd_tool: ocrd_tool = 'ocrd-tool.json' @@ -107,7 +107,7 @@ def validate_page(page, **kwargs): @click.argument('tasks', nargs=-1, required=True) def validate_process(tasks, workspace, mets_basename, overwrite, page_id): ''' - Validate a sequence of tasks passable to 'ocrd process' + Validate a sequence of tasks passable to `ocrd process` ''' if workspace: _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 0c70fd3a36..e2186a727c 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -308,6 +308,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\ } | ocrd workspace bulk-add -r '(?P.*) (?P.*) (?P.*) (?P.*)' \\ -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' - + """ log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name workspace = Workspace( diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 0ec2711f61..d53c3da0bf 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -144,22 +144,21 @@ def __init__( version=None ): """ - Instantiate, but do not process. Unless ``list_resources`` or - ``show_resource`` or ``show_help`` or ``show_version`` or - ``dump_json`` or ``dump_module_dir`` is true, setup for processing - (parsing and validating parameters, entering the workspace directory). + Instantiate, but do not setup (neither for processing nor other usage). + If given, do parse and validate :py:data:`.parameter`. Args: workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \ + If not ``None``, then `chdir` to that directory. Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. Keyword Args: parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. 
\ Can be ``None`` even for processing, but then needs to be set before running. - input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. \ + input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input. \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. - output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. \ + output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output. \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. page_id (string): comma-separated list of METS physical ``page`` IDs to process \ @@ -287,29 +286,32 @@ def setup(self) -> None: """ pass - @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') + @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()') def process(self) -> None: """ - Process all files of the :py:attr:`workspace` - from the given :py:attr:`input_file_grp` - to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` (or all pages) - under the given :py:attr:`parameter`. + Process all files of the :py:data:`workspace` + from the given :py:data:`input_file_grp` + to the given :py:data:`output_file_grp` + for the given :py:data:`page_id` (or all pages) + under the given :py:data:`parameter`. - (This contains the main functionality and needs to be overridden by subclasses.) + (This contains the main functionality and needs to be + overridden by subclasses.) """ raise NotImplementedError() def process_workspace(self, workspace: Workspace) -> None: """ Process all files of the given ``workspace``, - from the given :py:attr:`input_file_grp` - to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` (or all pages) - under the given :py:attr:`parameter`. 
+ from the given :py:data:`input_file_grp` + to the given :py:data:`output_file_grp` + for the given :py:data:`page_id` (or all pages) + under the given :py:data:`parameter`. (This will iterate over pages and files, calling - :py:meth:`process_page`, handling exceptions.) + :py:meth:`.process_page_file` and handling exceptions. + It should be overridden by subclasses to handle cases + like post-processing or computation across pages.) """ log = getLogger('ocrd.processor.base') with pushd_popd(workspace.directory): @@ -370,7 +372,7 @@ def process_workspace(self, workspace: Workspace) -> None: def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> None: """ - Copy the given ``input_file`` of the :py:attr:`workspace`, + Copy the given ``input_file`` of the :py:data:`workspace`, representing one physical page (passed as one opened :py:class:`~ocrd_models.OcrdFile` per input fileGrp) and add it as if it was a processing result. @@ -399,14 +401,14 @@ def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> N def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: """ - Process the given ``input_files`` of the :py:attr:`workspace`, + Process the given ``input_files`` of the :py:data:`workspace`, representing one physical page (passed as one opened - :py:class:`~ocrd_models.OcrdFile` per input fileGrp) - under the given :py:attr:`parameter`, and make sure the + :py:class:`.OcrdFile` per input fileGrp) + under the given :py:data:`.parameter`, and make sure the results get added accordingly. - (This uses process_page_pcgts, but can be overridden by subclasses - to handle cases like multiple fileGrps, non-PAGE input etc.) + (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses + to handle cases like multiple output fileGrps, non-PAGE input etc.) 
""" log = getLogger('ocrd.processor.base') input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) @@ -449,28 +451,28 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """ - Process the given ``input_pcgts`` of the :py:attr:`workspace`, + Process the given ``input_pcgts`` of the :py:data:`.workspace`, representing one physical page (passed as one parsed - :py:class:`~ocrd_models.OcrdPage` per input fileGrp) - under the given :py:attr:`parameter`, and return the - resulting :py:class:`~ocrd.processor.OcrdPageResult`. + :py:class:`.OcrdPage` per input fileGrp) + under the given :py:data:`.parameter`, and return the + resulting :py:class:`.OcrdPageResult`. Optionally, add to the ``images`` attribute of the resulting - :py:class:`~ocrd.processor.OcrdPageResult` instances - of :py:class:`~ocrd.processor.OcrdPageResultImage`, + :py:class:`.OcrdPageResult` instances of :py:class:`.OcrdPageResultImage`, which have required fields for ``pil`` (:py:class:`PIL.Image` image data), ``file_id_suffix`` (used for generating IDs of the saved image) and ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType` for setting the filename of the saved image). - (This contains the main functionality and must be overridden by subclasses.) + (This contains the main functionality and must be overridden by subclasses, + unless it does not get called by some overriden :py:meth:`.process_page_file`.) """ raise NotImplementedError() def add_metadata(self, pcgts: OcrdPage) -> None: """ Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing - the processing step and runtime parameters to :py:class:`~ocrd_models.OcrdPage` ``pcgts``. + the processing step and runtime parameters to :py:class:`.OcrdPage` ``pcgts``. 
""" metadata_obj = pcgts.get_Metadata() assert metadata_obj is not None @@ -496,7 +498,7 @@ def add_metadata(self, pcgts: OcrdPage) -> None: def resolve_resource(self, val): """ Resolve a resource name to an absolute file path with the algorithm in - https://ocr-d.de/en/spec/ocrd_tool#file-parameters + `spec `_ Args: val (string): resource value to resolve @@ -522,7 +524,7 @@ def resolve_resource(self, val): def show_resource(self, val): """ Resolve a resource name to a file path with the algorithm in - https://ocr-d.de/en/spec/ocrd_tool#file-parameters, + `spec `_, then print its contents to stdout. Args: @@ -593,7 +595,8 @@ def input_files(self): files for that page) - Otherwise raise an error (complaining that only PAGE-XML warrants having multiple images for a single page) - Algorithm _ + + See `algorithm `_ Returns: A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects. @@ -635,11 +638,13 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): - if ``last``, then the last matching file for the page will be silently selected (as if the last was the only match) - if ``abort``, then an exception will be raised. + Multiple matches for PAGE-XML will always raise an exception. Keyword Args: require_first (boolean): If true, then skip a page entirely whenever it is not available in the first input `fileGrp`. + on_error (string): How to handle multiple file matches per page. mimetype (string): If not `None`, filter by the specified MIME type (literal or regex prefixed by `//`). Otherwise prefer PAGE or image. diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 2f94913ed7..3523d9f15f 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -606,7 +606,6 @@ def image_from_page(self, page, page_id, Cropping uses a polygon mask (not just the bounding box rectangle). 
Areas outside the polygon will be filled according to ``fill``: - \b - if `"background"` (the default), then fill with the median color of the image; - else if `"none"`, then avoid masking polygons where possible @@ -850,7 +849,6 @@ def image_from_segment(self, segment, parent_image, parent_coords, Cropping uses a polygon mask (not just the bounding box rectangle). Areas outside the polygon will be filled according to `fill`: - \b - if `"background"` (the default), then fill with the median color of the image; - else if `"none"`, then avoid masking polygons where possible diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index 406e60a85a..82b8b7e1c3 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -21,6 +21,7 @@ class OcrdExif(): * ``RGB`` for 24-bit truecolor, * ``I`` for 32-bit signed integer grayscale, * ``F`` for floating-point grayscale + (see PIL concept **mode**) resolution (int): pixel density xResolution (int): pixel density diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index d6da3e1cda..4d1e6cba58 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -194,7 +194,7 @@ def unique_identifier(self, purl : str) -> None: @property def agents(self) -> List[OcrdAgent]: """ - List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent`s + List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent` entries. """ return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] @@ -218,7 +218,7 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent: @property def file_groups(self) -> List[str]: """ - List the `@USE` of all `mets:fileGrp` entries. + List the ``@USE`` of all ``mets:fileGrp`` entries. """ # WARNING: Actually we cannot return strings in place of elements! @@ -894,7 +894,7 @@ def merge(self, other_mets, force : bool = False, Add all files from other_mets. 
Accepts the same kwargs as :py:func:`find_files` Keyword Args: - force (boolean): Whether to :py:meth:`add_file`s with force (overwriting existing ``mets:file``s) + force (boolean): Whether to do :py:meth:`add_file` with ``force`` (overwriting existing ``mets:file`` entries) fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 28f95b2162..851fb42a8c 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -120,9 +120,11 @@ def raw_value(self, name): description="""\ Whether to enable gathering runtime statistics on the `ocrd.profile` logger (comma-separated): + - `CPU`: yields CPU and wall-time, - `RSS`: also yields peak memory (resident set size) - `PSS`: also yields peak memory (proportional set size) + """, validator=lambda val : all(t in ('', 'CPU', 'RSS', 'PSS') for t in val.split(',')), default=(True, '')) @@ -154,19 +156,39 @@ def _ocrd_download_timeout_parser(val): parser=_parser_boolean) config.add("OCRD_MISSING_INPUT", - description="How to deal with missing input files (for some fileGrp/pageId) during processing [SKIP|ABORT]", + description="""\ +How to deal with missing input files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed with next page's input + - `ABORT`: throw :py:class:`.MissingInputFile` + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'ABORT'], parser=str) config.add("OCRD_MISSING_OUTPUT", - description="How to deal with missing output files (for some fileGrp/pageId) during processing [SKIP|COPY|ABORT]", + description="""\ +How to deal with missing output files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed processing next page + - `COPY`: fall back to copying input PAGE to output fileGrp 
for page + - `ABORT`: re-throw whatever caused processing to fail + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], parser=str) config.add("OCRD_EXISTING_OUTPUT", - description="How to deal with already existing output files (for some fileGrp/pageId) during processing [SKIP|OVERWRITE|ABORT]", + description="""\ +How to deal with already existing output files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed processing next page + - `OVERWRITE`: force writing result to output fileGrp for page + - `ABORT`: re-throw :py:class:`FileExistsError` + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], parser=str) From 9942bbe6dc42246c0a7e6eda85444aa0f745face Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:05:38 +0200 Subject: [PATCH 095/228] Processor.zip_input_files: more verbose log msg Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index d53c3da0bf..55b4619422 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -714,7 +714,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ift[i] = file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): - LOG.critical(f"Could not find any files for selected pageId {self.page_id}") + LOG.critical(f"Could not find any files for selected pageId {self.page_id}.\ncompare '{self.page_id}' with the output of 'orcd workspace list-page'.") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): From 8a584e9dcc5794baa9e08556a943bc7e9eb9991f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:07:05 +0200 
Subject: [PATCH 096/228] test_processor: test for specific exception Co-authored-by: Konstantin Baierer --- tests/processor/test_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 064142574e..c263d99fce 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -280,7 +280,7 @@ def ocrd_tool(self): assert ('foobar3', 'foobar4') in tuples tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] assert ('foobar3', None) in tuples - with self.assertRaisesRegex(Exception, "Could not determine unique input file"): + with self.assertRaisesRegex(NonUniqueInputFile, "Could not determine unique input file"): tuples = proc.zip_input_files(on_error='abort') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: @@ -289,7 +289,7 @@ def ocrd_tool(self): proc.workspace = ws proc.input_file_grp = 'GRP1,GRP2' proc.page_id = page_id - with self.assertRaisesRegex(Exception, "Could not determine unique input file"): + with self.assertRaisesRegex(NonUniqueInputFile, "Could not determine unique input file"): tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): From 8077d45056c9d2682bee5bb5017f79eb0a7b336a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 19:54:26 +0200 Subject: [PATCH 097/228] test_processor: fix missing import --- tests/processor/test_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index c263d99fce..0cbae7d548 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -15,7 +15,7 @@ from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver -from ocrd.processor.base import Processor, run_processor, run_cli 
+from ocrd.processor import Processor, run_processor, run_cli, NonUniqueInputFile from unittest import mock import pytest From cf7b193fe52477448897a0877188e188ba3f2f9a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 20:24:08 +0200 Subject: [PATCH 098/228] OcrdPage: fix typeing typo --- src/ocrd_models/ocrd_page.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index b28777e72d..87e644fd90 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -2,6 +2,7 @@ API to PAGE-XML, generated with generateDS from XML schema. """ from io import StringIO +from typing import Dict from inspect import getmembers from lxml import etree as ET @@ -187,8 +188,8 @@ def __init__( self, pcgts : PcGtsType, etree : ET._Element, - mapping : dict[str, ET._Element], - revmap : dict[ET._Element, str], + mapping : Dict[str, ET._Element], + revmap : Dict[ET._Element, str], ): self._pcgts = pcgts self.etree = etree From 9af8670dbbcdd06addde68a29c9b7a91f7f1a0c9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 20:40:40 +0200 Subject: [PATCH 099/228] dummy_processor: fix typos from logging --- src/ocrd/processor/builtin/dummy_processor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 1e3f52ebe9..c2f0eec4f7 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -41,7 +41,7 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr file_id = make_file_id(input_file, self.output_file_grp) ext = MIME_TO_EXT.get(input_file.mimetype, '') local_filename = join(self.output_file_grp, file_id + ext) - LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) + self.logger.info("cp %s %s # %s -> %s", input_file.url, 
local_filename, input_file.ID, file_id) with open(input_file.local_filename, 'rb') as f: output_file = self.workspace.add_file( file_id=file_id, @@ -69,9 +69,9 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr ) else: if self.parameter['copy_files']: - LOG.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) + self.logger.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) else: - LOG.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", input_file.local_filename) + self.logger.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", input_file.local_filename) # we can rely on base implementation verbatim super().process_page_file(input_file) From c6d9736b1ecbb2041c7686873568c50f09360fe6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 22:28:29 +0200 Subject: [PATCH 100/228] tests report.is_valid: improve output on failure --- tests/cli/test_validate.py | 23 +++++++++---------- tests/validator/test_json_validator.py | 6 ++--- tests/validator/test_ocrd_tool_validator.py | 4 ++-- tests/validator/test_parameter_validator.py | 2 +- .../validator/test_resource_list_validator.py | 3 +-- tests/validator/test_xsd_validator.py | 8 +++---- 6 files changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index 0682ea7a01..12e87f4dc9 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, out + err) # relative path with pushd_popd(tempdir): - code, _, err = 
self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, out + err) # default path with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, out + err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, out + err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,19 +84,18 @@ def test_validate_page(self): def test_validate_tasks(self): # simple - code, _, err = self.invoke_cli(validate_cli, ['tasks', + code, out, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0, err) + self.assertEqual(code, 0, out + err) # with workspace code, out, err = self.invoke_cli(validate_cli, ['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'), "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) - print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0, err) + self.assertEqual(code, 0, out + err) if __name__ == '__main__': diff --git 
a/tests/validator/test_json_validator.py b/tests/validator/test_json_validator.py index 25771b701f..d81c894f97 100644 --- a/tests/validator/test_json_validator.py +++ b/tests/validator/test_json_validator.py @@ -20,18 +20,18 @@ def setUp(self): def test_validate_string(self): report = JsonValidator.validate('{}', {}) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_defaults_set(self): obj = {'bar': 2000} report = self.defaults_validator._validate(obj) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) self.assertEqual(obj, {'foo': 3000, 'bar': 2000}) def test_properr(self): obj = {'bar': 100, 'quux': {}} report = self.defaults_validator._validate(obj) - self.assertFalse(report.is_valid) + self.assertFalse(report.is_valid, str(report.to_xml())) self.assertEqual(len(report.errors), 1) diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 2d035757ed..df19e8e64c 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertTrue(report.is_valid, str(report.errors)) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertTrue(report.is_valid, str(report.errors)) + self.assertTrue(report.is_valid, str(report.to_xml())) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): diff --git a/tests/validator/test_parameter_validator.py b/tests/validator/test_parameter_validator.py index 
f0d9d41d2c..297a149064 100644 --- a/tests/validator/test_parameter_validator.py +++ b/tests/validator/test_parameter_validator.py @@ -42,7 +42,7 @@ def test_default_assignment(self): }) obj = {'baz': '23'} report = validator.validate(obj) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) self.assertEqual(obj, {'baz': '23', "num-param": 1}) def test_min_max(): diff --git a/tests/validator/test_resource_list_validator.py b/tests/validator/test_resource_list_validator.py index eb95d9b1ea..cc63c30ea7 100644 --- a/tests/validator/test_resource_list_validator.py +++ b/tests/validator/test_resource_list_validator.py @@ -22,8 +22,7 @@ def reslist(): def test_resource_list_validator(reslist): report = OcrdResourceListValidator.validate(reslist) - print(report.errors) - assert report.is_valid == True + assert report.is_valid, str(report.to_xml()) if __name__ == '__main__': main(__file__) diff --git a/tests/validator/test_xsd_validator.py b/tests/validator/test_xsd_validator.py index d0150338dd..50b3851ffc 100644 --- a/tests/validator/test_xsd_validator.py +++ b/tests/validator/test_xsd_validator.py @@ -37,22 +37,22 @@ def test_mets_empty(self): def test_validate_simple_protected_str(self): val = XsdValidator(XSD_METS_URL) report = val._validate(self.ws.mets.to_xml()) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_validate_simple_protected_doc(self): val = XsdValidator(XSD_METS_URL) report = val._validate(self.ws.mets._tree) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_validate_simple_static_doc(self): report = XsdValidator.validate(XSD_METS_URL, self.ws.mets._tree) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) class TestXsdPageValidator(TestCase): def test_validate_page_simple_static_doc(self): report = XsdPageValidator.validate(simple_page) - self.assertTrue(report.is_valid) + 
self.assertTrue(report.is_valid, str(report.to_xml())) if __name__ == '__main__': main(__file__) From 161cf0c5797bd6c619340fd6d2df48d5dee6c078 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 22:28:56 +0200 Subject: [PATCH 101/228] JsonValidator: fix deprecation warning (by actually checking instance) --- src/ocrd_validators/json_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_validators/json_validator.py b/src/ocrd_validators/json_validator.py index ccd27b92a2..4fb84b3fdb 100644 --- a/src/ocrd_validators/json_validator.py +++ b/src/ocrd_validators/json_validator.py @@ -25,7 +25,7 @@ def set_defaults_and_handle_deprecate(validator, properties, instance, schema): for prop, subschema in properties.items(): if "default" in subschema: instance.setdefault(prop, subschema["default"]) - if subschema.get('deprecated', False): + if subschema.get('deprecated', False) and instance.get(prop): yield JsonSchemaDeprecationWarning(f"Property {prop} has been deprecated, ocrd-tool.json should be updated.") for error in validate_properties(validator, properties, instance, schema): From b2e6485642d096ed229c004a7cb88cf73ae1718c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 22:52:26 +0200 Subject: [PATCH 102/228] predefine union types OcrdFileType and OcrdPageType --- src/ocrd/processor/base.py | 16 ++++++++-------- src/ocrd/processor/builtin/dummy_processor.py | 4 ++-- src/ocrd_models/__init__.py | 4 ++-- src/ocrd_models/ocrd_file.py | 2 ++ src/ocrd_models/ocrd_page.py | 5 ++++- 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 7517497906..6c91eb00a9 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -15,7 +15,7 @@ import os from os import getcwd from pathlib import Path -from typing import List, Optional, Union +from typing import List, Optional, Union, get_args import sys import inspect import 
tarfile @@ -25,7 +25,7 @@ from requests import HTTPError from ocrd.workspace import Workspace -from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile +from ocrd_models.ocrd_file import OcrdFileType from ocrd.processor.ocrd_page_result import OcrdPageResult from ocrd_utils import ( VERSION as OCRD_VERSION, @@ -332,7 +332,7 @@ def process_workspace(self, workspace: Workspace) -> None: try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) + input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) page_id = next(input_file.pageId for input_file in input_file_tuple if input_file) @@ -382,7 +382,7 @@ def process_workspace(self, workspace: Workspace) -> None: # fall back to deprecated method self.process() - def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> None: + def _copy_page_file(self, input_file : OcrdFileType) -> None: """ Copy the given ``input_file`` of the :py:data:`workspace`, representing one physical page (passed as one opened @@ -390,7 +390,7 @@ def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> N and add it as if it was a processing result. 
""" input_pcgts : OcrdPage - assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) + assert isinstance(input_file, get_args(OcrdFileType)) self._base_logger.debug(f"parsing file {input_file.ID} for page {input_file.pageId}") try: input_pcgts = page_from_file(input_file) @@ -410,7 +410,7 @@ def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> N force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) - def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: + def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: """ Process the given ``input_files`` of the :py:data:`workspace`, representing one physical page (passed as one opened @@ -422,10 +422,10 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc to handle cases like multiple output fileGrps, non-PAGE input etc.) """ input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) - assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile)) + assert isinstance(input_files[0], get_args(OcrdFileType)) page_id = input_files[0].pageId for i, input_file in enumerate(input_files): - assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) + assert isinstance(input_file, get_args(OcrdFileType)) self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") try: page_ = page_from_file(input_file) diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index c2f0eec4f7..7b2f1b66ee 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -7,7 +7,7 @@ from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.processor.ocrd_page_result import OcrdPageResult -from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile +from ocrd_models.ocrd_file import OcrdFileType from ocrd_models.ocrd_page import OcrdPage, to_xml 
from ocrd_utils import ( getLogger, @@ -32,7 +32,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # nothing to do here return OcrdPageResult(input_pcgts[0]) - def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: input_file = input_files[0] assert input_file assert input_file.local_filename diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index 330fefe97d..ff4e31798b 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -3,8 +3,8 @@ """ from .ocrd_agent import OcrdAgent, ClientSideOcrdAgent from .ocrd_exif import OcrdExif -from .ocrd_file import OcrdFile, ClientSideOcrdFile +from .ocrd_file import OcrdFile, ClientSideOcrdFile, OcrdFileType from .ocrd_mets import OcrdMets -from .ocrd_page import OcrdPage +from .ocrd_page import OcrdPage, OcrdPageType from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport diff --git a/src/ocrd_models/ocrd_file.py b/src/ocrd_models/ocrd_file.py index 2315a08ff3..a116341710 100644 --- a/src/ocrd_models/ocrd_file.py +++ b/src/ocrd_models/ocrd_file.py @@ -266,3 +266,5 @@ def __str__(self): for k in ['fileGrp', 'ID', 'mimetype', 'url', 'local_filename'] ]) return '' % (props) + +OcrdFileType = Union[OcrdFile, ClientSideOcrdFile] diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index 87e644fd90..6accb9241f 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -2,7 +2,7 @@ API to PAGE-XML, generated with generateDS from XML schema. 
""" from io import StringIO -from typing import Dict +from typing import Dict, Union from inspect import getmembers from lxml import etree as ET @@ -11,6 +11,7 @@ 'parseEtree', 'parseString', 'OcrdPage', + 'OcrdPageType', "AdvertRegionType", "AlternativeImageType", @@ -199,6 +200,8 @@ def __init__( def __getattr__(self, name): return getattr(self._pcgts, name) +OcrdPageType = Union[OcrdPage, PcGtsType] + def to_xml(el, skip_declaration=False) -> str: """ Serialize ``pc:PcGts`` document as string. From 822d731059532b5e9f401afd7532ba4ab8acfa34 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 23:24:08 +0200 Subject: [PATCH 103/228] processor CLI --debug: set all to ABORT (not just MISSING_OUTPUT) --- src/ocrd/decorators/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 364ef4c847..3f07ede4a6 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -118,10 +118,12 @@ def resolve(name): resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url) workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url) page_id = kwargs.get('page_id') - if overwrite: - config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' if debug: + config.OCRD_MISSING_INPUT = 'ABORT' config.OCRD_MISSING_OUTPUT = 'ABORT' + config.OCRD_EXISTING_OUTPUT = 'ABORT' + if overwrite: + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) if not report.is_valid: raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) From 3a7a7713abdf218a6bc64317dba83cd528e26589 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 23:24:39 +0200 Subject: [PATCH 104/228] :memo: changelog --- CHANGELOG.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff 
--git a/CHANGELOG.md b/CHANGELOG.md index d552580706..1b53c6a28d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,28 @@ Change Log Versioned according to [Semantic Versioning](http://semver.org/). +## Unreleased + +Changed: + - :fire: `OcrdPage` as proxy of `PcGtsType` instead of alias; also contains `etree` and `mapping` now + - :fire: `Processor.zip_input_files` now can throw `ocrd.NonUniqueInputFile` and `ocrd.MissingInputFile` + (the latter only if `OCRD_MISSING_INPUT=ABORT`) + - :fire: `Processor.zip_input_files` does not by default use `require_first` anymore + (so the first file in any input file tuple per page can be `None` as well) + - :fire: no more `Workspace.overwrite_mode`, merely delegate to `OCRD_EXISTING_OUTPUT=OVERWRITE` + - :art: improve on docs result for `ocrd_utils.config` + +Added: + - :point_right: `OCRD_DOWNLOAD_INPUT` for whether input files should be downloaded before processing + - :point_right: `OCRD_MISSING_INPUT` for how to handle missing input files (**`SKIP`** or `ABORT`) + - :point_right: `OCRD_MISSING_OUTPUT` for how to handle processing failures (**`SKIP`** or `ABORT` or `COPY`) + the latter behaves like ocrd-dummy for the failed page(s) + - :point_right: `OCRD_EXISTING_OUTPUT` for how to handle existing output files (**`SKIP`** or `ABORT` or `OVERWRITE`) + - new CLI option `--debug` as short-hand for `ABORT` choices above + - `Processor.logger` set up by constructor already (for re-use by processor implementors) + - `default`-expand and validate `ocrd_tool.json` in `Processor` constructor, log invalidities + - handle JSON `deprecation` in `ocrd_tool.json` by reporting warnings + ## [3.0.0a1] - 2024-08-15 Changed: From 2bdb6c438d8ca3a9592dbe34d95592cbcdc650f6 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 22 Aug 2024 11:10:51 +0200 Subject: [PATCH 105/228] :package: v3.0.0a2 --- CHANGELOG.md | 4 ++++ VERSION | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
1b53c6a28d..38f36b96d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0a2] - 2024-08-22 + Changed: - :fire: `OcrdPage` as proxy of `PcGtsType` instead of alias; also contains `etree` and `mapping` now - :fire: `Processor.zip_input_files` now can throw `ocrd.NonUniqueInputFile` and `ocrd.MissingInputFile` @@ -2191,6 +2193,8 @@ Fixed Initial Release +[3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1 +[3.0.0a1]: ../../compare/v3.0.0a1..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 [2.67.0]: ../../compare/v2.67.0..v2.66.1 diff --git a/VERSION b/VERSION index 2a94548735..3a5b5bc9d6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0a1 \ No newline at end of file +3.0.0a2 From 00bd6fe8500ffcbf125dcc157c5997ed115c9023 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 22 Aug 2024 11:13:38 +0200 Subject: [PATCH 106/228] remove make *-workaround, we will not do that for v3+ --- CHANGELOG.md | 3 +++ Makefile | 38 -------------------------------------- 2 files changed, 3 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38f36b96d6..43bf85764d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Changed: + - :fire: `ocrd_utils`, `ocrd_models`, `ocrd_modelfactory`, `ocrd_validators` and `ocrd_network` are not published as separate packages anymore, everything is contained in `ocrd` and you should adapt your `requirements.txt` accordingly. 
+ ## [3.0.0a2] - 2024-08-22 Changed: diff --git a/Makefile b/Makefile index 39b46ee849..fd1210b65a 100644 --- a/Makefile +++ b/Makefile @@ -401,41 +401,3 @@ docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: # Build wheels and source dist and twine upload them pypi: build twine upload --verbose dist/ocrd-$(VERSION)*{tar.gz,whl} - -pypi-workaround: build-workaround - for dist in $(BUILD_ORDER);do twine upload dist/$$dist-$(VERSION)*{tar.gz,whl};done - -# Only in place until v3 so we don't break existing installations -build-workaround: pyclean - cp pyproject.toml pyproject.toml.BAK - cp src/ocrd_utils/constants.py src/ocrd_utils/constants.py.BAK - cp src/ocrd/cli/__init__.py src/ocrd/cli/__init__.py.BAK - for dist in $(BUILD_ORDER);do \ - cat pyproject.toml.BAK | sed "s,^name =.*,name = \"$$dist\"," > pyproject.toml; \ - cat src/ocrd_utils/constants.py.BAK | sed "s,dist_version('ocrd'),dist_version('$$dist')," > src/ocrd_utils/constants.py; \ - cat src/ocrd/cli/__init__.py.BAK | sed "s,package_name='ocrd',package_name='$$dist'," > src/ocrd/cli/__init__.py; \ - $(MAKE) build; \ - done - rm pyproject.toml.BAK - rm src/ocrd_utils/constants.py.BAK - rm src/ocrd/cli/__init__.py.BAK - -# test that the aliased packages work in isolation and combined -test-workaround: build-workaround - $(MAKE) uninstall-workaround - for dist in $(BUILD_ORDER);do \ - pip install dist/$$dist-*.whl ;\ - ocrd --version ;\ - make test ;\ - pip uninstall --yes $$dist ;\ - done - for dist in $(BUILD_ORDER);do \ - pip install dist/$$dist-*.whl ;\ - done - ocrd --version ;\ - make test ;\ - for dist in $(BUILD_ORDER);do pip uninstall --yes $$dist;done - -uninstall-workaround: - for dist in $(BUILD_ORDER);do $(PIP) uninstall --yes $$dist;done - From d7775273be5aada8554e4a14693a83afff2cdd1d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 13:10:10 +0200 Subject: [PATCH 107/228] =?UTF-8?q?Processor.parameter:=20only=20validate?= 
=?UTF-8?q?=20when=20set=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `Processor` init: do not validate, only set `parameter` if present (needed to avoid exception from `ParameterValidator` for processors with mandatory params in non-processing usage) - `Processor.parameter`: allow `None`, but validate when set - `Processor._setup`: validate parameters, then call `Processor.setup` --- src/ocrd/processor/base.py | 39 +++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 6c91eb00a9..76639acdd7 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -139,6 +139,19 @@ def ocrd_tool(self) -> dict: self._ocrd_tool = self.metadata['tools'][self.executable] return self._ocrd_tool + @property + def parameter(self) -> Optional[dict]: + """the runtime parameter dict to be used by this processor""" + if hasattr(self, '_parameter'): + return self._parameter + return None + + @parameter.setter + def parameter(self, parameter : dict) -> None: + self._parameter = parameter + # re-run setup to validate parameters and load models etc + self._setup() + def __init__( self, # FIXME: deprecate in favor of process_workspace(workspace) @@ -204,19 +217,12 @@ def __init__( "is deprecated - pass as argument to process_workspace instead") self.page_id = page_id or None self.download = download_files - if parameter is None: - parameter = {} - parameterValidator = ParameterValidator(self.ocrd_tool) - - report = parameterValidator.validate(parameter) - if not report.is_valid: - raise ValueError("Invalid parameters %s" % report.errors) - self.parameter = parameter - # NOTE: this is the logger to be used by processor implementations, - # `processor.base` default implementations should use - # :py:attr:`self._base_logger` + #: The logger to be used by processor implementations. 
+ # `ocrd.processor.base` internals should use :py:attr:`self._base_logger` self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') self._base_logger = getLogger('ocrd.processor.base') + if parameter is not None: + self.parameter = parameter # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) @@ -289,6 +295,17 @@ def list_resources(self): print(res) return + def _setup(self) -> None: + """ + Validate parameters, then run :py:meth:`setup`. Called whenever + :py:data:`parameter` changes. + """ + parameterValidator = ParameterValidator(self.ocrd_tool) + report = parameterValidator.validate(self.parameter) + if not report.is_valid: + raise ValueError("Invalid parameters %s" % report.errors) + self.setup() + def setup(self) -> None: """ Prepare the processor for actual data processing, From 7998aae6e1e1ff4a80da3a203496d8ba4bd5e04a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 13:16:01 +0200 Subject: [PATCH 108/228] get_processor: ensure passing non-empty parameter, rely on `_setup` to call `setup` --- src/ocrd/processor/helpers.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 08ca0a4683..bf9b0e8a1c 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -7,7 +7,7 @@ import json import inspect from subprocess import run -from typing import List +from typing import List, Optional from click import wrap_text from ocrd.workspace import Workspace @@ -374,16 +374,14 @@ def get_cached_processor(parameter: dict, processor_class): Otherwise, an instance of the `:py:class:~ocrd.Processor` is returned. 
""" if processor_class: - dict_params = dict(parameter) if parameter else None - processor = processor_class(None, parameter=dict_params) - processor.setup() + processor = processor_class(None, parameter=dict(parameter)) return processor return None def get_processor( processor_class, - parameter: dict, + parameter: Optional[dict], workspace: Workspace = None, page_id: str = None, input_file_grp: List[str] = None, @@ -391,11 +389,14 @@ def get_processor( instance_caching: bool = False, ): if processor_class: + if parameter is None: + parameter = {} if instance_caching: processor = get_cached_processor(parameter, processor_class) else: + # avoid passing workspace already (deprecated chdir behaviour) processor = processor_class(None, parameter=parameter) - processor.setup() + # set current processing parameters processor.workspace = workspace processor.page_id = page_id processor.input_file_grp = input_file_grp From cc8592b7b0667118a98abe880e02a353c16136ca Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 13:17:03 +0200 Subject: [PATCH 109/228] test_processor: adapt, check required parameters --- tests/processor/test_processor.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 0cbae7d548..8ade93a708 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -16,6 +16,7 @@ from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver from ocrd.processor import Processor, run_processor, run_cli, NonUniqueInputFile +from ocrd.processor.helpers import get_processor from unittest import mock import pytest @@ -95,8 +96,18 @@ def test_json(self): DummyProcessor(None).dump_json() def test_params_missing_required(self): - with self.assertRaisesRegex(Exception, 'is a required property'): - DummyProcessorWithRequiredParameters(None) + proc = 
DummyProcessorWithRequiredParameters(None) + assert proc.parameter is None + with self.assertRaisesRegex(ValueError, 'is a required property'): + proc.parameter = {} + with self.assertRaisesRegex(ValueError, 'is a required property'): + get_processor(DummyProcessorWithRequiredParameters, None) + with self.assertRaisesRegex(ValueError, 'is a required property'): + get_processor(DummyProcessorWithRequiredParameters, {}) + with self.assertRaisesRegex(ValueError, 'is a required property'): + run_processor(DummyProcessorWithRequiredParameters, + workspace=self.workspace, input_file_grp="OCR-D-IMG") + proc.parameter = {'i-am-required': 'foo'} def test_params_preset_resolve(self): with pushd_popd(tempdir=True) as tempdir: @@ -127,6 +138,9 @@ class ParamTestProcessor(Processor): def ocrd_tool(self): return {} proc = ParamTestProcessor(None) + self.assertEqual(proc.parameter, None) + # get_processor will set to non-none and validate + proc = get_processor(ParamTestProcessor, None) self.assertEqual(proc.parameter, {}) def test_run_agent(self): From 45e556d425cc755b84ed409b817fd00a77175270 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:46:26 +0200 Subject: [PATCH 110/228] improve _setup docstring Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 76639acdd7..14fb799f48 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -298,7 +298,7 @@ def list_resources(self): def _setup(self) -> None: """ Validate parameters, then run :py:meth:`setup`. Called whenever - :py:data:`parameter` changes. + :py:data:`parameter` is re-assigned. 
""" parameterValidator = ParameterValidator(self.ocrd_tool) report = parameterValidator.validate(self.parameter) From d4c802be7af4c4d4bdd6b9b41bd22e8552af11b8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:50:32 +0200 Subject: [PATCH 111/228] Processor._setup: raise with full ParameterValidator report Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 14fb799f48..d930f8a0cf 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -303,7 +303,7 @@ def _setup(self) -> None: parameterValidator = ParameterValidator(self.ocrd_tool) report = parameterValidator.validate(self.parameter) if not report.is_valid: - raise ValueError("Invalid parameters %s" % report.errors) + raise ValueError(f'Invalid parameters:\n{report.to_xml()}') self.setup() def setup(self) -> None: From b28fefb066dc2b487aa86212a04c4c4736e662b9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:58:03 +0200 Subject: [PATCH 112/228] get_processor: parameter only as kwarg Co-authored-by: Konstantin Baierer --- src/ocrd/processor/helpers.py | 2 +- tests/processor/test_processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index bf9b0e8a1c..56328fad72 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -381,7 +381,7 @@ def get_cached_processor(parameter: dict, processor_class): def get_processor( processor_class, - parameter: Optional[dict], + parameter: Optional[dict] = None, workspace: Workspace = None, page_id: str = None, input_file_grp: List[str] = None, diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 8ade93a708..74c56aa9a5 100644 --- 
a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -101,7 +101,7 @@ def test_params_missing_required(self): with self.assertRaisesRegex(ValueError, 'is a required property'): proc.parameter = {} with self.assertRaisesRegex(ValueError, 'is a required property'): - get_processor(DummyProcessorWithRequiredParameters, None) + get_processor(DummyProcessorWithRequiredParameters) with self.assertRaisesRegex(ValueError, 'is a required property'): get_processor(DummyProcessorWithRequiredParameters, {}) with self.assertRaisesRegex(ValueError, 'is a required property'): From 642938b6a92709e65827858fe2efee79e3992714 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 15:08:03 +0200 Subject: [PATCH 113/228] tests: adapt for get_processor parameter only as kwarg --- src/ocrd/processor/helpers.py | 2 +- tests/processor/test_processor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 56328fad72..a8cea96fab 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -84,7 +84,7 @@ def run_processor( log.debug("Running processor %s", processorClass) processor = get_processor( - processor_class=processorClass, + processorClass, parameter=parameter, workspace=None, page_id=page_id, diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 74c56aa9a5..6a35dda0fa 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -103,7 +103,7 @@ def test_params_missing_required(self): with self.assertRaisesRegex(ValueError, 'is a required property'): get_processor(DummyProcessorWithRequiredParameters) with self.assertRaisesRegex(ValueError, 'is a required property'): - get_processor(DummyProcessorWithRequiredParameters, {}) + get_processor(DummyProcessorWithRequiredParameters, parameter={}) with self.assertRaisesRegex(ValueError, 'is a required property'): 
run_processor(DummyProcessorWithRequiredParameters, workspace=self.workspace, input_file_grp="OCR-D-IMG") @@ -140,7 +140,7 @@ def ocrd_tool(self): proc = ParamTestProcessor(None) self.assertEqual(proc.parameter, None) # get_processor will set to non-none and validate - proc = get_processor(ParamTestProcessor, None) + proc = get_processor(ParamTestProcessor) self.assertEqual(proc.parameter, {}) def test_run_agent(self): From f5e5c54a5f830c37fde1d57d1cf48c6013ddbc70 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:08:10 +0200 Subject: [PATCH 114/228] Processor.parameter: make the bound dict read-only Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index d930f8a0cf..53e2da3dc3 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -148,7 +148,8 @@ def parameter(self) -> Optional[dict]: @parameter.setter def parameter(self, parameter : dict) -> None: - self._parameter = parameter + from types import MappingProxyType + self._parameter = MappingProxyType(parameter) # re-run setup to validate parameters and load models etc self._setup() From f2d53a63afab5588131d14dc9e82f39d28108635 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 15:31:14 +0200 Subject: [PATCH 115/228] Processor.parameter: move ParameterValidator back to setter, convert to plain dict in getter for serialization etc --- src/ocrd/processor/base.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 53e2da3dc3..4780338dd2 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -16,6 +16,7 @@ from os import getcwd from pathlib import Path from typing import List, Optional, Union, get_args +from types import MappingProxyType import sys 
import inspect import tarfile @@ -143,15 +144,19 @@ def ocrd_tool(self) -> dict: def parameter(self) -> Optional[dict]: """the runtime parameter dict to be used by this processor""" if hasattr(self, '_parameter'): - return self._parameter + return dict(self._parameter) return None @parameter.setter def parameter(self, parameter : dict) -> None: - from types import MappingProxyType + parameterValidator = ParameterValidator(self.ocrd_tool) + report = parameterValidator.validate(parameter) + if not report.is_valid: + raise ValueError(f'Invalid parameters:\n{report.to_xml()}') + # make parameter dict read-only self._parameter = MappingProxyType(parameter) - # re-run setup to validate parameters and load models etc - self._setup() + # (re-)run setup to load models etc + self.setup() def __init__( self, @@ -296,17 +301,6 @@ def list_resources(self): print(res) return - def _setup(self) -> None: - """ - Validate parameters, then run :py:meth:`setup`. Called whenever - :py:data:`parameter` is re-assigned. 
- """ - parameterValidator = ParameterValidator(self.ocrd_tool) - report = parameterValidator.validate(self.parameter) - if not report.is_valid: - raise ValueError(f'Invalid parameters:\n{report.to_xml()}') - self.setup() - def setup(self) -> None: """ Prepare the processor for actual data processing, From 7297ca2d1a76bf82ed96efa44e55ddb75e9b7551 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 15:56:31 +0200 Subject: [PATCH 116/228] Processor.parameter: frozendict instead of mappingproxy, add test --- src/ocrd/processor/base.py | 6 +++--- tests/processor/test_processor.py | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 4780338dd2..336b479f50 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -16,12 +16,12 @@ from os import getcwd from pathlib import Path from typing import List, Optional, Union, get_args -from types import MappingProxyType import sys import inspect import tarfile import io from warnings import warn +from frozendict import frozendict from deprecated import deprecated from requests import HTTPError @@ -144,7 +144,7 @@ def ocrd_tool(self) -> dict: def parameter(self) -> Optional[dict]: """the runtime parameter dict to be used by this processor""" if hasattr(self, '_parameter'): - return dict(self._parameter) + return self._parameter return None @parameter.setter @@ -154,7 +154,7 @@ def parameter(self, parameter : dict) -> None: if not report.is_valid: raise ValueError(f'Invalid parameters:\n{report.to_xml()}') # make parameter dict read-only - self._parameter = MappingProxyType(parameter) + self._parameter = frozendict(parameter) # (re-)run setup to load models etc self.setup() diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 6a35dda0fa..d037eed3f9 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -74,14 +74,22 @@ def 
test_parameter(self): with open(jsonpath, 'w') as f: f.write('{"baz": "quux"}') with open(jsonpath, 'r') as f: + parameter = json.load(f) processor = run_processor( DummyProcessor, - parameter=json.load(f), + parameter=parameter, input_file_grp="OCR-D-IMG", resolver=self.resolver, workspace=self.workspace ) self.assertEqual(processor.parameter['baz'], 'quux') + processor = get_processor( + DummyProcessor, + parameter=parameter) + with self.assertRaises(TypeError): + processor.parameter['baz'] = 'xuuq' + processor.parameter = { **parameter, 'baz': 'xuuq' } + self.assertEqual(processor.parameter['baz'], 'xuuq') def test_verify(self): proc = DummyProcessor(None) From 6cd4a34c689f56d4cbe5600d85978567d7b1e60e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 19:14:41 +0200 Subject: [PATCH 117/228] introduce Processor.shutdown to be overridden (called at deinit or parameter re-assignment) --- src/ocrd/processor/base.py | 15 +++++++++++++++ tests/processor/test_processor.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 336b479f50..29305c8803 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -20,6 +20,7 @@ import inspect import tarfile import io +import weakref from warnings import warn from frozendict import frozendict from deprecated import deprecated @@ -149,6 +150,8 @@ def parameter(self) -> Optional[dict]: @parameter.setter def parameter(self, parameter : dict) -> None: + if self.parameter is not None: + self.shutdown() parameterValidator = ParameterValidator(self.ocrd_tool) report = parameterValidator.validate(parameter) if not report.is_valid: @@ -229,6 +232,8 @@ def __init__( self._base_logger = getLogger('ocrd.processor.base') if parameter is not None: self.parameter = parameter + # ensure that shutdown gets called at destruction + self._finalizer = weakref.finalize(self, self.shutdown) # workaround for deprecated#72 
(@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) @@ -311,6 +316,16 @@ def setup(self) -> None: """ pass + def shutdown(self) -> None: + """ + Bring down the processor after data processing, + after to changing back from the workspace directory but + before exiting (or setting up with different parameters). + + (Override this to unload models from memory etc.) + """ + pass + @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()') def process(self) -> None: """ diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index d037eed3f9..5cee01d644 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -91,6 +91,37 @@ def test_parameter(self): processor.parameter = { **parameter, 'baz': 'xuuq' } self.assertEqual(processor.parameter['baz'], 'xuuq') + def test_instance_caching(self): + class DyingDummyProcessor(DummyProcessor): + def shutdown(self): + print(self.parameter['baz']) + self.capture_out_err() + # well above OCRD_MAX_PROCESSOR_CACHE=128 + firstp = None + for i in range(200): + p = get_processor( + DyingDummyProcessor, + parameter={'baz': str(i)}, + instance_caching=True + ) + if i == 0: + firstp = p + lastp = p + p = get_processor(DyingDummyProcessor, + parameter={'baz': '0'}, + instance_caching=True + ) + # should not be cached anymore + self.assertNotEqual(firstp, p) + p = get_processor(DyingDummyProcessor, + parameter={'baz': '199'}, + instance_caching=True + ) + # should still be cached + self.assertEqual(lastp, p) + out, err = self.capture_out_err() + #assert '0' in out.split('\n') + def test_verify(self): proc = DummyProcessor(None) with self.assertRaises(AttributeError): From 407bff8c0fd5a19f3b1a9864718addb4713ce403 Mon Sep 17 00:00:00 2001 From: 
Robert Sachunsky Date: Fri, 23 Aug 2024 17:27:47 +0200 Subject: [PATCH 118/228] Processor: introduce `max_instances` class attribute --- src/ocrd/processor/base.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 29305c8803..e880928243 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -104,6 +104,14 @@ class Processor(): a number of optional or mandatory parameters. """ + max_instances : int = -1 + """ + maximum number of cached instances (ignored if negative), to be applied on top of + :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller). + + (Override this if you know how many instances fit into memory at once.) + """ + @property def metadata(self) -> dict: """the ocrd-tool.json dict of the package""" From c9fbb2c5aa569428f99fb953c87df10eaf447895 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Aug 2024 17:33:44 +0200 Subject: [PATCH 119/228] get_cached_processor: set lru_cache maxsize from min(cfg,class) at runtime --- src/ocrd/processor/helpers.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index a8cea96fab..e0dd502724 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -359,9 +359,9 @@ def wrap(s): pass -# Taken from https://github.com/OCR-D/core/pull/884 -@freeze_args -@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE) +# not decorated here but at runtime (on first use) +#@freeze_args +#@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE) def get_cached_processor(parameter: dict, processor_class): """ Call this function to get back an instance of a processor. 
@@ -378,7 +378,6 @@ def get_cached_processor(parameter: dict, processor_class): return processor return None - def get_processor( processor_class, parameter: Optional[dict] = None, @@ -392,6 +391,16 @@ def get_processor( if parameter is None: parameter = {} if instance_caching: + global get_cached_processor + if not hasattr(get_cached_processor, '__wrapped__'): + # first call: wrap + if processor_class.max_instances < 0: + maxsize = config.OCRD_MAX_PROCESSOR_CACHE + else: + maxsize = min(config.OCRD_MAX_PROCESSOR_CACHE, processor_class.max_instances) + # wrapping in call cache + # wrapping dict into frozendict (from https://github.com/OCR-D/core/pull/884) + get_cached_processor = freeze_args(lru_cache(maxsize=maxsize)(get_cached_processor)) processor = get_cached_processor(parameter, processor_class) else: # avoid passing workspace already (deprecated chdir behaviour) From 9c212a9ca779b0d36fc37dc03034a1b16659e5ff Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Aug 2024 17:34:15 +0200 Subject: [PATCH 120/228] test get_processor instance_caching w/ max_instances --- tests/processor/test_processor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 5cee01d644..fed950cad5 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -93,12 +93,13 @@ def test_parameter(self): def test_instance_caching(self): class DyingDummyProcessor(DummyProcessor): + max_instances = 10 def shutdown(self): print(self.parameter['baz']) self.capture_out_err() - # well above OCRD_MAX_PROCESSOR_CACHE=128 + # customize (as processor implementors would) firstp = None - for i in range(200): + for i in range(DyingDummyProcessor.max_instances + 2): p = get_processor( DyingDummyProcessor, parameter={'baz': str(i)}, @@ -114,7 +115,7 @@ def shutdown(self): # should not be cached anymore self.assertNotEqual(firstp, p) p = get_processor(DyingDummyProcessor, - 
parameter={'baz': '199'}, + parameter={'baz': str(i)}, instance_caching=True ) # should still be cached From a413f046c530b49f1f9ce4d62695505717c861a5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Aug 2024 17:54:38 +0200 Subject: [PATCH 121/228] test get_processor instance_caching w/ clear_cache --- tests/processor/test_processor.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index fed950cad5..19ff1087f2 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -95,7 +95,9 @@ def test_instance_caching(self): class DyingDummyProcessor(DummyProcessor): max_instances = 10 def shutdown(self): - print(self.parameter['baz']) + # fixme: will only print _after_ pytest exits, so too late for assertions + #print(self.parameter['baz']) + pass self.capture_out_err() # customize (as processor implementors would) firstp = None @@ -120,7 +122,16 @@ def shutdown(self): ) # should still be cached self.assertEqual(lastp, p) - out, err = self.capture_out_err() + from ocrd.processor.helpers import get_cached_processor + get_cached_processor.__wrapped__.cache_clear() + p = get_processor(DyingDummyProcessor, + parameter={'baz': str(i)}, + instance_caching=True + ) + # should not be cached anymore + self.assertNotEqual(lastp, p) + # fixme: will only print _after_ pytest exits, so too late for assertions + #out, err = self.capture_out_err() #assert '0' in out.split('\n') def test_verify(self): From 870523cb0f7b3558abd03e024121cb0c2521a706 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 22 Aug 2024 11:10:51 +0200 Subject: [PATCH 122/228] :package: v3.0.0a2 --- CHANGELOG.md | 4 ++++ VERSION | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b53c6a28d..38f36b96d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [3.0.0a2] - 2024-08-22 + Changed: - :fire: `OcrdPage` as proxy of `PcGtsType` instead of alias; also contains `etree` and `mapping` now - :fire: `Processor.zip_input_files` now can throw `ocrd.NonUniqueInputFile` and `ocrd.MissingInputFile` @@ -2191,6 +2193,8 @@ Fixed Initial Release +[3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1 +[3.0.0a1]: ../../compare/v3.0.0a1..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 [2.67.0]: ../../compare/v2.67.0..v2.66.1 diff --git a/VERSION b/VERSION index 2a94548735..3a5b5bc9d6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0a1 \ No newline at end of file +3.0.0a2 From 20bb6d1114433a17c2a88cfdd52db635f1eb24e6 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 22 Aug 2024 11:13:38 +0200 Subject: [PATCH 123/228] remove make *-workaround, we will not do that for v3+ --- CHANGELOG.md | 3 +++ Makefile | 38 -------------------------------------- 2 files changed, 3 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38f36b96d6..43bf85764d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Changed: + - :fire: `ocrd_utils`, `ocrd_models`, `ocrd_modelfactory`, `ocrd_validators` and `ocrd_network` are not published as separate packages anymore, everything is contained in `ocrd` and you should adapt your `requirements.txt` accordingly. 
+ ## [3.0.0a2] - 2024-08-22 Changed: diff --git a/Makefile b/Makefile index 39b46ee849..fd1210b65a 100644 --- a/Makefile +++ b/Makefile @@ -401,41 +401,3 @@ docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: # Build wheels and source dist and twine upload them pypi: build twine upload --verbose dist/ocrd-$(VERSION)*{tar.gz,whl} - -pypi-workaround: build-workaround - for dist in $(BUILD_ORDER);do twine upload dist/$$dist-$(VERSION)*{tar.gz,whl};done - -# Only in place until v3 so we don't break existing installations -build-workaround: pyclean - cp pyproject.toml pyproject.toml.BAK - cp src/ocrd_utils/constants.py src/ocrd_utils/constants.py.BAK - cp src/ocrd/cli/__init__.py src/ocrd/cli/__init__.py.BAK - for dist in $(BUILD_ORDER);do \ - cat pyproject.toml.BAK | sed "s,^name =.*,name = \"$$dist\"," > pyproject.toml; \ - cat src/ocrd_utils/constants.py.BAK | sed "s,dist_version('ocrd'),dist_version('$$dist')," > src/ocrd_utils/constants.py; \ - cat src/ocrd/cli/__init__.py.BAK | sed "s,package_name='ocrd',package_name='$$dist'," > src/ocrd/cli/__init__.py; \ - $(MAKE) build; \ - done - rm pyproject.toml.BAK - rm src/ocrd_utils/constants.py.BAK - rm src/ocrd/cli/__init__.py.BAK - -# test that the aliased packages work in isolation and combined -test-workaround: build-workaround - $(MAKE) uninstall-workaround - for dist in $(BUILD_ORDER);do \ - pip install dist/$$dist-*.whl ;\ - ocrd --version ;\ - make test ;\ - pip uninstall --yes $$dist ;\ - done - for dist in $(BUILD_ORDER);do \ - pip install dist/$$dist-*.whl ;\ - done - ocrd --version ;\ - make test ;\ - for dist in $(BUILD_ORDER);do pip uninstall --yes $$dist;done - -uninstall-workaround: - for dist in $(BUILD_ORDER);do $(PIP) uninstall --yes $$dist;done - From faa59a87364062c887de35108189fb621634db31 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 23 Aug 2024 17:29:11 +0200 Subject: [PATCH 124/228] Processor.metadata_location property to specify where in the package ocrd-tool.json is found --- 
src/ocrd/processor/base.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index e880928243..4351b48656 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -112,12 +112,19 @@ class Processor(): (Override this if you know how many instances fit into memory at once.) """ + @property + def metadata_location(self) -> str: + """ + Location of `ocrd-tool.json` inside the package. By default we expect it in the root of the module + """ + return 'ocrd-tool.json' + @property def metadata(self) -> dict: """the ocrd-tool.json dict of the package""" if hasattr(self, '_metadata'): return self._metadata - self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json')) + self._metadata = json.loads(resource_string(self.__module__.split('.')[0], self.metadata_location)) report = OcrdToolValidator.validate(self._metadata) if not report.is_valid: # FIXME: remove when bertsky/core#10 is merged From 5819c8167d1a2be662dd32b58bff0531ded40f8b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 23 Aug 2024 19:06:37 +0200 Subject: [PATCH 125/228] Processor.verify: always check cardinality (as we now have the defaults from ocrd-tool.json) --- src/ocrd/processor/base.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 4351b48656..4f1e86b962 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -127,9 +127,8 @@ def metadata(self) -> dict: self._metadata = json.loads(resource_string(self.__module__.split('.')[0], self.metadata_location)) report = OcrdToolValidator.validate(self._metadata) if not report.is_valid: - # FIXME: remove when bertsky/core#10 is merged - self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}') - self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 
'invalid'}:\n{report.to_xml()}.\nPlease open an issue at {self._metadata['git_url']}.") + self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n" + f"{report.to_xml()}.\nPlease open an issue at {self._metadata['git_url']}.") return self._metadata @property @@ -178,7 +177,7 @@ def parameter(self, parameter : dict) -> None: def __init__( self, - # FIXME: deprecate in favor of process_workspace(workspace) + # FIXME: remove in favor of process_workspace(workspace) workspace : Optional[Workspace], ocrd_tool=None, parameter=None, @@ -286,14 +285,10 @@ def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], assert len(grps) >= minimum, msg % (len(grps), str(spec)) if maximum > 0: assert len(grps) <= maximum, msg % (len(grps), str(spec)) - # FIXME: maybe we should enforce the cardinality properties to be specified or apply default=1 here - # (but we already have ocrd-tool validation, and these first need to be adopted by implementors) - if 'input_file_grp_cardinality' in self.ocrd_tool: - assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'], - "Unexpected number of input file groups %d vs %s") - if 'output_file_grp_cardinality' in self.ocrd_tool: - assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], - "Unexpected number of output file groups %d vs %s") + assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'], + "Unexpected number of input file groups %d vs %s") + assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], + "Unexpected number of output file groups %d vs %s") for input_file_grp in input_file_grps: assert input_file_grp in self.workspace.mets.file_groups # keep this for backwards compatibility: From 4f88f1d209bc86cdaea031ae9b2bda685e4f8fba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 11:49:08 +0200 Subject: 
[PATCH 126/228] fix --log-filename (6fc606027a): apply in ocrd_cli_wrap_processor --- src/ocrd/decorators/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 3f07ede4a6..fcc70a71e8 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -1,4 +1,5 @@ import sys +from contextlib import nullcontext from ocrd_utils import ( config, @@ -9,6 +10,7 @@ parse_json_string_with_comments, set_json_key_value_overrides, parse_json_string_or_file, + redirect_stderr_and_stdout_to_file, ) from ocrd_validators import WorkspaceValidator from ocrd_network import ProcessingWorker, ProcessorServer, AgentType @@ -140,7 +142,7 @@ def resolve(name): print("Profiling...") pr = cProfile.Profile() pr.enable() - def exit(): + def goexit(): pr.disable() print("Profiling completed") if profile_file: @@ -149,8 +151,13 @@ def exit(): s = io.StringIO() pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() print(s.getvalue()) - atexit.register(exit) - run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) + atexit.register(goexit) + if log_filename: + log_ctx = redirect_stderr_and_stdout_to_file(log_filename) + else: + log_ctx = nullcontext() + with log_ctx: + run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): From d621f3631bc5a45730bc9f47d9ee0b6cd9aaf040 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 11:54:07 +0200 Subject: [PATCH 127/228] fix exception --- src/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 44bbd081bc..e63c5fd015 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -248,7 +248,7 @@ def _download_impl(url, filename, 
progress_cb=None, size=None): if "Content-Disposition" not in r.headers: url = get_url_from_gdrive_confirmation(r.text) except RuntimeError as e: - log.warning("Cannot unwrap Google Drive URL: ", e) + log.warning("Cannot unwrap Google Drive URL: %s", e) with open(filename, 'wb') as f: with requests.get(url, stream=True) as r: r.raise_for_status() From 4868fb152a1eedf0452a662aafcdce640cd20a88 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:27:33 +0200 Subject: [PATCH 128/228] adapt to PIL.Image moved constants --- src/ocrd/workspace.py | 8 +++---- src/ocrd_utils/image.py | 50 ++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 3523d9f15f..bd3380652f 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1168,9 +1168,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh # Transpose in affine coordinate transform: # (consistent with image transposition or AlternativeImage below) transposition = { - 90: Image.ROTATE_90, - 180: Image.ROTATE_180, - 270: Image.ROTATE_270 + 90: Image.Transpose.ROTATE_90, + 180: Image.Transpose.ROTATE_180, + 270: Image.Transpose.ROTATE_270 }.get(orientation) # no default segment_coords['transform'] = transpose_coordinates( segment_coords['transform'], transposition, @@ -1238,5 +1238,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa segment_image = segment_image.resize((int(segment_image.width * factor), int(segment_image.height * factor)), # slowest, but highest quality: - Image.BICUBIC) + Image.Resampling.BICUBIC) return segment_image, segment_coords, segment_xywh diff --git a/src/ocrd_utils/image.py b/src/ocrd_utils/image.py index 3bc14e6612..6f2524608c 100644 --- a/src/ocrd_utils/image.py +++ b/src/ocrd_utils/image.py @@ -65,10 +65,10 @@ def adjust_canvas_to_transposition(size, method): Return a numpy array of the enlarged width 
and height. """ - if method in [Image.ROTATE_90, - Image.ROTATE_270, - Image.TRANSPOSE, - Image.TRANSVERSE]: + if method in [Image.Transpose.ROTATE_90, + Image.Transpose.ROTATE_270, + Image.Transpose.TRANSPOSE, + Image.Transpose.TRANSVERSE]: size = size[::-1] return size @@ -348,26 +348,26 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): calculate the affine coordinate transform corresponding to the composition of both transformations, which is respectively: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: entails translation to the center, followed by pure reflection about the y-axis, and subsequent translation back - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: entails translation to the center, followed by pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: entails translation to the center, followed by pure reflection about the origin, and subsequent translation back - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: entails translation to the center, followed by pure rotation by 90° counter-clockwise, and subsequent translation back - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: entails translation to the center, followed by pure rotation by 270° counter-clockwise, and subsequent translation back - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the y-axis, and subsequent translation back @@ -388,13 +388,13 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): [0, 0, 1]]) transform = 
shift_coordinates(transform, -orig) operations = { - Image.FLIP_LEFT_RIGHT: [refly], - Image.FLIP_TOP_BOTTOM: [reflx], - Image.ROTATE_180: [reflx, refly], - Image.ROTATE_90: [rot90], - Image.ROTATE_270: [rot90, reflx, refly], - Image.TRANSPOSE: [rot90, reflx], - Image.TRANSVERSE: [rot90, refly] + Image.Transpose.FLIP_LEFT_RIGHT: [refly], + Image.Transpose.FLIP_TOP_BOTTOM: [reflx], + Image.Transpose.ROTATE_180: [reflx, refly], + Image.Transpose.ROTATE_90: [rot90], + Image.Transpose.ROTATE_270: [rot90, reflx, refly], + Image.Transpose.TRANSPOSE: [rot90, reflx], + Image.Transpose.TRANSVERSE: [rot90, refly] }.get(method) # no default for operation in operations: transform = np.dot(operation, transform) @@ -411,29 +411,29 @@ def transpose_image(image, method): Given a PIL.Image ``image`` and a transposition mode ``method``, apply the respective operation: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: all pixels get mirrored at half the width of the image - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: all pixels get mirrored at half the height of the image - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: all pixels get mirrored at both, the width and half the height of the image, i.e. the image gets rotated by 180° counter-clockwise - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: rows become columns (but counted from the right) and columns become rows, i.e. the image gets rotated by 90° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: rows become columns and columns become rows (but counted from the bottom), i.e. the image gets rotated by 270° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: rows become columns and vice versa, i.e. 
all pixels get mirrored at the main diagonal; width becomes height and vice versa - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: rows become columns (but counted from the right) and columns become rows (but counted from the bottom), i.e. all pixels get mirrored at the opposite diagonal; From da72c0a93eb6ab7d6bbdde4872ab54820a5c4a30 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:29:12 +0200 Subject: [PATCH 129/228] ocrd_utils: add parse_json_file_with_comments --- src/ocrd_utils/__init__.py | 2 ++ src/ocrd_utils/str.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index 836f01dce4..c853a34bd3 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -75,6 +75,7 @@ :py:func:`concat_padded`, :py:func:`nth_url_segment`, :py:func:`remove_non_path_from_url`, + :py:func:`parse_json_file_with_comments`, :py:func:`parse_json_string_with_comments`, :py:func:`parse_json_string_or_file`, :py:func:`set_json_key_value_overrides`, @@ -204,6 +205,7 @@ make_xml_id, nth_url_segment, partition_list, + parse_json_file_with_comments, parse_json_string_or_file, parse_json_string_with_comments, sparkline, diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index 7009a9ec0e..4f1e088050 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -21,6 +21,7 @@ 'make_file_id', 'make_xml_id', 'nth_url_segment', + 'parse_json_file_with_comments', 'parse_json_string_or_file', 'parse_json_string_with_comments', 'remove_non_path_from_url', @@ -162,6 +163,13 @@ def is_string(val): return isinstance(val, str) +def parse_json_file_with_comments(val): + """ + Parse a file of JSON interspersed with #-prefixed full-line comments + """ + with open(val, 'r', encoding='utf-8') as inputf: + return parse_json_string_with_comments(inputf.read()) + def parse_json_string_with_comments(val): """ Parse a string of JSON interspersed with #-prefixed full-line 
comments From ca78b94f108d3b2bf848000694b67322ba6a9919 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:31:35 +0200 Subject: [PATCH 130/228] cli.workspace: pass fileGrp as well, improve description --- src/ocrd/cli/workspace.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index e2186a727c..1461e53e07 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -118,7 +118,7 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency @workspace_cli.command('clone', cls=command_with_replaced_help( (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument @click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True) -@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning") +@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards") @click.argument('mets_url') @mets_find_options # XXX deprecated @@ -129,8 +129,10 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim Create a workspace from METS_URL and return the directory METS_URL can be a URL, an absolute path or a path relative to $PWD. - If METS_URL is not provided, use --mets accordingly. METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file. + + Additional options pertain to the selection of files / fileGrps / pages + to be downloaded, if --download is used. 
""" LOG = getLogger('ocrd.cli.workspace.clone') if workspace_dir: @@ -143,6 +145,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim mets_basename=ctx.mets_basename, clobber_mets=clobber_mets, download=download, + fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, @@ -408,7 +411,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi if dry_run: log.info('workspace.add_file(%s)' % file_dict) else: - workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) + workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg # save changes to disk workspace.save_mets() From cf41745d24231cb7fdbceeb1305e9dbc7752c94e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:35:37 +0200 Subject: [PATCH 131/228] OcrdMets.add_agent: does not have positional args --- src/ocrd/mets_server.py | 2 +- src/ocrd_models/ocrd_mets.py | 4 ++-- tests/model/test_ocrd_mets.py | 2 +- tests/test_workspace.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index da6e873c06..7c22da278d 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -236,7 +236,7 @@ def agents(self): agent_dict["_type"] = agent_dict.pop("type") return [ClientSideOcrdAgent(None, **agent_dict) for agent_dict in agent_dicts] - def add_agent(self, *args, **kwargs): + def add_agent(self, **kwargs): if not self.multiplexing_mode: return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) else: diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 4d1e6cba58..90d37b37dc 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -198,7 +198,7 @@ def agents(self) -> List[OcrdAgent]: """ return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] - def 
add_agent(self, *args, **kwargs) -> OcrdAgent: + def add_agent(self, **kwargs) -> OcrdAgent: """ Add an :py:class:`ocrd_models.ocrd_agent.OcrdAgent` to the list of agents in the ``metsHdr``. """ @@ -213,7 +213,7 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent: el_agent_last.addnext(el_agent) except StopIteration: el_metsHdr.insert(0, el_agent) - return OcrdAgent(el_agent, *args, **kwargs) + return OcrdAgent(el_agent, **kwargs) @property def file_groups(self) -> List[str]: diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 739db7625a..89742a507e 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -248,7 +248,7 @@ def test_file_pageid(sbb_sample_01): def test_agent(sbb_sample_01): beforelen = len(sbb_sample_01.agents) - sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL') + sbb_sample_01.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='YETOTHERSTILL') assert len(sbb_sample_01.agents) == beforelen + 1 def test_metshdr(): diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 1ae007ae52..02cb72d342 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -745,7 +745,7 @@ def _fixture_metsDocumentID(tmp_path): def test_agent_before_metsDocumentID(workspace_metsDocumentID): report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target) assert report.is_valid - workspace_metsDocumentID.mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'OTHER') + workspace_metsDocumentID.mets.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='OTHER') workspace_metsDocumentID.save_mets() report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target) print(report.errors) From cadc6e6e65f659346af50f0f7ade642af94579fe Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:37:48 +0200 Subject: [PATCH 132/228] remove misplaced kwargs from run_processor 
--- src/ocrd/decorators/__init__.py | 2 +- src/ocrd/processor/helpers.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index fcc70a71e8..b0b1cad04c 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -109,7 +109,7 @@ def resolve(name): kwargs['parameter'] = dict() # Merge parameter overrides and parameters if 'parameter_override' in kwargs: - set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override']) + set_json_key_value_overrides(kwargs['parameter'], *kwargs.pop('parameter_override')) # Assert -I / -O if not kwargs['input_file_grp']: raise ValueError('-I/--input-file-grp is required') diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index e0dd502724..2950af3e4e 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -39,10 +39,7 @@ def run_processor( log_level=None, input_file_grp=None, output_file_grp=None, - show_resource=None, - list_resources=False, parameter=None, - parameter_override=None, working_dir=None, mets_server_url=None, instance_caching=False From 7966057f975a76db39bf10b1c38540860b7c179c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:48:54 +0200 Subject: [PATCH 133/228] =?UTF-8?q?Processor.metadata:=20refactor=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `metadata`, `executable`, `ocrd_tool`, `version`: use cached_property instead of internal ad-hoc attributes - rename `metadata_location` → `metadata_filename`, add cached_property chain `metadata_location`, `metadata_rawdict` used by `metadata` to make it easy to override - `metadata_filename` if just the path of `ocrd-tool.json` in the package deviates - `metadata_location` if the `ocrd-tool.json` is not distributed via Python pkg - `metadata_rawdict` if the `ocrd-tool.json` is not in a file - `metadata` if the validated, 
expanded `ocrd-tool.json` is somewhere else - `DummyProcessor`: just override `Processor.metadata_filename` - processor tests: adapt to new properties and `verify` enforcing cardinality --- src/ocrd/processor/base.py | 123 +++++++++++++----- src/ocrd/processor/builtin/dummy_processor.py | 10 +- tests/data/__init__.py | 12 +- tests/data/ocrd-cp.ocrd-tool.json | 7 +- .../test_integration_4_processing_worker.py | 5 +- 5 files changed, 107 insertions(+), 50 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 4f1e86b962..5329ea6706 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -9,6 +9,7 @@ 'run_processor' ] +from functools import cached_property from os.path import exists, join from shutil import copyfileobj import json @@ -35,13 +36,12 @@ MIME_TO_EXT, config, getLogger, - initLogging, list_resource_candidates, pushd_popd, list_all_resources, get_processor_resource_types, resource_filename, - resource_string, + parse_json_file_with_comments, make_file_id, deprecation_warning ) @@ -96,12 +96,14 @@ def __init__(self, fileGrp, pageId, mimetype): class Processor(): """ - A processor is a tool that implements the uniform OCR-D command-line interface - for run-time data processing. That is, it executes a single workflow step, - or a combination of workflow steps, on the workspace (represented by local METS). - It reads input files for all or requested physical pages of the input fileGrp(s), - and writes output files for them into the output fileGrp(s). It may take - a number of optional or mandatory parameters. + A processor is a tool that implements the uniform OCR-D + `command-line interface for run-time data processing `_. + + That is, it executes a single workflow step, or a combination of workflow steps, + on the workspace (represented by local METS). 
It reads input files for all or selected + physical pages of the input fileGrp(s), computes additional annotation, and writes output + files for them into the output fileGrp(s). It may take a number of optional or mandatory + parameters. """ max_instances : int = -1 @@ -113,47 +115,96 @@ class Processor(): """ @property - def metadata_location(self) -> str: + def metadata_filename(self) -> str: """ - Location of `ocrd-tool.json` inside the package. By default we expect it in the root of the module + Relative location of the ``ocrd-tool.json`` file inside the package. + + Used by :py:data:`metadata_location`. + + (Override if ``ocrd-tool.json`` is not in the root of the module, + e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``). """ return 'ocrd-tool.json' - @property + @cached_property + def metadata_location(self) -> str: + """ + Absolute path of the ``ocrd-tool.json`` file as distributed with the package. + + Used by :py:data:`metadata_rawdict`. + + (Override if ``ocrd-tool.json`` is not distributed with the Python package.) + """ + return resource_filename(__package__.split('.')[0], self.metadata_filename) + + @cached_property + def metadata_rawdict(self) -> dict: + """ + Raw (unvalidated, unexpanded) ``ocrd-tool.json`` dict contents of the package. + + Used by :py:data:`metadata`. + + (Override if ``ocrd-tool.json`` is not in a file.) + """ + return parse_json_file_with_comments(self.metadata_location) + + @cached_property def metadata(self) -> dict: - """the ocrd-tool.json dict of the package""" - if hasattr(self, '_metadata'): - return self._metadata - self._metadata = json.loads(resource_string(self.__module__.split('.')[0], self.metadata_location)) - report = OcrdToolValidator.validate(self._metadata) + """ + The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D + `spec `_ for processor tools. + + After deserialisation, it also gets validated against the + `schema `_ with all defaults + expanded. 
+ + Used by :py:data:`ocrd_tool` and :py:data:`version`. + + (Override if you want to provide metadata programmatically instead of a + JSON file.) + """ + metadata = self.metadata_rawdict + report = OcrdToolValidator.validate(metadata) if not report.is_valid: self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n" - f"{report.to_xml()}.\nPlease open an issue at {self._metadata['git_url']}.") - return self._metadata + f"{report.to_xml()}.\nPlease open an issue at {metadata.get('git_url', 'the website')}.") + return metadata - @property + @cached_property def version(self) -> str: - """the version of the package""" - if hasattr(self, '_version'): - return self._version - self._version = self.metadata['version'] - return self._version + """ + The program version of the package. + Usually the ``version`` part of :py:data:`metadata`. - @property + (Override if you do not want to use :py:data:`metadata` lookup + mechanism.) + """ + return self.metadata['version'] + + @cached_property def executable(self) -> str: - """the executable name of this processor tool""" - if hasattr(self, '_executable'): - return self._executable - self._executable = os.path.basename(inspect.stack()[-1].filename) - return self._executable + """ + The executable name of this processor tool. Taken from the runtime + filename. - @property + Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`. + + (Override if your entry-point name deviates from the ``executable`` + name, or the processor gets instantiated from another runtime.) + """ + return os.path.basename(inspect.stack()[-1].filename) + + @cached_property def ocrd_tool(self) -> dict: - """the ocrd-tool.json dict of this processor tool""" - if hasattr(self, '_ocrd_tool'): - return self._ocrd_tool - self._ocrd_tool = self.metadata['tools'][self.executable] - return self._ocrd_tool + """ + The ``ocrd-tool.json`` dict contents of this processor tool. 
+ Usually the :py:data:`executable` key of the ``tools`` part + of :py:data:`metadata`. + + (Override if you do not want to use :py:data:`metadata` lookup + mechanism.) + """ + return self.metadata['tools'][self.executable] @property def parameter(self) -> Optional[dict]: diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 7b2f1b66ee..9bba9bee85 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -20,8 +20,6 @@ ) from ocrd_modelfactory import page_from_file -OCRD_TOOL = parse_json_string_with_comments(resource_string(__package__ + '.dummy', 'ocrd-tool.json')) - class DummyProcessor(Processor): """ Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group @@ -76,17 +74,13 @@ def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: super().process_page_file(input_file) @property - def metadata(self): - return OCRD_TOOL + def metadata_filename(self): + return 'processor/builtin/dummy/ocrd-tool.json' @property def executable(self): return 'ocrd-dummy' - @property - def version(self): - return '0.0.3' - @click.command() @ocrd_cli_options def cli(*args, **kwargs): diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 53fa227d01..c706546c57 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -9,6 +9,10 @@ 'executable': 'ocrd-test', 'description': 'dolor sit', 'steps': ['recognition/post-correction'], + # as we bypass Processor.metadata with OcrdToolValidator + # we get no default expansion, so add default cardinalities here + 'input_file_grp_cardinality': 1, + 'output_file_grp_cardinality': 1, 'parameters': { 'baz': { 'type': 'string', @@ -133,7 +137,11 @@ def process_page_file(self, input_file): class IncompleteProcessor(Processor): @property - def ocrd_tool(self): - return {} + def executable(self): + return 'ocrd-foo' + + @property + def metadata_rawdict(self): + 
return {'tools': {self.executable: {}}} diff --git a/tests/data/ocrd-cp.ocrd-tool.json b/tests/data/ocrd-cp.ocrd-tool.json index 728c144c50..948695c06d 100755 --- a/tests/data/ocrd-cp.ocrd-tool.json +++ b/tests/data/ocrd-cp.ocrd-tool.json @@ -1,15 +1,18 @@ { - "version": "1.0", + "version": "1.0.0", "tools": { "ocrd-cp": { "executable": "ocrd-cp", "description": "dummy processor copying", "steps": ["preprocessing/optimization"], "categories": ["Image preprocessing"], + # we allow 1 or 2 input file grps + # the output cardinality gets expanded from default + "input_file_grp_cardinality": [1,2], "parameters": { "message": { "type": "string", - "default": "", + "default": "hello by default", "description": "message to print on stdout" } } diff --git a/tests/network/test_integration_4_processing_worker.py b/tests/network/test_integration_4_processing_worker.py index e211bd2381..ae322b0978 100644 --- a/tests/network/test_integration_4_processing_worker.py +++ b/tests/network/test_integration_4_processing_worker.py @@ -1,6 +1,6 @@ from pathlib import Path from pika import BasicProperties -from src.ocrd.processor.builtin.dummy_processor import DummyProcessor, OCRD_TOOL +from src.ocrd.processor.builtin.dummy_processor import DummyProcessor from src.ocrd_network.constants import JobState from src.ocrd_network.database import sync_db_create_workspace, sync_db_create_processing_job from src.ocrd_network.logging_utils import get_processing_job_logging_file_path @@ -25,12 +25,13 @@ def test_processing_worker_process_message(): # wrong reads from the deployed dummy worker (part of the processing server integration test) processor_name = "ocrd-dummy-test" result_queue_name = f"{processor_name}-result" + ocrd_tool = DummyProcessor(None).metadata processing_worker = ProcessingWorker( rabbitmq_addr=test_config.RABBITMQ_URL, mongodb_addr=test_config.DB_URL, processor_name=processor_name, - ocrd_tool=OCRD_TOOL, + ocrd_tool=ocrd_tool, processor_class=DummyProcessor ) 
processing_worker.connect_publisher(enable_acks=True) From bba142d3520b34e13cc200949a6222e2368ff4ea Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 13:03:51 +0200 Subject: [PATCH 134/228] bashlib input-files: adapt, allow passing ocrd-tool.json path and executable name --- src/ocrd/cli/bashlib.py | 39 +++++++++++++++++++++++++++++++-------- src/ocrd/lib.bash | 2 ++ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index 26139cb48f..6934744c85 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -82,6 +82,8 @@ def bashlib_constants(name): print(val) @bashlib_cli.command('input-files') +@click.option('--ocrd-tool', help="path to ocrd-tool.json of processor to feed", default=None) +@click.option('--executable', help="name of processor executable in ocrd-tool.json", default=None) @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME) @click.option('-w', '--working-dir', help="Working Directory") @click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None) @@ -96,7 +98,7 @@ def bashlib_constants(name): @parameter_option @parameter_override_option @ocrd_loglevel -def bashlib_input_files(**kwargs): +def bashlib_input_files(ocrd_tool, executable, **kwargs): """ List input files for processing @@ -108,12 +110,6 @@ def bashlib_input_files(**kwargs): (The printing format is one associative array initializer per line.) 
""" class BashlibProcessor(Processor): - @property - def ocrd_tool(self): - return {'executable': '', 'steps': ['']} - @property - def version(self): - return '1.0' # go half way of the normal run_processor / process_workspace call tree # by just delegating to process_workspace, overriding process_page_file # to ensure all input files exist locally (without persisting them in the METS) @@ -129,4 +125,31 @@ def process_page_file(self, *input_files): print(f"[{field}]='{value}'", end=' ') output_file_id = make_file_id(input_files[0], kwargs['output_file_grp']) print(f"[outputFileId]='{output_file_id}'") - ocrd_cli_wrap_processor(BashlibProcessor, **kwargs) + if ocrd_tool and executable: + class FullBashlibProcessor(BashlibProcessor): + @property + def metadata_location(self): + # needed for metadata loading and validation mechanism + return ocrd_tool + @property + def executable(self): + # needed for ocrd_tool lookup + return executable + else: + # we have no true metadata file, so fill in just to make it work + class FullBashlibProcessor(BashlibProcessor): + @property + def ocrd_tool(self): + # needed to satisfy the validator + return {'executable': '', + # required now + 'input_file_grp_cardinality': 1, + 'output_file_grp_cardinality': 1, + 'steps': [''] + } + @property + def version(self): + # needed to satisfy the validator and wrapper + return '1.0' + + ocrd_cli_wrap_processor(FullBashlibProcessor, **kwargs) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 82fa2005dc..6b08f669d1 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -299,6 +299,8 @@ ocrd__wrap () { eval "ocrd__files[$i]=ocrd__file$i" let ++i done < <(ocrd bashlib input-files \ + --ocrd-tool $OCRD_TOOL_JSON \ + --executable $OCRD_TOOL_NAME \ -m "${ocrd__argv[mets_file]}" \ -I "${ocrd__argv[input_file_grp]}" \ -O "${ocrd__argv[output_file_grp]}" \ From 32cdc5a7aa35d0c54d2120d47249201e90912f61 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 13:04:40 +0200 Subject: 
[PATCH 135/228] add to pylint karma --- src/ocrd/cli/__init__.py | 56 +++++---- src/ocrd/cli/bashlib.py | 11 +- src/ocrd/cli/ocrd_tool.py | 10 +- src/ocrd/cli/workspace.py | 9 +- src/ocrd/decorators/__init__.py | 5 +- src/ocrd/mets_server.py | 3 +- src/ocrd/processor/base.py | 8 +- src/ocrd/processor/builtin/dummy_processor.py | 5 +- src/ocrd/processor/helpers.py | 5 +- src/ocrd/resolver.py | 3 - src/ocrd/resource_manager.py | 8 +- src/ocrd/workspace.py | 9 +- src/ocrd/workspace_backup.py | 2 +- src/ocrd_modelfactory/__init__.py | 2 +- src/ocrd_models/constants.py | 1 - src/ocrd_models/ocrd_exif.py | 4 +- src/ocrd_models/ocrd_file.py | 4 +- src/ocrd_models/ocrd_mets.py | 40 +++---- src/ocrd_models/ocrd_page.py | 1 - src/ocrd_models/ocrd_xml_base.py | 4 +- src/ocrd_utils/config.py | 8 +- src/ocrd_utils/logging.py | 36 +++--- src/ocrd_utils/os.py | 9 +- src/ocrd_utils/str.py | 5 +- src/ocrd_validators/json_validator.py | 4 +- src/ocrd_validators/ocrd_tool_validator.py | 4 +- src/ocrd_validators/page_validator.py | 112 +++++++++--------- src/ocrd_validators/parameter_validator.py | 4 +- .../resource_list_validator.py | 7 +- src/ocrd_validators/workspace_validator.py | 42 +++---- src/ocrd_validators/xsd_validator.py | 2 +- tests/validator/test_workspace_validator.py | 2 +- 32 files changed, 202 insertions(+), 223 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 3722e3c21e..6a752f2e3d 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -10,6 +10,36 @@ from ocrd_utils import config +# pylint: disable=wrong-import-position + +def command_with_replaced_help(*replacements): + + class CommandWithReplacedHelp(click.Command): + def get_help(self, ctx): + newhelp = super().get_help(ctx) + for replacement in replacements: + newhelp = re.sub(*replacement, newhelp) + # print(newhelp) + return newhelp + + return CommandWithReplacedHelp + +# pylint: enable=wrong-import-position + +from ..decorators import ocrd_loglevel +from 
.ocrd_tool import ocrd_tool_cli +from .workspace import workspace_cli +from .process import process_cli +from .bashlib import bashlib_cli +from .validate import validate_cli +from .resmgr import resmgr_cli +from .zip import zip_cli +from .log import log_cli +from .network import network_cli + + +__all__ = ['cli'] + _epilog = f""" \b @@ -60,30 +90,6 @@ {config.describe('OCRD_LOGGING_DEBUG')} """ -def command_with_replaced_help(*replacements): - - class CommandWithReplacedHelp(click.Command): - def get_help(self, ctx): - help = super().get_help(ctx) - for replacement in replacements: - help = re.sub(*replacement, help) - # print(help) - return help - - return CommandWithReplacedHelp - - -from ..decorators import ocrd_loglevel -from .ocrd_tool import ocrd_tool_cli -from .workspace import workspace_cli -from .process import process_cli -from .bashlib import bashlib_cli -from .validate import validate_cli -from .resmgr import resmgr_cli -from .zip import zip_cli -from .log import log_cli -from .network import network_cli - @click.group(epilog=_epilog) @click.version_option(package_name='ocrd') @ocrd_loglevel @@ -101,5 +107,3 @@ def cli(**kwargs): # pylint: disable=unused-argument cli.add_command(log_cli) cli.add_command(resmgr_cli) cli.add_command(network_cli) - -__all__ = ['cli'] diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index 6934744c85..d46c81ee46 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -8,7 +8,6 @@ """ from __future__ import print_function import sys -from os.path import isfile import click from ocrd.constants import BASHLIB_FILENAME @@ -23,15 +22,7 @@ ocrd_loglevel, ocrd_cli_wrap_processor ) -from ocrd_utils import ( - is_local_filename, - get_local_filename, - initLogging, - getLogger, - make_file_id, - config -) -from ocrd.resolver import Resolver +from ocrd_utils import make_file_id from ocrd.processor import Processor # ---------------------------------------------------------------------- diff --git 
a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index 929fe47cca..f63a7235a5 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -17,7 +17,6 @@ from ocrd.processor import Processor from ocrd_utils import ( set_json_key_value_overrides, - VERSION as OCRD_VERSION, parse_json_string_or_file, parse_json_string_with_comments as loads ) @@ -30,22 +29,23 @@ def __init__(self, filename): with codecs.open(filename, encoding='utf-8') as f: self.content = f.read() self.json = loads(self.content) + self.tool_name = '' class BashProcessor(Processor): @property - def metadata(inner_self): + def metadata(inner_self): # pylint: disable=no-self-argument,arguments-renamed return self.json @property - def executable(inner_self): + def executable(inner_self): # pylint: disable=no-self-argument,arguments-renamed return self.tool_name @property - def moduledir(inner_self): + def moduledir(inner_self): # pylint: disable=no-self-argument,arguments-renamed return os.path.dirname(self.filename) # set docstrings to empty __doc__ = None # HACK: override the module-level docstring, too getmodule(OcrdToolCtx).__doc__ = None - def process(inner_self): + def process(inner_self): # pylint: disable=no-self-argument,arguments-renamed return super() self.processor = BashProcessor diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 1461e53e07..3aece34933 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -6,7 +6,7 @@ :nested: full """ import os -from os import getcwd, rmdir, unlink +from os import rmdir, unlink from os.path import dirname, relpath, normpath, exists, join, isabs, isdir from pathlib import Path from json import loads, dumps @@ -14,7 +14,6 @@ from glob import glob # XXX pathlib.Path.glob does not support absolute globs import re import time -import numpy as np import click @@ -455,7 +454,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl snake_to_camel = {"file_id": "ID", 
"page_id": "pageId", "file_grp": "fileGrp"} output_field = [snake_to_camel.get(x, x) for x in output_field] modified_mets = False - ret = list() + ret = [] workspace = Workspace( ctx.resolver, directory=ctx.directory, @@ -751,7 +750,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin @workspace_cli.command('update-page') @click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True) -@click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER') +@click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER') @click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL') @click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL') @click.argument('PAGE_ID') @@ -760,7 +759,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): """ Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID """ - update_kwargs = {k: v for k, v in attr_value_pairs} + update_kwargs = dict(attr_value_pairs) if order: update_kwargs['ORDER'] = order if orderlabel: diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index b0b1cad04c..f52a13575b 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -106,7 +106,7 @@ def resolve(name): kwargs['parameter'] = parse_json_string_or_file(*kwargs['parameter'], resolve_preset_file=resolve) else: - kwargs['parameter'] = dict() + kwargs['parameter'] = {} # Merge parameter overrides and parameters if 'parameter_override' in kwargs: set_json_key_value_overrides(kwargs['parameter'], *kwargs.pop('parameter_override')) @@ -146,8 +146,7 @@ def goexit(): pr.disable() print("Profiling completed") if profile_file: - with open(profile_file, 'wb') as f: - pr.dump_stats(profile_file) + 
pr.dump_stats(profile_file) s = io.StringIO() pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() print(s.getvalue()) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 7c22da278d..81f9e15d0a 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -21,7 +21,7 @@ import uvicorn from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdAgent, ClientSideOcrdAgent -from ocrd_utils import getLogger, deprecated_alias +from ocrd_utils import getLogger # @@ -403,7 +403,6 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int @staticmethod def kill_process(mets_server_pid: int): subprocess_run(args=["kill", "-s", "SIGINT", f"{mets_server_pid}"], shell=False, universal_newlines=True) - return def shutdown(self): if self.is_uds: diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5329ea6706..dacf9b0729 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -22,7 +22,6 @@ import tarfile import io import weakref -from warnings import warn from frozendict import frozendict from deprecated import deprecated from requests import HTTPError @@ -350,14 +349,12 @@ def dump_json(self): Print :py:attr:`ocrd_tool` on stdout. """ print(json.dumps(self.ocrd_tool, indent=True)) - return def dump_module_dir(self): """ Print :py:attr:`moduledir` on stdout. """ print(self.moduledir) - return def list_resources(self): """ @@ -365,7 +362,6 @@ def list_resources(self): """ for res in self.list_all_resources(): print(res) - return def setup(self) -> None: """ @@ -756,7 +752,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): # can actually be much more costly than traversing the ltree. # This might depend on the number of pages vs number of fileGrps. 
- pages = dict() + pages = {} for i, ifg in enumerate(ifgs): files_ = sorted(self.workspace.mets.find_all_files( pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), @@ -811,7 +807,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): if self.page_id and not any(pages): self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n" f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") - ifts = list() + ifts = [] for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 9bba9bee85..a5f217a155 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -1,6 +1,6 @@ # pylint: disable=missing-module-docstring,invalid-name -from os.path import join, basename -from typing import Optional, Union +from os.path import join +from typing import Optional import click @@ -10,7 +10,6 @@ from ocrd_models.ocrd_file import OcrdFileType from ocrd_models.ocrd_page import OcrdPage, to_xml from ocrd_utils import ( - getLogger, make_file_id, MIME_TO_EXT, MIMETYPE_PAGE, diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 2950af3e4e..6483790bd6 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -1,7 +1,6 @@ """ Helper methods for running and documenting processors """ -from os import chdir, getcwd from time import perf_counter, process_time from functools import lru_cache import json @@ -99,7 +98,7 @@ def run_processor( t0_cpu = process_time() if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']): backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' - from memory_profiler import memory_usage + from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel try: mem_usage = memory_usage(proc=(processor.process_workspace, 
[workspace], {}), # only run process once @@ -209,7 +208,7 @@ def run_cli( if not log_filename: result = run(args, check=False) else: - with open(log_filename, 'a') as file_desc: + with open(log_filename, 'a', encoding='utf-8') as file_desc: result = run(args, check=False, stdout=file_desc, stderr=file_desc) return result.returncode diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py index 124d006927..7ed58d4d4d 100644 --- a/src/ocrd/resolver.py +++ b/src/ocrd/resolver.py @@ -18,7 +18,6 @@ ) from ocrd.workspace import Workspace from ocrd_models import OcrdMets -from ocrd_models.constants import NAMESPACES as NS from ocrd_models.utils import handle_oai_response class Resolver(): @@ -310,5 +309,3 @@ def resolve_mets_arguments(self, directory, mets_url, mets_basename=DEFAULT_METS raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory)) return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url - - diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index e63c5fd015..da1ee48331 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -1,6 +1,6 @@ from pathlib import Path from os.path import join -from os import environ, listdir, makedirs, getcwd, path, unlink +from os import environ, listdir, getcwd, unlink from shutil import copytree, rmtree, copy from fnmatch import filter as apply_glob from datetime import datetime @@ -16,11 +16,11 @@ # https://github.com/OCR-D/core/issues/867 # https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml import yaml.constructor -yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \ - yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str'] +yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \ + 
yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str'] from ocrd_validators import OcrdResourceListValidator -from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config +from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index bd3380652f..27c56f048d 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1,7 +1,7 @@ import io from os import makedirs, unlink, listdir, path from pathlib import Path -from shutil import move, copyfileobj +from shutil import copyfileobj from re import sub from tempfile import NamedTemporaryFile from contextlib import contextmanager @@ -43,7 +43,6 @@ MIME_TO_PIL, MIMETYPE_PAGE, REGEX_PREFIX, - config ) from .workspace_backup import WorkspaceBackupManager @@ -111,7 +110,7 @@ def __init__( def __repr__(self): return 'Workspace[remote=%s, directory=%s, baseurl=%s, file_groups=%s, files=%s]' % ( - not not self.is_remote, + self.is_remote, self.directory, self.baseurl, self.mets.file_groups, @@ -648,7 +647,7 @@ def image_from_page(self, page, page_id, log = getLogger('ocrd.workspace.image_from_page') page_image_info = self.resolve_image_exif(page.imageFilename) page_image = self._resolve_image_as_pil(page.imageFilename) - page_coords = dict() + page_coords = {} # use identity as initial affine coordinate transform: page_coords['transform'] = np.eye(3) # interim bbox (updated with each change to the transform): @@ -1091,7 +1090,7 @@ def save_image_file(self, image : Image.Image, The (absolute) path of the created file. 
""" log = getLogger('ocrd.workspace.save_image_file') - saveargs = dict() + saveargs = {} if 'dpi' in image.info: saveargs['dpi'] = image.info['dpi'] image_bytes = io.BytesIO() diff --git a/src/ocrd/workspace_backup.py b/src/ocrd/workspace_backup.py index 6cc3f1530d..87ee884bd1 100644 --- a/src/ocrd/workspace_backup.py +++ b/src/ocrd/workspace_backup.py @@ -1,6 +1,6 @@ from datetime import datetime from os import makedirs -from os.path import join, basename, getsize, abspath +from os.path import join, basename, getsize from glob import glob from shutil import copy import hashlib diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index c0600e51f8..828949fe96 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -72,7 +72,7 @@ def page_from_image(input_file : Union[OcrdFile, ClientSideOcrdFile], **kwargs) ), pcGtsId=input_file.ID ) - mapping = dict() + mapping = {} etree : ET._Element = pcgts.to_etree(mapping_=mapping) revmap = dict(((node, element) for element, node in mapping.items())) return OcrdPage(pcgts, etree, mapping, revmap) diff --git a/src/ocrd_models/constants.py b/src/ocrd_models/constants.py index db6e51e3a2..a67bfecc13 100644 --- a/src/ocrd_models/constants.py +++ b/src/ocrd_models/constants.py @@ -44,7 +44,6 @@ 'ocrd': 'https://ocr-d.de', } -# pylint: disable=bad-whitespace TAG_METS_AGENT = '{%s}agent' % NAMESPACES['mets'] TAG_METS_DIV = '{%s}div' % NAMESPACES['mets'] TAG_METS_FILE = '{%s}file' % NAMESPACES['mets'] diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index 82b8b7e1c3..ab050bae59 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -102,7 +102,7 @@ def to_xml(self): Serialize all properties as XML string. 
""" ret = '' - for k in self.__dict__: - ret += '<%s>%s' % (k, self.__dict__[k], k) + for k, v in self.__dict__.items(): + ret += f'<{k}>{v}' ret += '' return ret diff --git a/src/ocrd_models/ocrd_file.py b/src/ocrd_models/ocrd_file.py index a116341710..91eac8d8e3 100644 --- a/src/ocrd_models/ocrd_file.py +++ b/src/ocrd_models/ocrd_file.py @@ -230,12 +230,12 @@ class ClientSideOcrdFile: def __init__( self, - el, + el, # pylint: disable=unused-argument mimetype: str = '', pageId: str = '', loctype: str ='OTHER', local_filename: Optional[str] = None, - mets : Any = None, + mets : Any = None, # pylint: disable=unused-argument url: str = '', ID: str = '', fileGrp: str = '' diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 90d37b37dc..c3fb11f600 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -75,7 +75,7 @@ def empty_mets(now : Optional[str] = None, cache_flag : bool = False): def __init__(self, **kwargs) -> None: """ """ - super(OcrdMets, self).__init__(**kwargs) + super().__init__(**kwargs) # XXX If the environment variable OCRD_METS_CACHING is set to "true", # then enable caching, if "false", disable caching, overriding the @@ -488,11 +488,12 @@ def add_file(self, fileGrp : str, mimetype : Optional[str] = None, url : Optiona f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate") # To get rid of Python's FutureWarning - checking if v is not None - kwargs = {k: v for k, v in locals().items() if - k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} + kwargs = {k: v for k, v in locals().items() + if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} # This separation is needed to reuse the same el_mets_file element in the caching if block el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) # The caching of the physical page is done in the OcrdFile constructor + # (which calls us back with set_physical_page_for_file) mets_file 
= OcrdFile(el_mets_file, mets=self, **kwargs) if self._cache_flag: @@ -542,9 +543,9 @@ def remove_one_file(self, ID : Union[str, OcrdFile], fileGrp : str = None) -> Oc # Delete the physical page ref fptrs = [] if self._cache_flag: - for page in self._fptr_cache.keys(): - if ID in self._fptr_cache[page]: - fptrs.append(self._fptr_cache[page][ID]) + for pageId, fptrdict in self._fptr_cache.items(): + if ID in fptrdict: + fptrs.append(fptrdict[ID]) else: fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS) @@ -700,8 +701,8 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright ret = [None] * len(for_fileIds) if self._cache_flag: - for pageId in self._fptr_cache.keys(): - for fptr in self._fptr_cache[pageId].keys(): + for pageId, fptrdict in self._fptr_cache.items(): + for fptr in fptrdict: if fptr in for_fileIds: index = for_fileIds.index(fptr) if return_divs: @@ -737,10 +738,10 @@ def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile, # delete any existing page mapping for this file.ID fptrs = [] if self._cache_flag: - for page_id in self._fptr_cache.keys(): - if ocrd_file.ID in self._fptr_cache[page_id].keys(): - if self._fptr_cache[page_id][ocrd_file.ID] is not None: - fptrs.append(self._fptr_cache[page_id][ocrd_file.ID]) + for page, fptrdict in self._fptr_cache.items(): + if ocrd_file.ID in fptrdict: + if fptrdict[ocrd_file.ID] is not None: + fptrs.append(fptrdict[ocrd_file.ID]) else: fptrs = self._tree.getroot().findall( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % @@ -791,7 +792,7 @@ def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile, self._fptr_cache[pageId].update({ocrd_file.ID: el_fptr}) def update_physical_page_attributes(self, page_id : str, **kwargs) -> None: - invalid_keys = list(k for k in 
kwargs.keys() if k not in METS_PAGE_DIV_ATTRIBUTE.names()) + invalid_keys = list(k for k in kwargs if k not in METS_PAGE_DIV_ATTRIBUTE.names()) if invalid_keys: raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTE.names()}") @@ -812,8 +813,8 @@ def get_physical_page_for_file(self, ocrd_file : OcrdFile) -> Optional[str]: corresponding to the ``mets:file`` :py:attr:`ocrd_file`. """ if self._cache_flag: - for pageId in self._fptr_cache.keys(): - if ocrd_file.ID in self._fptr_cache[pageId].keys(): + for pageId, fptrdict in self._fptr_cache.items(): + if ocrd_file.ID in fptrdict: return pageId else: ret = self._tree.getroot().find( @@ -828,7 +829,7 @@ def remove_physical_page(self, ID : str) -> None: """ mets_div = None if self._cache_flag: - if ID in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys(): + if ID in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID]: mets_div = [self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][ID]] else: mets_div = self._tree.getroot().xpath( @@ -857,9 +858,9 @@ def remove_physical_page_fptr(self, fileId : str) -> List[str]: # If that's the case then we do not need to iterate 2 loops, just one. 
mets_fptrs = [] if self._cache_flag: - for page_id in self._fptr_cache.keys(): - if fileId in self._fptr_cache[page_id].keys(): - mets_fptrs.append(self._fptr_cache[page_id][fileId]) + for pageId, fptrdict in self._fptr_cache.items(): + if fileId in fptrdict: + mets_fptrs.append(fptrdict[fileId]) else: mets_fptrs = self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, @@ -919,4 +920,3 @@ def merge(self, other_mets, force : bool = False, # FIXME: merge structMap logical and structLink as well if after_add_cb: after_add_cb(f_dest) - diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index 6accb9241f..3f0cc690fa 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -3,7 +3,6 @@ """ from io import StringIO from typing import Dict, Union -from inspect import getmembers from lxml import etree as ET __all__ = [ diff --git a/src/ocrd_models/ocrd_xml_base.py b/src/ocrd_models/ocrd_xml_base.py index 8579a5b407..ea4798c5b9 100644 --- a/src/ocrd_models/ocrd_xml_base.py +++ b/src/ocrd_models/ocrd_xml_base.py @@ -8,8 +8,8 @@ from .utils import xmllint_format -for curie in NAMESPACES: - ET.register_namespace(curie, NAMESPACES[curie]) +for curie, url in NAMESPACES.items(): + ET.register_namespace(curie, url) class OcrdXmlDocument(): """ diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 851fb42a8c..f33af0264f 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -12,8 +12,12 @@ from tempfile import gettempdir from textwrap import fill, indent -_validator_boolean = lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1') -_parser_boolean = lambda val: bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1') + +def _validator_boolean(val): + return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1') + +def _parser_boolean(val): + return 
bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1') class OcrdEnvVariable(): diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index bb771fc0ce..5cea55e5b1 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -5,9 +5,9 @@ Logging can be overridden either programmatically in code using the library or by creating one or more of -- /etc/ocrd_logging.py -- $HOME/ocrd_logging.py -- $PWD/ocrd_logging.py +- ``/etc/ocrd_logging.py`` +- ``$HOME/ocrd_logging.py`` +- ``$PWD/ocrd_logging.py`` These files will be executed in the context of ocrd/ocrd_logging.py, with `logging` global set. @@ -16,20 +16,18 @@ - Try to be less intrusive with OCR-D specific logging conventions to make it easier and less surprising to define logging behavior when using OCR-D/core as a library - - Change setOverrideLogLevel to only override the log level of the ``ocrd`` + - Change :py:meth:`setOverrideLogLevel` to only override the log level of the ``ocrd`` logger and its descendants - - initLogging will set exactly one handler, for the root logger or for the + - :py:meth:`initLogging` will set exactly one handler, for the root logger or for the ``ocrd`` logger. 
- Child loggers should propagate to the ancestor logging (default - behavior of the logging library - no more PropagationShyLogger) - - disableLogging only removes any handlers from the ``ocrd`` logger + behavior of the logging library - no more ``PropagationShyLogger``) + - :py:meth:`disableLogging` only removes any handlers from the ``ocrd`` logger """ # pylint: disable=no-member from __future__ import absolute_import -from traceback import format_stack - import logging import logging.config from pathlib import Path @@ -81,10 +79,10 @@ def tf_disable_interactive_logs(): try: - from os import environ + from os import environ # pylint: disable=import-outside-toplevel # This env variable must be set before importing from Keras environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - from tensorflow.keras.utils import disable_interactive_logging + from tensorflow.keras.utils import disable_interactive_logging # pylint: disable=import-outside-toplevel # Enabled interactive logging throws an exception # due to a call of sys.stdout.flush() disable_interactive_logging() @@ -143,21 +141,21 @@ def get_logging_config_files(): def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_LOGGING_DEBUG): """ - Reset ``ocrd`` logger, read logging configuration if exists, otherwise use basicConfig + Reset ``ocrd`` logger, read logging configuration if exists, otherwise use :py:meth:`logging.basicConfig` - initLogging is to be called by OCR-D/core once, i.e. + This is to be called by OCR-D/core only once, i.e. - for the ``ocrd`` CLI - for the processor wrapper methods Other processes that use OCR-D/core as a library can, but do not have to, use this functionality. Keyword Args: - - builtin_only (bool, False): Whether to search for logging configuration - on-disk (``False``) or only use the - hard-coded config (``True``). For testing - - force_reinit (bool, False): Whether to ignore the module-level - ``_initialized_flag``. For testing only. 
- - silent (bool, True): Whether to log logging behavior by printing to stderr + - builtin_only (bool): Whether to search for logging configuration + on-disk (``False``) or only use the hard-coded config (``True``). + For testing + - force_reinit (bool): Whether to ignore the module-level ``_initialized_flag``. + For testing only + - silent (bool): Whether to log logging behavior by printing to stderr """ global _initialized_flag if _initialized_flag and not force_reinit: diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py index 18463de0c0..70721acbe3 100644 --- a/src/ocrd_utils/os.py +++ b/src/ocrd_utils/os.py @@ -71,9 +71,8 @@ def unzip_file_to_dir(path_to_zip, output_directory): """ Extract a ZIP archive to a directory """ - z = ZipFile(path_to_zip, 'r') - z.extractall(output_directory) - z.close() + with ZipFile(path_to_zip, 'r') as z: + z.extractall(output_directory) @lru_cache() def get_ocrd_tool_json(executable): @@ -87,7 +86,7 @@ def get_ocrd_tool_json(executable): ocrd_tool = ocrd_all_tool[executable] except (JSONDecodeError, OSError, KeyError): try: - ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) + ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE, check=False).stdout) except (JSONDecodeError, OSError) as e: getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') if 'resource_locations' not in ocrd_tool: @@ -102,7 +101,7 @@ def get_moduledir(executable): moduledir = ocrd_all_moduledir[executable] except (JSONDecodeError, OSError, KeyError): try: - moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n') + moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE, check=False).stdout.rstrip('\n') except (JSONDecodeError, OSError) as e: getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') return moduledir diff --git a/src/ocrd_utils/str.py 
b/src/ocrd_utils/str.py index 4f1e088050..6a973fac73 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -4,9 +4,9 @@ import re import json -from typing import List, Union +from typing import List from .constants import REGEX_FILE_ID, SPARKLINE_CHARS -from .deprecate import deprecation_warning +#from .deprecate import deprecation_warning from deprecated import deprecated from warnings import warn from numpy import array_split @@ -273,4 +273,3 @@ def sparkline(values : List[int]) -> str: # normalize to 0..1 and convert to index in SPARKLINE_CHARS mapped = [int(x / max_value * max_mapping) for x in values] return ''.join(SPARKLINE_CHARS[x] for x in mapped) - diff --git a/src/ocrd_validators/json_validator.py b/src/ocrd_validators/json_validator.py index 4fb84b3fdb..f21a23afee 100644 --- a/src/ocrd_validators/json_validator.py +++ b/src/ocrd_validators/json_validator.py @@ -2,7 +2,6 @@ Validating JSON-Schema """ import json -from warnings import warn from jsonschema import Draft201909Validator, ValidationError, validators # pylint: disable=import-error @@ -28,8 +27,7 @@ def set_defaults_and_handle_deprecate(validator, properties, instance, schema): if subschema.get('deprecated', False) and instance.get(prop): yield JsonSchemaDeprecationWarning(f"Property {prop} has been deprecated, ocrd-tool.json should be updated.") - for error in validate_properties(validator, properties, instance, schema): - yield error + yield from validate_properties(validator, properties, instance, schema) return validators.extend(validator_class, {"properties": set_defaults_and_handle_deprecate}) diff --git a/src/ocrd_validators/ocrd_tool_validator.py b/src/ocrd_validators/ocrd_tool_validator.py index 827001ef72..00a402c12d 100644 --- a/src/ocrd_validators/ocrd_tool_validator.py +++ b/src/ocrd_validators/ocrd_tool_validator.py @@ -22,5 +22,5 @@ def validate(obj, schema=OCRD_TOOL_SCHEMA): """ return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access - def 
__init__(self, schema, validator_class=...): - super().__init__(schema, DefaultValidatingDraft20199Validator) + def __init__(self, schema): + super().__init__(schema, validator_class=DefaultValidatingDraft20199Validator) diff --git a/src/ocrd_validators/page_validator.py b/src/ocrd_validators/page_validator.py index d6d8a95b57..0459f17811 100644 --- a/src/ocrd_validators/page_validator.py +++ b/src/ocrd_validators/page_validator.py @@ -34,50 +34,50 @@ _HIERARCHY = [ # page can contain different types of regions - (PageType, 'get_AdvertRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_ChartRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_ChemRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_CustomRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_GraphicRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_ImageRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_LineDrawingRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_MapRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_MathsRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_MusicRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_NoiseRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_SeparatorRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_TableRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_TextRegion', None), # pylint: disable=bad-whitespace - (PageType, 'get_UnknownRegion', None), # pylint: disable=bad-whitespace + (PageType, 'get_AdvertRegion', None), + (PageType, 'get_ChartRegion', None), + (PageType, 'get_ChemRegion', None), + (PageType, 'get_CustomRegion', None), + (PageType, 'get_GraphicRegion', None), + (PageType, 'get_ImageRegion', None), + (PageType, 'get_LineDrawingRegion', None), + (PageType, 'get_MapRegion', None), + (PageType, 'get_MathsRegion', None), + (PageType, 'get_MusicRegion', 
None), + (PageType, 'get_NoiseRegion', None), + (PageType, 'get_SeparatorRegion', None), + (PageType, 'get_TableRegion', None), + (PageType, 'get_TextRegion', None), + (PageType, 'get_UnknownRegion', None), # all regions can be recursive - (RegionType, 'get_AdvertRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_ChartRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_ChemRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_CustomRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_GraphicRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_ImageRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_LineDrawingRegion', None), # pylint: disable=bad-whitespace - #(RegionType, 'get_MapRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_MathsRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_MusicRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_NoiseRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_SeparatorRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_TableRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_TextRegion', None), # pylint: disable=bad-whitespace - (RegionType, 'get_UnknownRegion', None), # pylint: disable=bad-whitespace + (RegionType, 'get_AdvertRegion', None), + (RegionType, 'get_ChartRegion', None), + (RegionType, 'get_ChemRegion', None), + (RegionType, 'get_CustomRegion', None), + (RegionType, 'get_GraphicRegion', None), + (RegionType, 'get_ImageRegion', None), + (RegionType, 'get_LineDrawingRegion', None), + #(RegionType, 'get_MapRegion', None), + (RegionType, 'get_MathsRegion', None), + (RegionType, 'get_MusicRegion', None), + (RegionType, 'get_NoiseRegion', None), + (RegionType, 'get_SeparatorRegion', None), + (RegionType, 'get_TableRegion', None), + (RegionType, 'get_TextRegion', None), + (RegionType, 'get_UnknownRegion', None), # only 
TextRegion can contain TextLine - (TextRegionType, 'get_TextLine', '\n'), # pylint: disable=bad-whitespace - (TextLineType, 'get_Word', ' '), # pylint: disable=bad-whitespace - (WordType, 'get_Glyph', ''), # pylint: disable=bad-whitespace - (GlyphType, None, None), # pylint: disable=bad-whitespace + (TextRegionType, 'get_TextLine', '\n'), + (TextLineType, 'get_Word', ' '), + (WordType, 'get_Glyph', ''), + (GlyphType, None, None), ] _ORDER = [ (None, TextLineOrderSimpleType.BOTTOMTOTOP, ReadingDirectionSimpleType.RIGHTTOLEFT), - (PageType, 'get_textLineOrder', 'get_readingDirection'), # pylint: disable=bad-whitespace - (TextRegionType, 'get_textLineOrder', 'get_readingDirection'), # pylint: disable=bad-whitespace - (TextLineType, None, 'get_readingDirection'), # pylint: disable=bad-whitespace - (WordType, None, 'get_readingDirection'), # pylint: disable=bad-whitespace + (PageType, 'get_textLineOrder', 'get_readingDirection'), + (TextRegionType, 'get_textLineOrder', 'get_readingDirection'), + (TextLineType, None, 'get_readingDirection'), + (WordType, None, 'get_readingDirection'), ] # The following parameters control how tolerant we are with respect to @@ -115,9 +115,9 @@ def __init__(self, tag, ID, file_id, actual, expected): self.file_id = file_id self.actual = actual self.expected = expected - super(ConsistencyError, self).__init__( - "INCONSISTENCY in %s ID '%s' of file '%s': text results '%s' != concatenated '%s'" % ( - tag, ID, file_id, actual, expected)) + super().__init__( + f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': " + f"text results '{actual}' != concatenated '{expected}'") class CoordinateConsistencyError(Exception): """ @@ -141,9 +141,9 @@ def __init__(self, tag, ID, file_id, outer, inner): self.file_id = file_id self.outer = outer self.inner = inner - super(CoordinateConsistencyError, self).__init__( - "INCONSISTENCY in %s ID '%s' of '%s': coords '%s' not within parent coords '%s'" % ( - tag, ID, file_id, inner, outer)) + super().__init__( 
+ f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': " + f"coords '{inner}' not within parent coords '{outer}'") class CoordinateValidityError(Exception): """ @@ -166,9 +166,8 @@ def __init__(self, tag, ID, file_id, points, reason='unknown'): self.ID = ID self.file_id = file_id self.points = points - super(CoordinateValidityError, self).__init__( - "INVALIDITY in %s ID '%s' of '%s': coords '%s' - %s" % ( - tag, ID, file_id, points, reason)) + super().__init__( + f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}") def compare_without_whitespace(a, b): """ @@ -177,13 +176,14 @@ def compare_without_whitespace(a, b): return re.sub('\\s+', '', a) == re.sub('\\s+', '', b) def page_get_reading_order(ro, rogroup): - """Add all elements from the given reading order group to the given dictionary. - + """ + Add all elements from the given reading order group to the given dictionary. + Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, and an object ``rogroup`` with additional ReadingOrder element objects, add all references to the dict, traversing the group recursively. 
""" - regionrefs = list() + regionrefs = [] if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): regionrefs = (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + @@ -241,12 +241,12 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate node_id = node.get_pcGtsId() node = node.get_Page() # has no .id if not readingOrder: - readingOrder = dict() + readingOrder = {} ro = node.get_ReadingOrder() if ro: page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup()) if not joinRelations: - joinRelations = list() + joinRelations = [] relations = node.get_Relations() # get RelationsType if relations: relations = relations.get_Relation() # get list of RelationType @@ -358,7 +358,7 @@ def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None): if not nodes: return '' if not joins: - joins = list() + joins = [] result = get_text(nodes[0], page_textequiv_strategy) for node, next_node in zip(nodes, nodes[1:]): if (node.id, next_node.id) not in joins: @@ -470,11 +470,11 @@ def validate(filename=None, ocrd_page=None, ocrd_file=None, page = parse(filename, silence=True) file_id = filename else: - raise Exception("At least one of ocrd_page, ocrd_file or filename must be set") + raise ValueError("At least one of ocrd_page, ocrd_file or filename must be set") if page_textequiv_strategy not in ('first'): - raise Exception("page_textequiv_strategy %s not implemented" % page_textequiv_strategy) + raise ValueError("page_textequiv_strategy %s not implemented" % page_textequiv_strategy) if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'): - raise Exception("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency) + raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency) report = ValidationReport() log.info("Validating input file '%s'", file_id) validate_consistency(page, page_textequiv_consistency, 
page_textequiv_strategy, check_baseline, check_coords, report, file_id) diff --git a/src/ocrd_validators/parameter_validator.py b/src/ocrd_validators/parameter_validator.py index 26364f70fc..ca2a7ed8ed 100644 --- a/src/ocrd_validators/parameter_validator.py +++ b/src/ocrd_validators/parameter_validator.py @@ -20,7 +20,7 @@ def validate(self, *args, **kwargs): # pylint: disable=arguments-differ obj (dict): schema (dict): """ - return super(ParameterValidator, self)._validate(*args, **kwargs) + return super()._validate(*args, **kwargs) def __init__(self, ocrd_tool): """ @@ -40,7 +40,7 @@ def __init__(self, ocrd_tool): if p[n]['required']: required.append(n) del(p[n]['required']) - super(ParameterValidator, self).__init__({ + super().__init__({ "type": "object", "required": required, "additionalProperties": False, diff --git a/src/ocrd_validators/resource_list_validator.py b/src/ocrd_validators/resource_list_validator.py index d1a77b59be..47f3c81a96 100644 --- a/src/ocrd_validators/resource_list_validator.py +++ b/src/ocrd_validators/resource_list_validator.py @@ -16,9 +16,10 @@ class OcrdResourceListValidator(JsonValidator): """ @staticmethod - def validate(obj, schema=RESOURCE_LIST_SCHEMA): + def validate(obj, schema=None): """ Validate against ``resource_list.schema.yml`` schema. 
""" - return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) - + if schema is None: + schema = RESOURCE_LIST_SCHEMA + return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) # pylint: disable=protected-access diff --git a/src/ocrd_validators/workspace_validator.py b/src/ocrd_validators/workspace_validator.py index d5be460997..28d45495ea 100644 --- a/src/ocrd_validators/workspace_validator.py +++ b/src/ocrd_validators/workspace_validator.py @@ -103,7 +103,7 @@ def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False, 'page_xsd'] if check not in self.skip] - self.find_kwargs = dict(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp) + self.find_kwargs = {"include_fileGrp": include_fileGrp, "exclude_fileGrp": exclude_fileGrp} self.src_dir = src_dir self.workspace = None self.mets = None @@ -139,7 +139,7 @@ def _validate(self): self._resolve_workspace() except Exception as e: # pylint: disable=broad-except self.log.warning("Failed to instantiate workspace: %s", e) - self.report.add_error("Failed to instantiate workspace: %s" % e) + self.report.add_error(f"Failed to instantiate workspace: {e}") return self.report with pushd_popd(self.workspace.directory): try: @@ -158,7 +158,7 @@ def _validate(self): if self.page_checks: self._validate_page() except Exception: # pylint: disable=broad-except - self.report.add_error("Validation aborted with exception: %s" % format_exc()) + self.report.add_error(f"Validation aborted with exception: {format_exc()}") return self.report def _resolve_workspace(self): @@ -193,9 +193,9 @@ def _validate_imagefilename(self): page = page_from_file(f).get_Page() imageFilename = page.imageFilename if not self.mets.find_files(url=imageFilename, **self.find_kwargs): - self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.local_filename, imageFilename)) + self.report.add_error(f"PAGE '{f.ID}': imageFilename 
'{imageFilename}' not found in METS") if is_local_filename(imageFilename) and not Path(imageFilename).exists(): - self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.local_filename, imageFilename)) + self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file") def _validate_dimension(self): """ @@ -210,9 +210,9 @@ def _validate_dimension(self): page = page_from_file(f).get_Page() _, _, exif = self.workspace.image_from_page(page, f.pageId) if page.imageHeight != exif.height: - self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (f.ID, page.imageHeight, exif.height)) + self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})") if page.imageWidth != exif.width: - self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (f.ID, page.imageWidth, exif.width)) + self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})") def _validate_multipage(self): """ @@ -229,9 +229,9 @@ def _validate_multipage(self): try: exif = self.workspace.resolve_image_exif(f.local_filename) if exif.n_frames > 1: - self.report.add_error("Image %s: More than 1 frame: %s" % (f.ID, exif.n_frames)) + self.report.add_error(f"Image '{f.ID}': More than 1 frame: {exif.n_frames}") except FileNotFoundError: - self.report.add_error("Image %s: Could not retrieve %s (local_filename=%s, url=%s)" % (f.ID, f.local_filename, f.url)) + self.report.add_error(f"Image '{f.ID}': Could not retrieve (local_filename='{f.local_filename}', url='{f.url}')") return def _validate_pixel_density(self): @@ -250,7 +250,7 @@ def _validate_pixel_density(self): for k in ['xResolution', 'yResolution']: v = exif.__dict__.get(k) if v is None or v <= 72: - self.report.add_notice("Image %s: %s (%s pixels per %s) is suspiciously low" % (f.ID, k, v, exif.resolutionUnit)) + 
self.report.add_notice(f"Image '{f.ID}': {k} ({v} pixels per {exif.resolutionUnit}) is suspiciously low") def _validate_mets_file_group_names(self): """ @@ -261,7 +261,7 @@ def _validate_mets_file_group_names(self): self.log.debug('_validate_mets_file_group_names') for fileGrp in self.mets.file_groups: if not fileGrp.startswith(FILE_GROUP_PREFIX): - self.report.add_notice("fileGrp USE does not begin with '%s': %s" % (FILE_GROUP_PREFIX, fileGrp)) + self.report.add_notice(f"fileGrp USE '{fileGrp}' does not begin with '{FILE_GROUP_PREFIX}'") else: # OCR-D-FOO-BAR -> ('FOO', 'BAR') # \____/\_/ \_/ @@ -273,9 +273,9 @@ def _validate_mets_file_group_names(self): if '-' in category: category, name = category.split('-', 1) if category not in FILE_GROUP_CATEGORIES: - self.report.add_notice("Unspecified USE category '%s' in fileGrp '%s'" % (category, fileGrp)) + self.report.add_notice(f"Unspecified USE category '{category}' in fileGrp '{fileGrp}'") if name is not None and not re.match(r'^[A-Z0-9-]{3,}$', name): - self.report.add_notice("Invalid USE name '%s' in fileGrp '%s'" % (name, fileGrp)) + self.report.add_notice(f"Invalid USE name '{name}' in fileGrp '{fileGrp}'") def _validate_mets_files(self): """ @@ -288,16 +288,16 @@ def _validate_mets_files(self): self.report.add_error("No files") for f in self.mets.find_files(**self.find_kwargs): if f._el.get('GROUPID'): # pylint: disable=protected-access - self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID) + self.report.add_notice(f"File '{f.ID}' has GROUPID attribute - document might need an update") if not (f.url or f.local_filename): - self.report.add_error("File '%s' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href" % f.ID) + self.report.add_error(f"File '{f.ID}' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href") continue if f.url and 'url' 
not in self.skip: if re.match(r'^file:/[^/]', f.url): - self.report.add_error("File '%s' has an invalid (Java-specific) file URL '%s'" % (f.ID, f.url)) + self.report.add_error(f"File '{f.ID}' has an invalid (Java-specific) file URL '{f.url}'") scheme = f.url[0:f.url.index(':')] if scheme not in ('http', 'https', 'file'): - self.report.add_warning("File '%s' has non-HTTP, non-file URL '%s'" % (f.ID, f.url)) + self.report.add_warning(f"File '{f.ID}' has non-HTTP, non-file URL '{f.url}'") def _validate_page(self): """ @@ -323,15 +323,15 @@ def _validate_page(self): if 'dimension' in self.page_checks: _, _, exif = self.workspace.image_from_page(page, f.pageId) if page.imageHeight != exif.height: - self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (f.ID, page.imageHeight, exif.height)) + self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})") if page.imageWidth != exif.width: - self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (f.ID, page.imageWidth, exif.width)) + self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})") if 'imagefilename' in self.page_checks: imageFilename = page.imageFilename if not self.mets.find_files(url=imageFilename): - self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.url, imageFilename)) + self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS") if is_local_filename(imageFilename) and not Path(imageFilename).exists(): - self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.url, imageFilename)) + self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file") if 'mets_fileid_page_pcgtsid' in self.page_checks and pcgts.pcGtsId != f.ID: self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" 
!== "%s"' % (pcgts.pcGtsId or '', f.ID or '')) diff --git a/src/ocrd_validators/xsd_validator.py b/src/ocrd_validators/xsd_validator.py index 81b9457564..92e4502124 100644 --- a/src/ocrd_validators/xsd_validator.py +++ b/src/ocrd_validators/xsd_validator.py @@ -45,7 +45,7 @@ def __init__(self, schema_url): schema_url (str): URI of XML schema to validate against. """ if schema_url not in XSD_PATHS: - raise Exception('XML schema not bundled with OCR-D: %s' % schema_url) + raise ValueError('XML schema not bundled with OCR-D: %s' % schema_url) with open(XSD_PATHS[schema_url], 'r') as f: xmlschema_doc = ET.parse(f) self._xmlschema = ET.XMLSchema(xmlschema_doc) diff --git a/tests/validator/test_workspace_validator.py b/tests/validator/test_workspace_validator.py index bc516d5a53..2e63bb5495 100644 --- a/tests/validator/test_workspace_validator.py +++ b/tests/validator/test_workspace_validator.py @@ -90,7 +90,7 @@ def test_validate_file_groups_non_ocrd(self): self.assertEqual(len(report.errors), 1) self.assertIn('No files', report.errors[0]) self.assertEqual(len(report.notices), 1) - self.assertIn("USE does not begin with 'OCR-D-'", report.notices[0]) + self.assertIn("fileGrp USE 'FOO' does not begin with 'OCR-D-'", report.notices[0]) def test_validate_file_groups_unspecified(self): with TemporaryDirectory() as tempdir: From a95f269d83695e38f502084523868a3d365ec810 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 13:15:13 +0200 Subject: [PATCH 136/228] update pylintrc --- .pylintrc | 18 ++++++++---------- src/ocrd/resource_manager.py | 4 ++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.pylintrc b/.pylintrc index b2125d824c..a4106a1bb7 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,19 +1,21 @@ [MASTER] -extension-pkg-whitelist=lxml -ignored-modules=cv2,tesserocr,ocrd.model +extension-pkg-whitelist=lxml,pydantic +ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds +ignore-patterns=.*generateds.* [MESSAGES CONTROL] 
-ignore-patterns='.*generateds.*' disable = fixme, - E501, + line-too-long, + consider-using-f-string, + logging-fstring-interpolation, trailing-whitespace, logging-not-lazy, inconsistent-return-statements, + disallowed-name, invalid-name, line-too-long, missing-docstring, - no-self-use, wrong-import-order, too-many-nested-blocks, superfluous-parens, @@ -25,13 +27,9 @@ disable = ungrouped-imports, useless-object-inheritance, useless-import-alias, - bad-continuation, no-else-return, logging-not-lazy -[FORMAT] -no-space-check=empty-line - [DESIGN] # Maximum number of arguments for function / method max-args=12 @@ -40,7 +38,7 @@ max-locals=30 # Maximum number of return / yield for function / method body max-returns=12 # Maximum number of branch for function / method body -max-branchs=30 +max-branches=30 # Maximum number of statements in function / method body max-statements=60 # Maximum number of parents for a class (see R0901). diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index da1ee48331..3c4c603060 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -13,12 +13,16 @@ from gdown.download import get_url_from_gdrive_confirmation from yaml import safe_load, safe_dump +# pylint: disable=wrong-import-position + # https://github.com/OCR-D/core/issues/867 # https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml import yaml.constructor yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \ yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str'] +# pylint: enable=wrong-import-position + from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json From 50c088ecdf0d889e7095f71f99ec93bc08be5dcc Mon Sep 17 00:00:00 2001 From: kba 
Date: Sat, 24 Aug 2024 16:40:58 +0200 Subject: [PATCH 137/228] processor.metadata_location: use self.__module__ not __package__ --- src/ocrd/processor/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index dacf9b0729..c0b66b2693 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -126,7 +126,7 @@ def metadata_filename(self) -> str: return 'ocrd-tool.json' @cached_property - def metadata_location(self) -> str: + def metadata_location(self) -> Path: """ Absolute path of the ``ocrd-tool.json`` file as distributed with the package. @@ -134,7 +134,7 @@ def metadata_location(self) -> str: (Override if ``ocrd-tool.json`` is not distributed with the Python package.) """ - return resource_filename(__package__.split('.')[0], self.metadata_filename) + return resource_filename(self.__module__.split('.')[0], self.metadata_filename) @cached_property def metadata_rawdict(self) -> dict: From 821123765f7b9854c569f90e88e18ade025b68e8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 02:18:53 +0200 Subject: [PATCH 138/228] pylint: try ignoring generateds (again) --- .pylintrc | 1 + src/ocrd/cli/ocrd_tool.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.pylintrc b/.pylintrc index a4106a1bb7..2e3af4288b 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,6 +1,7 @@ [MASTER] extension-pkg-whitelist=lxml,pydantic ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds +ignore-paths=ocrd_page_generateds.py ignore-patterns=.*generateds.* [MESSAGES CONTROL] diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index f63a7235a5..fa815daeb9 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -28,6 +28,8 @@ def __init__(self, filename): self.filename = filename with codecs.open(filename, encoding='utf-8') as f: self.content = f.read() + # perhaps the validator should _always_ run (for default expansion) + # so validate 
command only for the report? self.json = loads(self.content) self.tool_name = '' From 3e2700cddcfa51af9cc73427fd4ebcfc53458282 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 11:32:14 +0200 Subject: [PATCH 139/228] :memo: update changelog --- CHANGELOG.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43bf85764d..244b168e2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,13 +5,37 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + - actuall apply CLI `--log-filename` + - adapt to Pillow changes + - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) + Changed: - - :fire: `ocrd_utils`, `ocrd_models`, `ocrd_modelfactory`, `ocrd_validators` and `ocrd_network` are not published as separate packages anymore, everything is contained in `ocrd` and you should adapt your `requirements.txt` accordingly. + - :fire: `ocrd_utils`, `ocrd_models`, `ocrd_modelfactory`, `ocrd_validators` and `ocrd_network` are not published + as separate packages anymore, everything is contained in `ocrd` - you should adapt your `requirements.txt` accordingly + - :fire: `Processor.parameter` now a property (attribute always exists, but `None` for non-processing contexts) + - :fire: `Processor.parameter` is now a `frozendict` (contents immutable) + - :fire: `Processor.parameter` validate when(ever) set instead of (just) the constructor + - setting `Processor.parameter` will also trigger (`Processor.shutdown() and) `Processor.setup()` + - `get_processor(... 
instance_caching=True)`: use `min(max_instances, OCRD_MAX_PROCESSOR_CACHE)` + - :fire: `Processor.verify` always validates fileGrp cardinalities (because we have `ocrd-tool.json` defaults now) + - :fire: `OcrdMets.add_agent` without positional arguments + - `ocrd bashlib input-files` now uses normal Processor decorator, and gets passed actual `ocrd-tool.json` and tool name + from bashlib's `ocrd__wrap` + +Added: + - `Processor.metadata_filename`: expose to make local path of `ocrd-tool.json` in Python distribution reusable+overridable + - `Processor.metadata_location`: expose to make absolute path of `ocrd-tool.json` reusable+overridable + - `Processor.metadata_rawdict`: expose to make in-memory contents of `ocrd-tool.json` reusable+overridable + - `Processor.metadata`: expose to make validated and default-expanded contents of `ocrd-tool.json` reusable+overridable + - `Processor.shutdown`: to shut down processor after processing, optional + - `Processor.max_instances`: class attribute to control instance caching of this implementation ## [3.0.0a2] - 2024-08-22 Changed: - :fire: `OcrdPage` as proxy of `PcGtsType` instead of alias; also contains `etree` and `mapping` now + - :fire: `page_from_file`: removed kwarg `with_tree` - use `OcrdPage.etree` and `OcrdPage.mapping` instead - :fire: `Processor.zip_input_files` now can throw `ocrd.NonUniqueInputFile` and `ocrd.MissingInputFile` (the latter only if `OCRD_MISSING_INPUT=ABORT`) - :fire: `Processor.zip_input_files` does not by default use `require_first` anymore From 342df5825522c8fa28c570a0cc7fd8f348730c0c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 20:05:52 +0200 Subject: [PATCH 140/228] test_bashlib: allow testing prereleases successfully --- tests/cli/test_bashlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index b1ab68c7fc..ba7c283e40 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ 
-121,7 +121,7 @@ def test_bashlib_minversion(self): assert f"ERROR: ocrd/core is too old ({VERSION} < {version})" in err # test non-matching prerelease (the 99th alpha pre-release here) - version = "%d.%d.%da99" % (major, minor, patch) + version = "%d.%d.%dz99" % (major, minor, patch) assert VERSION != version # assuming we will never have 99 alpha prereleases ^^ exit_code, out, err = self.invoke_bash("source $(ocrd bashlib filename) && ocrd__minversion " + version) assert exit_code > 0 From 11ed8c568274779f4c6cc25cad36fd9424c40b1c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 20:07:06 +0200 Subject: [PATCH 141/228] Processor.process_page_file / OcrdPageResultImage: allow PageType instead of AlternativeImageType --- src/ocrd/processor/base.py | 20 ++++++++++++++++++-- src/ocrd/processor/ocrd_page_result.py | 6 +++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index c0b66b2693..df65748188 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -45,7 +45,15 @@ deprecation_warning ) from ocrd_validators import ParameterValidator -from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml +from ocrd_models.ocrd_page import ( + PageType, + AlternativeImageType, + MetadataItemType, + LabelType, + LabelsType, + OcrdPage, + to_xml, +) from ocrd_modelfactory import page_from_file from ocrd_validators.ocrd_tool_validator import OcrdToolValidator @@ -523,7 +531,15 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: for image_result in result.images: image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' image_file_path = join(self.output_file_grp, f'{image_file_id}.png') - image_result.alternative_image.set_filename(image_file_path) + if isinstance(image_result.alternative_image, PageType): + image_result.alternative_image.set_imageFilename(image_file_path) + 
image_result.alternative_image.set_imageWidth(image_result.pil.width) + image_result.alternative_image.set_imageHeight(image_result.pil.height) + elif isinstance(image_result.alternative_image, AlternativeImageType): + image_result.alternative_image.set_filename(image_file_path) + else: + raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type " + f"{type(image_result.alternative_image)}") self.workspace.save_image_file( image_result.pil, image_file_id, diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py index c63330c734..dcd8ccd44d 100644 --- a/src/ocrd/processor/ocrd_page_result.py +++ b/src/ocrd/processor/ocrd_page_result.py @@ -1,15 +1,15 @@ from dataclasses import dataclass, field -from typing import List +from typing import List, Union from ocrd_models.ocrd_page import OcrdPage from PIL.Image import Image -from ocrd_models.ocrd_page_generateds import AlternativeImageType +from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType @dataclass class OcrdPageResultImage(): pil : Image file_id_suffix : str - alternative_image : AlternativeImageType + alternative_image : Union[AlternativeImageType, PageType] @dataclass class OcrdPageResult(): From 77e31f26a5a3beebfb10895d0383790dc69ca23a Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 26 Aug 2024 11:29:04 +0200 Subject: [PATCH 142/228] :package: v3.0.0b1 --- CHANGELOG.md | 2 ++ VERSION | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3214ab8b8..063e7feb6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [3.0.0b1] - 2024-08-26 + Fixed: - actuall apply CLI `--log-filename` - adapt to Pillow changes diff --git a/VERSION b/VERSION index 3a5b5bc9d6..2daa89b06c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0a2 +3.0.0b1 From d3ee57c271b9144f26b3e1f357dbddc50abe5f24 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 26 Aug 2024 17:58:40 +0200 Subject: [PATCH 143/228] :fire: bad no good terrible hack to fix integration_test --- src/ocrd/processor/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index df65748188..1f05e6a67b 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -142,7 +142,11 @@ def metadata_location(self) -> Path: (Override if ``ocrd-tool.json`` is not distributed with the Python package.) """ - return resource_filename(self.__module__.split('.')[0], self.metadata_filename) + # XXX HACK + module_tokens = self.__module__.split('.') + if module_tokens[0] == 'src': + module_tokens.pop(0) + return resource_filename(module_tokens[0], self.metadata_filename) @cached_property def metadata_rawdict(self) -> dict: From 0245f4ba9fe525c37744e717d312628b66955c5a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 03:37:08 +0200 Subject: [PATCH 144/228] generate_processor_help: avoid repeating docstrings from superclass --- CHANGELOG.md | 2 +- Dockerfile | 2 +- src/ocrd/processor/__init__.py | 2 +- src/ocrd/processor/base.py | 151 ++++++++++++++++++++++++++++++++- src/ocrd/processor/helpers.py | 145 +------------------------------ 5 files changed, 154 insertions(+), 148 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 063e7feb6f..c4e38bc421 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## [3.0.0b1] - 2024-08-26 Fixed: - - actuall apply CLI `--log-filename` + - actually apply CLI `--log-filename` - adapt to Pillow changes - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) diff --git a/Dockerfile b/Dockerfile index 144ae774dc..77c24bf77e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,9 +50,9 @@ FROM ocrd_core_base as ocrd_core_test ARG SKIP_ASSETS WORKDIR /build/core COPY Makefile . +COPY .gitmodules . RUN if test -z "$SKIP_ASSETS" || test $SKIP_ASSETS -eq 0 ; then make assets ; fi COPY tests ./tests -COPY .gitmodules . COPY requirements_test.txt . RUN pip install -r requirements_test.txt RUN mkdir /ocrd-data && chmod 777 /ocrd-data diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py index b6c1188def..7cbcb851de 100644 --- a/src/ocrd/processor/__init__.py +++ b/src/ocrd/processor/__init__.py @@ -3,6 +3,7 @@ ResourceNotFoundError, NonUniqueInputFile, MissingInputFile, + generate_processor_help, ) from .ocrd_page_result import ( OcrdPageResult, @@ -11,5 +12,4 @@ from .helpers import ( run_cli, run_processor, - generate_processor_help ) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index df65748188..61a25e5279 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -23,6 +23,8 @@ import io import weakref from frozendict import frozendict + +from click import wrap_text from deprecated import deprecated from requests import HTTPError @@ -58,7 +60,7 @@ from ocrd_validators.ocrd_tool_validator import OcrdToolValidator # XXX imports must remain for backwards-compatibility -from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import +from .helpers import run_cli, run_processor # pylint: disable=unused-import class ResourceNotFoundError(FileNotFoundError): @@ -838,3 +840,150 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return 
ifts + +def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): + """Generate a string describing the full CLI of this processor including params. + + Args: + ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json`` + processor_instance (object, optional): the processor implementation + (for adding any module/class/function docstrings) + subcommand (string): 'worker' or 'server' + """ + doc_help = '' + if processor_instance: + module = inspect.getmodule(processor_instance) + if module and module.__doc__: + doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n' + if processor_instance.__doc__: + doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n' + # Try to find the most concrete docstring among the various methods that an implementation + # could overload, first serving. + # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings. + # (They are supposed to only repeat information inspect.getdoc, rather than inherit __doc__ itself.) 
+ for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']: + instance_method = getattr(processor_instance, method) + superclass_method = getattr(Processor, method) + if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__: + doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n' + break + if doc_help: + doc_help = '\n\n' + wrap_text(doc_help, width=72, + initial_indent=' > ', + subsequent_indent=' > ', + preserve_paragraphs=True) + subcommands = '''\ + worker Start a processing worker rather than do local processing + server Start a processor server rather than do local processing +''' + + processing_worker_options = '''\ + --queue The RabbitMQ server address in format + "amqp://{user}:{pass}@{host}:{port}/{vhost}" + [amqp://admin:admin@localhost:5672] + --database The MongoDB server address in format + "mongodb://{host}:{port}" + [mongodb://localhost:27018] + --log-filename Filename to redirect STDOUT/STDERR to, + if specified. +''' + + processing_server_options = '''\ + --address The Processor server address in format + "{host}:{port}" + --database The MongoDB server address in format + "mongodb://{host}:{port}" + [mongodb://localhost:27018] +''' + + processing_options = '''\ + -m, --mets URL-PATH URL or file path of METS to process [./mets.xml] + -w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)] + -I, --input-file-grp USE File group(s) used as input + -O, --output-file-grp USE File group(s) used as output + -g, --page-id ID Physical page ID(s) to process instead of full document [] + --overwrite Remove existing output pages/images + (with "--page-id", remove only those). + Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE + --debug Abort on any errors with full stack trace. + Short-hand for OCRD_MISSING_OUTPUT=ABORT + --profile Enable profiling + --profile-file PROF-PATH Write cProfile stats to PROF-PATH. 
Implies "--profile" + -p, --parameter JSON-PATH Parameters, either verbatim JSON string + or JSON file path + -P, --param-override KEY VAL Override a single JSON object key-value pair, + taking precedence over --parameter + -U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS + If URL starts with http:// start an HTTP server there, + otherwise URL is a path to an on-demand-created unix socket + -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE] + Override log level globally [INFO] +''' + + information_options = '''\ + -C, --show-resource RESNAME Dump the content of processor resource RESNAME + -L, --list-resources List names of processor resources + -J, --dump-json Dump tool description as JSON + -D, --dump-module-dir Show the 'module' resource location path for this processor + -h, --help Show this message + -V, --version Show version +''' + + parameter_help = '' + if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']: + parameter_help = ' NONE\n' + else: + def wrap(s): + return wrap_text(s, initial_indent=' '*3, + subsequent_indent=' '*4, + width=72, preserve_paragraphs=True) + for param_name, param in ocrd_tool['parameters'].items(): + parameter_help += wrap('"%s" [%s%s]' % ( + param_name, + param['type'], + ' - REQUIRED' if 'required' in param and param['required'] else + ' - %s' % json.dumps(param['default']) if 'default' in param else '')) + parameter_help += '\n ' + wrap(param['description']) + if 'enum' in param: + parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum'])) + parameter_help += "\n" + + if not subcommand: + return f'''\ +Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS] + + {ocrd_tool['description']}{doc_help} + +Subcommands: +{subcommands} +Options for processing: +{processing_options} +Options for information: +{information_options} +Parameters: +{parameter_help} +''' + elif subcommand == 'worker': + return f'''\ +Usage: {ocrd_tool['executable']} worker [OPTIONS] + + 
Run {ocrd_tool['executable']} as a processing worker. + + {ocrd_tool['description']}{doc_help} + +Options: +{processing_worker_options} +''' + elif subcommand == 'server': + return f'''\ +Usage: {ocrd_tool['executable']} server [OPTIONS] + + Run {ocrd_tool['executable']} as a processor sever. + + {ocrd_tool['description']}{doc_help} + +Options: +{processing_server_options} +''' + else: + pass diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 6483790bd6..a675ff129e 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -8,13 +8,11 @@ from subprocess import run from typing import List, Optional -from click import wrap_text -from ocrd.workspace import Workspace +from ..workspace import Workspace from ocrd_utils import freeze_args, getLogger, config, setOverrideLogLevel, getLevelName, sparkline __all__ = [ - 'generate_processor_help', 'run_cli', 'run_processor' ] @@ -213,147 +211,6 @@ def run_cli( return result.returncode -def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): - """Generate a string describing the full CLI of this processor including params. 
- - Args: - ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json`` - processor_instance (object, optional): the processor implementation - (for adding any module/class/function docstrings) - subcommand (string): 'worker' or 'server' - """ - doc_help = '' - if processor_instance: - module = inspect.getmodule(processor_instance) - if module and module.__doc__: - doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n' - if processor_instance.__doc__: - doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n' - if processor_instance.process_workspace.__doc__: - doc_help += '\n' + inspect.cleandoc(processor_instance.process_workspace.__doc__) + '\n' - if processor_instance.process.__doc__: - doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n' - if doc_help: - doc_help = '\n\n' + wrap_text(doc_help, width=72, - initial_indent=' > ', - subsequent_indent=' > ', - preserve_paragraphs=True) - subcommands = '''\ - worker Start a processing worker rather than do local processing - server Start a processor server rather than do local processing -''' - - processing_worker_options = '''\ - --queue The RabbitMQ server address in format - "amqp://{user}:{pass}@{host}:{port}/{vhost}" - [amqp://admin:admin@localhost:5672] - --database The MongoDB server address in format - "mongodb://{host}:{port}" - [mongodb://localhost:27018] - --log-filename Filename to redirect STDOUT/STDERR to, - if specified. 
-''' - - processing_server_options = '''\ - --address The Processor server address in format - "{host}:{port}" - --database The MongoDB server address in format - "mongodb://{host}:{port}" - [mongodb://localhost:27018] -''' - - processing_options = '''\ - -m, --mets URL-PATH URL or file path of METS to process [./mets.xml] - -w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)] - -I, --input-file-grp USE File group(s) used as input - -O, --output-file-grp USE File group(s) used as output - -g, --page-id ID Physical page ID(s) to process instead of full document [] - --overwrite Remove existing output pages/images - (with "--page-id", remove only those). - Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE - --debug Abort on any errors with full stack trace. - Short-hand for OCRD_MISSING_OUTPUT=ABORT - --profile Enable profiling - --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile" - -p, --parameter JSON-PATH Parameters, either verbatim JSON string - or JSON file path - -P, --param-override KEY VAL Override a single JSON object key-value pair, - taking precedence over --parameter - -U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS - If URL starts with http:// start an HTTP server there, - otherwise URL is a path to an on-demand-created unix socket - -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE] - Override log level globally [INFO] -''' - - information_options = '''\ - -C, --show-resource RESNAME Dump the content of processor resource RESNAME - -L, --list-resources List names of processor resources - -J, --dump-json Dump tool description as JSON - -D, --dump-module-dir Show the 'module' resource location path for this processor - -h, --help Show this message - -V, --version Show version -''' - - parameter_help = '' - if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']: - parameter_help = ' NONE\n' - else: - def wrap(s): - return wrap_text(s, initial_indent=' '*3, - 
subsequent_indent=' '*4, - width=72, preserve_paragraphs=True) - for param_name, param in ocrd_tool['parameters'].items(): - parameter_help += wrap('"%s" [%s%s]' % ( - param_name, - param['type'], - ' - REQUIRED' if 'required' in param and param['required'] else - ' - %s' % json.dumps(param['default']) if 'default' in param else '')) - parameter_help += '\n ' + wrap(param['description']) - if 'enum' in param: - parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum'])) - parameter_help += "\n" - - if not subcommand: - return f'''\ -Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS] - - {ocrd_tool['description']}{doc_help} - -Subcommands: -{subcommands} -Options for processing: -{processing_options} -Options for information: -{information_options} -Parameters: -{parameter_help} -''' - elif subcommand == 'worker': - return f'''\ -Usage: {ocrd_tool['executable']} worker [OPTIONS] - - Run {ocrd_tool['executable']} as a processing worker. - - {ocrd_tool['description']}{doc_help} - -Options: -{processing_worker_options} -''' - elif subcommand == 'server': - return f'''\ -Usage: {ocrd_tool['executable']} server [OPTIONS] - - Run {ocrd_tool['executable']} as a processor sever. 
- - {ocrd_tool['description']}{doc_help} - -Options: -{processing_server_options} -''' - else: - pass - # not decorated here but at runtime (on first use) #@freeze_args From efe420138141a8c15b3967987c103753274edeb6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 04:07:38 +0200 Subject: [PATCH 145/228] Processor.process_workspace: abort anyway if too many failures (OCRD_MAX_MISSING_OUTPUTS) --- src/ocrd/processor/base.py | 39 ++++++++++++++++++++++++++------------ src/ocrd_utils/config.py | 5 +++++ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 61a25e5279..cbd819a440 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -424,6 +424,9 @@ def process_workspace(self, workspace: Workspace) -> None: self.workspace = workspace self.verify() try: + nr_succeeded = 0 + nr_skipped = 0 + nr_copied = 0 # FIXME: add page parallelization by running multiprocessing.Pool (#322) for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) @@ -449,29 +452,38 @@ def process_workspace(self, workspace: Workspace) -> None: # - persistent (data) error → skip / dummy / raise try: self.process_page_file(*input_files) - except Exception as err: - # we have to be broad here, but want to exclude NotImplementedError - if isinstance(err, NotImplementedError): + nr_succeeded += 1 + # exclude NotImplementedError, so we can try process() below + except NotImplementedError: + raise + # handle input failures separately + except FileExistsError as err: + if config.OCRD_EXISTING_OUTPUT == 'ABORT': raise err - if isinstance(err, FileExistsError): - if config.OCRD_EXISTING_OUTPUT == 'ABORT': - raise err - if config.OCRD_EXISTING_OUTPUT == 'SKIP': - continue - if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': - # too late here, must not happen - raise Exception(f"got {err} despite 
OCRD_EXISTING_OUTPUT==OVERWRITE") - # FIXME: re-usable/actionable logging + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + continue + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # broad coverage of output failures + except Exception as err: + # FIXME: add re-usable/actionable logging self._base_logger.exception(f"Failure on page {page_id}: {err}") if config.OCRD_MISSING_OUTPUT == 'ABORT': raise err if config.OCRD_MISSING_OUTPUT == 'SKIP': + nr_skipped += 1 continue if config.OCRD_MISSING_OUTPUT == 'COPY': self._copy_page_file(input_files[0]) + nr_copied += 1 else: desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") + if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with skipped output ({nr_skipped})") + if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with fallback output ({nr_skipped})") except NotImplementedError: # fall back to deprecated method self.process() @@ -534,6 +546,9 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' image_file_path = join(self.output_file_grp, f'{image_file_id}.png') if isinstance(image_result.alternative_image, PageType): + # special case: not an alternative image, but replacing the original image + # (this is needed by certain processors when the original's coordinate system + # cannot or must not be kept) image_result.alternative_image.set_imageFilename(image_file_path) image_result.alternative_image.set_imageWidth(image_result.pil.width) image_result.alternative_image.set_imageHeight(image_result.pil.height) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 
3425bc920e..9f9d924f6a 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -184,6 +184,11 @@ def _ocrd_download_timeout_parser(val): validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], parser=str) +config.add("OCRD_MAX_MISSING_OUTPUTS", + description="Maximal rate of skipped/fallback pages among all processed pages before aborting.", + default=(True, 0.1), + parser=float) + config.add("OCRD_EXISTING_OUTPUT", description="""\ How to deal with already existing output files (for some fileGrp/pageId) during processing: From fce7627a3963924ef9f3500360f0423fa0b2c6ba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 11:09:08 +0200 Subject: [PATCH 146/228] adapt tests for OCRD_MAX_MISSING_OUTPUTS --- src/ocrd_utils/config.py | 2 +- tests/processor/test_processor.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 9f9d924f6a..29632f8cce 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -185,7 +185,7 @@ def _ocrd_download_timeout_parser(val): parser=str) config.add("OCRD_MAX_MISSING_OUTPUTS", - description="Maximal rate of skipped/fallback pages among all processed pages before aborting.", + description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).", default=(True, 0.1), parser=float) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 19ff1087f2..4e6114763c 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -219,6 +219,8 @@ def test_run_output0(self): def test_run_output_missing(self): ws = self.workspace from ocrd_utils import config + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MISSING_OUTPUT = 'SKIP' run_processor(DummyProcessorWithOutputFailures, workspace=ws, input_file_grp="OCR-D-IMG", @@ -237,6 +239,14 @@ def 
test_run_output_missing(self): input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT") assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + # do raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = 0.4 + config.OCRD_MISSING_OUTPUT = 'SKIP' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert "too many failures" in str(exc.value) def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: From c08166e177e3080db4b2a9b5be4ea6b218939f05 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 22:47:40 +0200 Subject: [PATCH 147/228] =?UTF-8?q?Processor:=20add=20per-page=20timeouts?= =?UTF-8?q?=20and=20parallelism=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ocrd_utils: introduce `config.OCRD_MAX_PARALLEL_PAGES` - Processor: introduce `max_workers` class attribute (as per-implementation limit of `OCRD_MAX_PARALLEL_PAGES`) - ocrd_utils: introduce `config.OCRD_PROCESSING_PAGE_TIMEOUT` - Processor: introduce `max_page_seconds` class attribute (as per-implementation limit of `OCRD_PROCESSING_PAGE_TIMEOUT`) - Processor.process_workspace: instead of calling `process_page_file` directly for each input file in a loop, submit these invocations as tasks to an internal `ThreadPoolExecutor`, which will run with `OCRD_MAX_PARALLEL_PAGES` workers, and loop over retrieving results from it (with normal error handling) - for each (per-page) task, add a `timeout` limit `OCRD_PROCESSING_PAGE_TIMEOUT` (with TimeoutError an additional error case to be handled in accordance with `OCRD_MISSING_OUTPUT` setting) --- src/ocrd/processor/base.py | 98 +++++++++++++++++++++++++++++--------- src/ocrd_utils/config.py | 10 ++++ 2 files changed, 86 insertions(+), 22 deletions(-) diff --git a/src/ocrd/processor/base.py 
b/src/ocrd/processor/base.py index 4e60024703..269f27b703 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -23,14 +23,16 @@ import io import weakref from frozendict import frozendict +from concurrent.futures import ThreadPoolExecutor, TimeoutError from click import wrap_text from deprecated import deprecated from requests import HTTPError -from ocrd.workspace import Workspace +from ..workspace import Workspace +from ..mets_server import ClientSideOcrdMets from ocrd_models.ocrd_file import OcrdFileType -from ocrd.processor.ocrd_page_result import OcrdPageResult +from .ocrd_page_result import OcrdPageResult from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -120,7 +122,27 @@ class Processor(): maximum number of cached instances (ignored if negative), to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller). - (Override this if you know how many instances fit into memory at once.) + (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.) + """ + + max_workers : int = -1 + """ + maximum number of processor threads for page-parallel processing (ignored if negative), + to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e. + whatever is smaller). + + (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores + - at once, or if your class is not thread-safe.) + """ + + max_page_seconds : int = -1 + """ + maximum number of seconds may be spent processing a single page (ignored if negative), + to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT` + (i.e. whatever is smaller). + + (Override this if you know how costly this processor may be, irrespective of image size + or complexity of the page.) 
""" @property @@ -431,7 +453,26 @@ def process_workspace(self, workspace: Workspace) -> None: nr_succeeded = 0 nr_skipped = 0 nr_copied = 0 - # FIXME: add page parallelization by running multiprocessing.Pool (#322) + + # set up multithreading + if self.max_workers < 0: + max_workers = config.OCRD_MAX_PARALLEL_PAGES + else: + max_workers = min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers) + if max_workers > 1: + assert isinstance(workspace.mets, ClientSideOcrdMets), \ + "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url" + if self.max_page_seconds < 0: + max_seconds = config.OCRD_PROCESSING_PAGE_TIMEOUT + else: + max_seconds = min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds) + executor = ThreadPoolExecutor( + max_workers=max_workers, + thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" + ) + self._base_logger.debug("started executor %s", str(executor)) + tasks = {} + for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) page_id = next(input_file.pageId @@ -450,12 +491,20 @@ def process_workspace(self, workspace: Workspace) -> None: except (ValueError, FileNotFoundError, HTTPError) as e: self._base_logger.error(repr(e)) self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") + # process page + tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) + self._base_logger.debug("submitted %d processing tasks", len(tasks)) + + for task in tasks: + # wait for results, handle errors + page_id, input_files = tasks[task] # FIXME: differentiate error cases in various ways: # - ResourceNotFoundError → use ResourceManager to download (once), then retry # - transient (I/O or OOM) error → maybe sleep, retry # - persistent (data) error → skip / dummy / raise try: - self.process_page_file(*input_files) + self._base_logger.debug("waiting for output of task %s (page %s) 
max_seconds=%d", task, page_id, max_seconds) + task.result(timeout=max_seconds) nr_succeeded += 1 # exclude NotImplementedError, so we can try process() below except NotImplementedError: @@ -469,10 +518,10 @@ def process_workspace(self, workspace: Workspace) -> None: if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': # too late here, must not happen raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") - # broad coverage of output failures - except Exception as err: + # broad coverage of output failures (including TimeoutError) + except (Exception, TimeoutError) as err: # FIXME: add re-usable/actionable logging - self._base_logger.exception(f"Failure on page {page_id}: {err}") + self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") if config.OCRD_MISSING_OUTPUT == 'ABORT': raise err if config.OCRD_MISSING_OUTPUT == 'SKIP': @@ -484,10 +533,13 @@ def process_workspace(self, workspace: Workspace) -> None: else: desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") + if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: raise Exception(f"too many failures with skipped output ({nr_skipped})") if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: raise Exception(f"too many failures with fallback output ({nr_skipped})") + executor.shutdown() + except NotImplementedError: # fall back to deprecated method self.process() @@ -511,13 +563,14 @@ def _copy_page_file(self, input_file : OcrdFileType) -> None: output_file_id = make_file_id(input_file, self.output_file_grp) input_pcgts.set_pcGtsId(output_file_id) self.add_metadata(input_pcgts) - self.workspace.add_file(file_id=output_file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - 
content=to_xml(input_pcgts), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + self.workspace.add_file( + file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(input_pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: @@ -571,13 +624,14 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: ) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) - self.workspace.add_file(file_id=output_file_id, - file_grp=self.output_file_grp, - page_id=page_id, - local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(result.pcgts), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + self.workspace.add_file( + file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=page_id, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(result.pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 29632f8cce..0186b8539b 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -120,6 +120,16 @@ def raw_value(self, name): parser=int, default=(True, 128)) +config.add('OCRD_MAX_PARALLEL_PAGES', + description="Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). 
If set >1, then a METS Server must be used for METS synchronisation.", + parser=int, + default=(True, 1)) + +config.add('OCRD_PROCESSING_PAGE_TIMEOUT', + description="Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.", + parser=int, + default=(True, 0)) + config.add("OCRD_PROFILE", description="""\ Whether to enable gathering runtime statistics From c3a83800da2d56262234c10d54bb0946087a4994 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 22:59:58 +0200 Subject: [PATCH 148/228] add tests for processor per-page timeout and parallelism --- requirements_test.txt | 1 + tests/data/__init__.py | 26 ++++++++++++++++- tests/processor/test_processor.py | 46 +++++++++++++++++++++++++++++-- tests/test_workspace.py | 6 ++-- 4 files changed, 72 insertions(+), 7 deletions(-) diff --git a/requirements_test.txt b/requirements_test.txt index d8cef1dae7..a6a87918fc 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -3,6 +3,7 @@ cryptography < 43.0.0 pytest >= 4.0.0 generateDS == 2.35.20 pytest-benchmark >= 3.2.3 +pytest-timeout coverage >= 4.5.2 sphinx sphinx_click diff --git a/tests/data/__init__.py b/tests/data/__init__.py index c706546c57..c24a6979b2 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,8 +1,9 @@ import json import os import re +from time import sleep from pytest import warns -from ocrd import Processor +from ocrd import Processor, OcrdPageResult from ocrd_utils import make_file_id, config DUMMY_TOOL = { @@ -103,6 +104,29 @@ def process(self): force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) +class DummyProcessorWithOutputPagewise(Processor): + @property + def ocrd_tool(self): + dummy_tool = dict(DUMMY_TOOL) + dummy_tool['parameters']['sleep'] = {'type': 'number'} + return dummy_tool + + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): + 
kwargs['download_files'] = False + super().__init__(*args, **kwargs) + + def process_page_pcgts(self, pcgts, page_id=None): + sleep(self.parameter['sleep']) + return OcrdPageResult(pcgts) + class DummyProcessorWithOutputFailures(Processor): @property def ocrd_tool(self): diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 4e6114763c..e0b74fb002 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -9,9 +9,11 @@ DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, + DummyProcessorWithOutputPagewise, DummyProcessorWithOutputFailures, IncompleteProcessor ) +from tests.test_mets_server import fixture_start_mets_server from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver @@ -232,7 +234,7 @@ def test_run_output_missing(self): run_processor(DummyProcessorWithOutputFailures, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT") - assert "intermittent" in str(exc.value) + assert "intermittent" in str(exc.value) config.OCRD_MISSING_OUTPUT = 'COPY' config.OCRD_EXISTING_OUTPUT = 'SKIP' run_processor(DummyProcessorWithOutputFailures, workspace=ws, @@ -246,7 +248,28 @@ def test_run_output_missing(self): run_processor(DummyProcessorWithOutputFailures, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT") - assert "too many failures" in str(exc.value) + assert "too many failures" in str(exc.value) + + def test_run_output_timeout(self): + ws = self.workspace + from ocrd_utils import config + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 + config.OCRD_MISSING_OUTPUT = 'ABORT' + config.OCRD_PROCESSING_PAGE_TIMEOUT = 3 + run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 1}) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 
len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + config.OCRD_PROCESSING_PAGE_TIMEOUT = 1 + from concurrent.futures import TimeoutError + with pytest.raises(TimeoutError) as exc: + run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 3}) def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: @@ -261,7 +284,7 @@ def test_run_output_overwrite(self): run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") - assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists" + assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists" config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", @@ -387,5 +410,22 @@ def ocrd_tool(self): r = self.capture_out_err() assert 'ERROR ocrd.processor.base - Found no file for page phys_0001 in file group GRP1' in r.err +# 2s (+ 2s tolerance) instead of 3*3s (+ 2s tolerance) +@pytest.mark.timeout(4) +def test_run_output_parallel(start_mets_server): + mets_server_url, ws = start_mets_server + from ocrd_utils import config + # do not raise for single-page timeout + config.OCRD_PROCESSING_PAGE_TIMEOUT = -1 + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 + config.OCRD_MAX_PARALLEL_PAGES = 3 + run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 2}, + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + if __name__ == "__main__": main(__file__) diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 02cb72d342..9d6b64b1e3 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -87,10 +87,10 @@ def 
test_workspace_add_file_overwrite(plain_workspace): plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys1', local_filename=fpath) with pytest.raises(FileExistsError) as fn_exc: plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id=None, local_filename=fpath) - assert str(fn_exc.value) == "File with file_id='ID1' already exists" + assert str(fn_exc.value) == "File with file_id='ID1' already exists" with pytest.raises(FileExistsError) as fn_exc: plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys2', local_filename=fpath, force=True) - assert 'cannot mitigate' in str(fn_exc.value) + assert 'cannot mitigate' in str(fn_exc.value) plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT2', page_id='phys1', local_filename=fpath, force=True) f = plain_workspace.mets.find_all_files()[0] @@ -684,7 +684,7 @@ def test_merge_overwrite(tmp_path): ws1.add_file('X', page_id='X', mimetype='X', file_id='id123', local_filename='X/X', content='ws1') ws2.add_file('X', page_id='X', mimetype='X', file_id='id456', local_filename='X/X', content='ws2') ws1.merge(ws2) - assert "would overwrite" == str(exc.value) + assert "would overwrite" == str(exc.value) def test_merge_with_filter(plain_workspace, tmp_path): # arrange From b1b7a491d41c00c2f803c9b4f5386ec33c9e7b8e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 23:06:28 +0200 Subject: [PATCH 149/228] :memo: update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4e38bc421..2a3807c0b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +Added: + - `Processor.max_workers`: class attribute to control per-page parallelism of this implementation + - `Processor.max_page_seconds`: class attribute to control per-page timeout of this implementation + - `OCRD_MAX_PARALLEL_PAGES` for whether and how many workers should process pages in parallel + - `OCRD_PROCESSING_PAGE_TIMEOUT` for whether and how long processors should wait for single pages + - `OCRD_MAX_MISSING_OUTPUTS` for maximum rate (fraction) of pages before making `OCRD_MISSING_OUTPUT=abort` + ## [3.0.0b1] - 2024-08-26 Fixed: From 9b80ae17ef04dba41bc1f09d5c7be88e7ec8f22c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:11:49 +0200 Subject: [PATCH 150/228] ClientSideOcrdMets: use same logger name prefix as server --- src/ocrd/mets_server.py | 51 +++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 81f9e15d0a..41b1f23eca 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -120,7 +120,7 @@ class ClientSideOcrdMets: def __init__(self, url, workspace_path: Optional[str] = None): self.protocol = "tcp" if url.startswith("http://") else "uds" - self.log = getLogger(f"ocrd.mets_client[{url}]") + self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}") self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}' self.ws_dir_path = workspace_path if workspace_path else None @@ -147,10 +147,9 @@ def save(self): Request writing the changes to the file system """ if not self.multiplexing_mode: - self.session.request("PUT", url=self.url) + self.session.put(url=self.url) else: - self.session.request( - "POST", + self.session.post( self.url, json=MpxReq.save(self.ws_dir_path) ) @@ -161,11 +160,10 @@ def stop(self): """ try: if not self.multiplexing_mode: - self.session.request("DELETE", self.url) + self.session.delete(self.url) return else: - self.session.request( - "POST", 
+ self.session.post( self.url, json=MpxReq.stop(self.ws_dir_path) ) @@ -178,10 +176,9 @@ def reload(self): Request reloading of the mets file from the file system """ if not self.multiplexing_mode: - return self.session.request("POST", f"{self.url}/reload").text + return self.session.post(f"{self.url}/reload").text else: - return self.session.request( - "POST", + return self.session.post( self.url, json=MpxReq.reload(self.ws_dir_path) ).json()["text"] @@ -189,10 +186,9 @@ def reload(self): @property def unique_identifier(self): if not self.multiplexing_mode: - return self.session.request("GET", f"{self.url}/unique_identifier").text + return self.session.get(f"{self.url}/unique_identifier").text else: - return self.session.request( - "POST", + return self.session.post( self.url, json=MpxReq.unique_identifier(self.ws_dir_path) ).json()["text"] @@ -200,11 +196,10 @@ def unique_identifier(self): @property def workspace_path(self): if not self.multiplexing_mode: - self.ws_dir_path = self.session.request("GET", f"{self.url}/workspace_path").text + self.ws_dir_path = self.session.get(f"{self.url}/workspace_path").text return self.ws_dir_path else: - self.ws_dir_path = self.session.request( - "POST", + self.ws_dir_path = self.session.post( self.url, json=MpxReq.workspace_path(self.ws_dir_path) ).json()["text"] @@ -213,10 +208,9 @@ def workspace_path(self): @property def file_groups(self): if not self.multiplexing_mode: - return self.session.request("GET", f"{self.url}/file_groups").json()["file_groups"] + return self.session.get(f"{self.url}/file_groups").json()["file_groups"] else: - return self.session.request( - "POST", + return self.session.post( self.url, json=MpxReq.file_groups(self.ws_dir_path) ).json()["file_groups"] @@ -224,10 +218,9 @@ def file_groups(self): @property def agents(self): if not self.multiplexing_mode: - agent_dicts = self.session.request("GET", f"{self.url}/agent").json()["agents"] + agent_dicts = 
self.session.get(f"{self.url}/agent").json()["agents"] else: - agent_dicts = self.session.request( - "POST", + agent_dicts = self.session.post( self.url, json=MpxReq.agents(self.ws_dir_path) ).json()["agents"] @@ -238,10 +231,9 @@ def agents(self): def add_agent(self, **kwargs): if not self.multiplexing_mode: - return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) + return self.session.post(f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) else: - self.session.request( - "POST", + self.session.post( self.url, json=MpxReq.add_agent(self.ws_dir_path, OcrdAgentModel.create(**kwargs).dict()) ).json() @@ -258,10 +250,9 @@ def find_files(self, **kwargs): kwargs["file_grp"] = kwargs.pop("fileGrp") if not self.multiplexing_mode: - r = self.session.request(method="GET", url=f"{self.url}/file", params={**kwargs}) + r = self.session.get(url=f"{self.url}/file", params={**kwargs}) else: - r = self.session.request( - "POST", + r = self.session.post( self.url, json=MpxReq.find_files(self.ws_dir_path, {**kwargs}) ) @@ -286,11 +277,11 @@ def add_file( ) if not self.multiplexing_mode: - r = self.session.request("POST", f"{self.url}/file", data=data.dict()) + r = self.session.post(f"{self.url}/file", data=data.dict()) if not r: raise RuntimeError("Add file failed. 
Please check provided parameters") else: - r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict())) + r = self.session.post(self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict())) if "error" in r: raise RuntimeError(f"Add file failed: Msg: {r['error']}") From be6b59d03903e3da153e4d64080f35a07fb5082a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:14:00 +0200 Subject: [PATCH 151/228] Processor: fix ignore (negative/zero) cases for max_workers / max_page_seconds --- src/ocrd/processor/base.py | 16 ++++++++-------- src/ocrd/workspace.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 269f27b703..5f8eabbb05 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -455,19 +455,19 @@ def process_workspace(self, workspace: Workspace) -> None: nr_copied = 0 # set up multithreading - if self.max_workers < 0: - max_workers = config.OCRD_MAX_PARALLEL_PAGES + if self.max_workers <= 0: + max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES) else: - max_workers = min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers) + max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers)) if max_workers > 1: assert isinstance(workspace.mets, ClientSideOcrdMets), \ "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url" - if self.max_page_seconds < 0: - max_seconds = config.OCRD_PROCESSING_PAGE_TIMEOUT + if self.max_page_seconds <= 0: + max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT) else: - max_seconds = min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds) + max_seconds = max(0, min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds)) executor = ThreadPoolExecutor( - max_workers=max_workers, + max_workers=max_workers or 1, thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" ) self._base_logger.debug("started executor %s", str(executor)) @@ -504,7 
+504,7 @@ def process_workspace(self, workspace: Workspace) -> None: # - persistent (data) error → skip / dummy / raise try: self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - task.result(timeout=max_seconds) + task.result(timeout=max_seconds or None) nr_succeeded += 1 # exclude NotImplementedError, so we can try process() below except NotImplementedError: diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 27c56f048d..270414ec41 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -95,8 +95,8 @@ def __init__( if self.is_remote: mets = ClientSideOcrdMets(mets_server_url, self.directory) if mets.workspace_path != self.directory: - raise ValueError(f"METS server {mets_server_url} workspace directory {mets.workspace_path} differs " - f"from local workspace directory {self.directory}. These are not the same workspaces.") + raise ValueError(f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs " + f"from local workspace directory '{self.directory}'. 
These are not the same workspaces.") else: mets = OcrdMets(filename=self.mets_target) self.mets = mets From 0b5286f75d5f70c151b0ff769f9da380b17a9592 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:15:03 +0200 Subject: [PATCH 152/228] test_mets_server: use tmpdir to avoid side effects between suites --- tests/test_mets_server.py | 48 +++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 1487617a71..8f94b95645 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -22,13 +22,16 @@ from requests.exceptions import ConnectionError from ocrd import Resolver, OcrdMetsServer, Workspace -from ocrd_utils import pushd_popd, MIMETYPE_PAGE +from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel -WORKSPACE_DIR = '/tmp/ocrd-mets-server' TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] +initLogging() +setOverrideLogLevel(10) + @fixture(scope='function', name='start_mets_server', params=TRANSPORTS) -def fixture_start_mets_server(request) -> Iterable[Tuple[str, Workspace]]: +def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]: + tmpdir = str(tmpdir) def _start_mets_server(*args, **kwargs): mets_server = OcrdMetsServer(*args, **kwargs) mets_server.startup() @@ -39,21 +42,22 @@ def _start_mets_server(*args, **kwargs): if exists(mets_server_url): remove(mets_server_url) - if exists(WORKSPACE_DIR): - rmtree(WORKSPACE_DIR, ignore_errors=True) + if exists(tmpdir): + rmtree(tmpdir, ignore_errors=True) - copytree(assets.path_to('SBB0000F29300010000/data'), WORKSPACE_DIR) - workspace = Workspace(Resolver(), WORKSPACE_DIR) + copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir) + workspace = Workspace(Resolver(), tmpdir) p = Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param}) p.start() sleep(1) # sleep to start up 
server - yield mets_server_url, Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url) + yield mets_server_url, workspace_server p.terminate() - rmtree(WORKSPACE_DIR, ignore_errors=True) + rmtree(tmpdir, ignore_errors=True) def add_file_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.add_file( 'FOO', local_filename=f'local_filename{i}', @@ -64,8 +68,8 @@ def add_file_server(x): ) def add_agent_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.mets.add_agent( name=f'proc{i}', _type='baz', @@ -82,7 +86,10 @@ def test_mets_server_add_file(start_mets_server): # add NO_FILES files in parallel with Pool() as pool: - pool.map(add_file_server, zip(repeat(mets_server_url), range(NO_FILES))) + pool.map(add_file_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + range(NO_FILES))) assert set(workspace_server.mets.file_groups) == set( [ 'OCR-D-IMG', @@ -107,7 +114,7 @@ def test_mets_server_add_file(start_mets_server): assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == NO_FILES # not yet synced - workspace_file = Workspace(Resolver(), WORKSPACE_DIR) + workspace_file = Workspace(Resolver(), workspace_server.directory) assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == 0 # sync @@ -125,13 +132,16 @@ def test_mets_server_add_agents(start_mets_server): # add NO_AGENTS agents in parallel with Pool() as pool: - pool.map(add_agent_server, 
zip(repeat(mets_server_url), list(range(NO_AGENTS)))) + pool.map(add_agent_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + list(range(NO_AGENTS)))) assert len(workspace_server.mets.agents) == NO_AGENTS + no_agents_before # XXX not a tuple assert workspace_server.mets.agents[-1].notes[0][0] == {'{https://ocr-d.de}foo': 'bar'} - workspace_file = Workspace(Resolver(), WORKSPACE_DIR) + workspace_file = Workspace(Resolver(), workspace_server.directory) assert len(workspace_file.mets.agents) == no_agents_before # sync @@ -142,7 +152,7 @@ def test_mets_server_add_agents(start_mets_server): def test_mets_server_str(start_mets_server): mets_server_url, workspace_server = start_mets_server - workspace_server = Workspace(Resolver(), WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), workspace_server.directory, mets_server_url=mets_server_url) f = next(workspace_server.find_files()) assert str(f) == '' a = workspace_server.mets.agents[0] @@ -182,7 +192,7 @@ def test_mets_server_socket_stop(start_mets_server): assert True, 'No stop conditions to test for TCP server' else: assert Path(mets_server_url).exists() - assert workspace_server.mets.workspace_path == WORKSPACE_DIR + assert workspace_server.mets.workspace_path == workspace_server.directory workspace_server.mets.stop() with raises(ConnectionError): workspace_server.mets.file_groups From 61e1042303cad08d51761f5a5880cfd7ab73f7d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:16:04 +0200 Subject: [PATCH 153/228] test processor timeout/parallel: avoid side effects to dummy tool json --- tests/data/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index c24a6979b2..2bf564d395 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -107,7 +107,8 @@ def process(self): class DummyProcessorWithOutputPagewise(Processor): @property def 
ocrd_tool(self): - dummy_tool = dict(DUMMY_TOOL) + # make deep copy + dummy_tool = json.loads(json.dumps(DUMMY_TOOL)) dummy_tool['parameters']['sleep'] = {'type': 'number'} return dummy_tool From e395b562b9210d75ede7a76c692fe0b74d531434 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:16:55 +0200 Subject: [PATCH 154/228] tess: adapt to wording of exceptions --- tests/processor/test_processor.py | 2 +- tests/test_workspace.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index e0b74fb002..0f5d4fbba3 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -284,7 +284,7 @@ def test_run_output_overwrite(self): run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") - assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists" + assert "already exists" in str(exc.value) config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 9d6b64b1e3..ad9cd15575 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -87,7 +87,7 @@ def test_workspace_add_file_overwrite(plain_workspace): plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys1', local_filename=fpath) with pytest.raises(FileExistsError) as fn_exc: plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id=None, local_filename=fpath) - assert str(fn_exc.value) == "File with file_id='ID1' already exists" + assert "already exists" in str(fn_exc.value) with pytest.raises(FileExistsError) as fn_exc: plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys2', local_filename=fpath, force=True) assert 'cannot mitigate' in str(fn_exc.value) 
@@ -684,7 +684,7 @@ def test_merge_overwrite(tmp_path): ws1.add_file('X', page_id='X', mimetype='X', file_id='id123', local_filename='X/X', content='ws1') ws2.add_file('X', page_id='X', mimetype='X', file_id='id456', local_filename='X/X', content='ws2') ws1.merge(ws2) - assert "would overwrite" == str(exc.value) + assert "would overwrite" in str(exc.value) def test_merge_with_filter(plain_workspace, tmp_path): # arrange From a59ba6acb03f36db5e8e930d9ea5aba3d5329a29 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:31:38 +0200 Subject: [PATCH 155/228] ClientSideOcrdMets: partial revert of 9b80ae17ef --- src/ocrd/mets_server.py | 49 ++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 41b1f23eca..0aa4174d3f 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -147,9 +147,10 @@ def save(self): Request writing the changes to the file system """ if not self.multiplexing_mode: - self.session.put(url=self.url) + self.session.request("PUT", url=self.url) else: - self.session.post( + self.session.request( + "POST", self.url, json=MpxReq.save(self.ws_dir_path) ) @@ -160,10 +161,11 @@ def stop(self): """ try: if not self.multiplexing_mode: - self.session.delete(self.url) + self.session.request("DELETE", self.url) return else: - self.session.post( + self.session.request( + "POST", self.url, json=MpxReq.stop(self.ws_dir_path) ) @@ -176,9 +178,10 @@ def reload(self): Request reloading of the mets file from the file system """ if not self.multiplexing_mode: - return self.session.post(f"{self.url}/reload").text + return self.session.request("POST", f"{self.url}/reload").text else: - return self.session.post( + return self.session.request( + "POST", self.url, json=MpxReq.reload(self.ws_dir_path) ).json()["text"] @@ -186,9 +189,10 @@ def reload(self): @property def unique_identifier(self): if not self.multiplexing_mode: - return 
self.session.get(f"{self.url}/unique_identifier").text + return self.session.request("GET", f"{self.url}/unique_identifier").text else: - return self.session.post( + return self.session.request( + "POST", self.url, json=MpxReq.unique_identifier(self.ws_dir_path) ).json()["text"] @@ -196,10 +200,11 @@ def unique_identifier(self): @property def workspace_path(self): if not self.multiplexing_mode: - self.ws_dir_path = self.session.get(f"{self.url}/workspace_path").text + self.ws_dir_path = self.session.request("GET", f"{self.url}/workspace_path").text return self.ws_dir_path else: - self.ws_dir_path = self.session.post( + self.ws_dir_path = self.session.request( + "POST", self.url, json=MpxReq.workspace_path(self.ws_dir_path) ).json()["text"] @@ -208,9 +213,10 @@ def workspace_path(self): @property def file_groups(self): if not self.multiplexing_mode: - return self.session.get(f"{self.url}/file_groups").json()["file_groups"] + return self.session.request("GET", f"{self.url}/file_groups").json()["file_groups"] else: - return self.session.post( + return self.session.request( + "POST", self.url, json=MpxReq.file_groups(self.ws_dir_path) ).json()["file_groups"] @@ -218,9 +224,10 @@ def file_groups(self): @property def agents(self): if not self.multiplexing_mode: - agent_dicts = self.session.get(f"{self.url}/agent").json()["agents"] + agent_dicts = self.session.request("GET", f"{self.url}/agent").json()["agents"] else: - agent_dicts = self.session.post( + agent_dicts = self.session.request( + "POST", self.url, json=MpxReq.agents(self.ws_dir_path) ).json()["agents"] @@ -231,9 +238,10 @@ def agents(self): def add_agent(self, **kwargs): if not self.multiplexing_mode: - return self.session.post(f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) + return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) else: - self.session.post( + self.session.request( + "POST", self.url, json=MpxReq.add_agent(self.ws_dir_path, 
OcrdAgentModel.create(**kwargs).dict()) ).json() @@ -250,9 +258,10 @@ def find_files(self, **kwargs): kwargs["file_grp"] = kwargs.pop("fileGrp") if not self.multiplexing_mode: - r = self.session.get(url=f"{self.url}/file", params={**kwargs}) + r = self.session.request(method="GET", url=f"{self.url}/file", params={**kwargs}) else: - r = self.session.post( + r = self.session.request( + "POST", self.url, json=MpxReq.find_files(self.ws_dir_path, {**kwargs}) ) @@ -277,11 +286,11 @@ def add_file( ) if not self.multiplexing_mode: - r = self.session.post(f"{self.url}/file", data=data.dict()) + r = self.session.request("POST", f"{self.url}/file", data=data.dict()) if not r: raise RuntimeError("Add file failed. Please check provided parameters") else: - r = self.session.post(self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict())) + r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict())) if "error" in r: raise RuntimeError(f"Add file failed: Msg: {r['error']}") From 554a67d016f8ae5b747826582bd486c3faa396bf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 17:04:04 +0200 Subject: [PATCH 156/228] disableLogging: re-instate root logger, to --- src/ocrd_utils/logging.py | 4 +++- tests/test_decorators.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 5cea55e5b1..181805118d 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -210,11 +210,13 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS: + for logger_name in ROOT_OCRD_LOGGERS + ['']: for handler in logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) for logger_name in LOGGING_DEFAULTS: logging.getLogger(logger_name).setLevel(logging.NOTSET) + # 
Python default log level is WARNING + logging.root.setLevel(logging.WARNING) # Initializing stream handlers at module level # would cause message output in all runtime contexts, diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 5ab2880053..df8d6422be 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -64,6 +64,7 @@ def test_loglevel_override(self): pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test") import logging disableLogging() + assert logging.getLogger('').getEffectiveLevel() == logging.WARNING assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING initLogging() assert logging.getLogger('ocrd').getEffectiveLevel() == logging.INFO From 1114cd9165844aa5bbffa7439177fb00d78fee4e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 17:15:56 +0200 Subject: [PATCH 157/228] test-logging: also remove ocrd.log from tempdir --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e44919d3f3..1a4a6bbdb8 100644 --- a/Makefile +++ b/Makefile @@ -273,7 +273,7 @@ test-logging: assets cp src/ocrd_utils/ocrd_logging.conf $$tempdir; \ cd $$tempdir; \ $(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging -k TestDecorators $(TESTDIR); \ - rm -r $$tempdir/ocrd_logging.conf $$tempdir/.benchmarks; \ + rm -r $$tempdir/ocrd_logging.conf $$tempdir/ocrd.log $$tempdir/.benchmarks; \ rm -rf $$tempdir/.coverage; \ rmdir $$tempdir From ce6d23937c13eae2b5c884e77f2243f94770c737 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 00:27:32 +0200 Subject: [PATCH 158/228] Processor: fix 7966057f (deprecated passing of ocrd_tool or version via init) --- src/ocrd/processor/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5f8eabbb05..614d5cb44a 100644 --- a/src/ocrd/processor/base.py +++ 
b/src/ocrd/processor/base.py @@ -301,12 +301,12 @@ def __init__( if ocrd_tool is not None: deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - " "use or override metadata/executable/ocrd-tool properties instead") - self._ocrd_tool = ocrd_tool - self._executable = ocrd_tool['executable'] + self.ocrd_tool = ocrd_tool + self.executable = ocrd_tool['executable'] if version is not None: deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - " "use or override metadata/version properties instead") - self._version = version + self.version = version if workspace is not None: deprecation_warning("Passing a workspace argument other than 'None' to Processor " "is deprecated - pass as argument to process_workspace instead") From df9916074b77ebbed3fcad6e84c3bba66e65b0d9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 00:50:08 +0200 Subject: [PATCH 159/228] Processor.generate_processor_help: forgot to include --log-filename --- src/ocrd/decorators/ocrd_cli_options.py | 2 +- src/ocrd/processor/base.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index e069b3ea81..944f606458 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -39,6 +39,7 @@ def cli(mets_url): parameter_option, parameter_override_option, loglevel_option, + option('--log-filename', default=None), option('--address', type=ServerAddressParamType()), option('--queue', type=QueueServerParamType()), option('--database', type=DatabaseParamType()), @@ -48,7 +49,6 @@ def cli(mets_url): option('-D', '--dump-module-dir', is_flag=True, default=False), option('-h', '--help', is_flag=True, default=False), option('-V', '--version', is_flag=True, default=False), - option('--log-filename', default=None), # Subcommand, only used for 'worker'/'server'. 
Cannot be handled in # click because processors use the @command decorator and even if they # were using `group`, you cannot combine have a command with diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 614d5cb44a..859b5d4f79 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -991,6 +991,7 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) otherwise URL is a path to an on-demand-created unix socket -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE] Override log level globally [INFO] + --log-filename LOG-PATH File to redirect stderr logging to (overriding ocrd_logging.conf). ''' information_options = '''\ From eb74fab45d1e8fe713801dab92a42710f5ef904d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 00:50:34 +0200 Subject: [PATCH 160/228] bashlib: re-add --log-filename, implement as stderr redirect --- src/ocrd/lib.bash | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 6b08f669d1..476b410966 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -156,6 +156,7 @@ ocrd__parse_argv () { while [[ "${1:-}" = -* ]];do case "$1" in -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;; + --log-filename) exec 2> "$2" ; shift ;; -h|--help|--usage) ocrd__usage; exit ;; -J|--dump-json) ocrd__dumpjson; exit ;; -D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;; From 8565a8f4187df52ac30c40a5d5b03366751882f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 01:19:17 +0200 Subject: [PATCH 161/228] test_processor: add legacy (v2-style) dummy case --- tests/data/__init__.py | 27 +++++++++++++++++++++++++-- tests/processor/test_processor.py | 9 +++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 2bf564d395..1589ae4dbc 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,6 +1,5 @@ import json import os -import re 
from time import sleep from pytest import warns from ocrd import Processor, OcrdPageResult @@ -147,7 +146,7 @@ def __init__(self, *args, **kwargs): # no error handling with old process(), so override new API def process_page_file(self, input_file): - n = int(re.findall(r'\d+', input_file.pageId)[-1]) + n = self.workspace.mets.physical_pages.index(input_file.pageId) + 1 if n % 2: raise Exception(f"intermittent failure on page {input_file.pageId}") output_file_id = make_file_id(input_file, self.output_file_grp) @@ -160,6 +159,30 @@ def process_page_file(self, input_file): force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) +class DummyProcessorWithOutputLegacy(Processor): + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + kwargs['ocrd_tool'] = DUMMY_TOOL + kwargs['version'] = '0.0.1' + super().__init__(*args, **kwargs) + if hasattr(self, 'output_file_grp'): + self.setup() + + def process(self): + # print([str(x) for x in self.input_files] + for input_file in self.input_files: + file_id = make_file_id(input_file, self.output_file_grp) + # print(input_file.ID, file_id) + self.workspace.add_file( + file_id=file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + mimetype=input_file.mimetype, + local_filename=os.path.join(self.output_file_grp, file_id), + content='CONTENT', + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + class IncompleteProcessor(Processor): @property def executable(self): diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 0f5d4fbba3..1faef5be70 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -9,6 +9,7 @@ DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, + DummyProcessorWithOutputLegacy, DummyProcessorWithOutputPagewise, DummyProcessorWithOutputFailures, IncompleteProcessor @@ -218,6 +219,14 @@ def test_run_output0(self): output_file_grp="OCR-D-OUT") assert 
len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 2 + def test_run_output_legacy(self): + ws = self.workspace + run_processor(DummyProcessorWithOutputLegacy, + workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + def test_run_output_missing(self): ws = self.workspace from ocrd_utils import config From abe069a490ee541ae04ef8f6214ab4250c880570 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 01:48:44 +0200 Subject: [PATCH 162/228] :memo: update changelog --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a3807c0b9..b9e660ebcd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,10 +12,13 @@ Added: - `OCRD_PROCESSING_PAGE_TIMEOUT` for whether and how long processors should wait for single pages - `OCRD_MAX_MISSING_OUTPUTS` for maximum rate (fraction) of pages before making `OCRD_MISSING_OUTPUT=abort` +Fixed: + - `disableLogging`: also re-instate root logger to Python defaults + ## [3.0.0b1] - 2024-08-26 Fixed: - - actually apply CLI `--log-filename` + - actually apply CLI `--log-filename`, and show in `--help` - adapt to Pillow changes - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) From 11f926412f5b71a05483eac34d9313a652ba3b5b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 01:48:46 +0200 Subject: [PATCH 163/228] :memo: update readmes (esp. new config variables) --- README.md | 40 +++++++++++++++++++++++++++++++--------- README_bashlib.md | 10 +++++++++- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index b401428ee0..d41a2dddb6 100644 --- a/README.md +++ b/README.md @@ -47,17 +47,12 @@ complete stack of OCR-D-related software. The easiest way to install is via `pip`: -```sh -pip install ocrd + pip install ocrd -# or just the functionality you need, e.g. 
- -pip install ocrd_modelfactory -``` All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher. -**NOTE** Some OCR-D-Tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like: +> **NOTE** Some OCR-D tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like: * using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes * custom Python logging configurations in your personal account @@ -82,7 +77,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an Some parts of the software are configured via environment variables: -* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification. * `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens: * `CPU`: Enable CPU profiling of processor runs * `RSS`: Enable RSS memory profiling @@ -95,18 +89,46 @@ Some parts of the software are configured via environment variables: * `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`. * `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`. -* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files. +* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files. * `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading. 
+* `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing: + * `SKIP`: ignore and proceed with next page's input + * `ABORT`: throw `MissingInputFile` exception + +* `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing: + * `SKIP`: ignore and proceed processing next page + * `COPY`: fall back to copying input PAGE to output fileGrp for page + * `ABORT`: re-throw whatever caused processing to fail + +* `OCRD_MAX_MISSING_OUTPUTS`: Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative). + +* `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing: + * `SKIP`: ignore and proceed processing next page + * `OVERWRITE`: force writing result to output fileGrp for page + * `ABORT`: re-throw `FileExistsError` exception + + * `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations. * `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers. +* `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation. + +* `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies. + * `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`). * `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`). 
* `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`). * `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started. +* `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again. +* `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds). + +* `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created. +* `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored. + + ## Packages diff --git a/README_bashlib.md b/README_bashlib.md index 09199468cc..20379c3c92 100644 --- a/README_bashlib.md +++ b/README_bashlib.md @@ -21,6 +21,9 @@ For example: * [`ocrd__log`](#ocrd__log) * [`ocrd__minversion`](#ocrd__minversion) * [`ocrd__dumpjson`](#ocrd__dumpjson) +* [`ocrd__resolve_resource`](#ocrd__resolve_resource) +* [`ocrd__show_resource`](#ocrd__show_resource) +* [`ocrd__list_resources`](#ocrd__list_resources) * [`ocrd__usage`](#ocrd__usage) * [`ocrd__parse_argv`](#ocrd__parse_argv) @@ -56,6 +59,10 @@ export OCRD_TOOL_NAME=ocrd-foo-bar (Which you automatically get from [`ocrd__wrap`](#ocrd__wrap).) +### `ocrd__resolve_resource` + +Output given resource file's path. + ### `ocrd__show_resource` Output given resource file's content. 
@@ -88,6 +95,7 @@ This will be filled by the parser along the following keys: - `profile`: whether `--profile` is enabled - `profile_file`: the argument of `--profile-file` - `log_level`: the argument of `--log-level` +- `mets_server_url`: the argument of `--mets-server-url` - `mets_file`: absolute path of the `--mets` argument - `working_dir`: absolute path of the `--working-dir` argument or the parent of `mets_file` - `page_id`: the argument of `--page-id` @@ -95,7 +103,7 @@ This will be filled by the parser along the following keys: - `output_file_grp`: the argument of `--output-file-grp` Moreover, there will be an associative array **`params`** -with the fully expanded runtime values of the ocrd-tool.json parameters. +with the fully validated and default-expanded runtime values of the `ocrd-tool.json` parameters. ### `ocrd__wrap` From ca8812228b87c635adeefe74be4c48a89ff063c0 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 30 Aug 2024 13:26:53 +0200 Subject: [PATCH 164/228] :package: v3.0.0b2 --- CHANGELOG.md | 7 +++---- VERSION | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9e660ebcd..ccb31bc1ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ Change Log Versioned according to [Semantic Versioning](http://semver.org/). 
-## Unreleased +## [3.0.0b2] - 2024-08-30 Added: - `Processor.max_workers`: class attribute to control per-page parallelism of this implementation @@ -2244,12 +2244,11 @@ Fixed Initial Release -<<<<<<< HEAD +[3.0.0b2]: ../../compare/v3.0.0b2..v3.0.0b1 +[3.0.0b1]: ../../compare/v3.0.0b1..v3.0.0a2 [3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1 [3.0.0a1]: ../../compare/v3.0.0a1..v2.67.2 -======= [2.68.0]: ../../compare/v2.68.0..v2.67.2 ->>>>>>> @{-1} [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 [2.67.0]: ../../compare/v2.67.0..v2.66.1 diff --git a/VERSION b/VERSION index 2daa89b06c..2aa4d8f0a8 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b1 +3.0.0b2 From 837aba7f55fa5b042bda699acf0dea12b3a20f67 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 22:13:01 +0200 Subject: [PATCH 165/228] ocrd_utils.config: add reset_defaults() --- src/ocrd_utils/config.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 0186b8539b..36399870e2 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -78,14 +78,26 @@ def has_default(self, name): raise ValueError(f"Unregistered env variable {name}") return self._variables[name].has_default + def reset_defaults(self): + for name in self._variables: + try: + # we cannot use hasattr, because that delegates to getattr, + # which we override and provide defaults for (which of course + # cannot be removed) + if self.__getattribute__(name): + delattr(self, name) + except AttributeError: + pass + def describe(self, name, *args, **kwargs): if not name in self._variables: raise ValueError(f"Unregistered env variable {name}") return self._variables[name].describe(*args, **kwargs) def __getattr__(self, name): + # will be called if name is not accessible (has not been added directly yet) if not name in self._variables: - raise ValueError(f"Unregistered env variable {name}") + raise 
AttributeError(f"Unregistered env variable {name}") var_obj = self._variables[name] try: raw_value = self.raw_value(name) From 85e96ffbd787ec3aedac73a9368bdd903124b877 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 22:13:31 +0200 Subject: [PATCH 166/228] add test for OcrdEnvConfig.reset_defaults() --- tests/utils/test_config.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 99595a864c..a94eb5d3cc 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -57,3 +57,11 @@ def test_OCRD_PROFILE(): with temp_env_var('OCRD_PROFILE', 'some other value'): with raises(ValueError, match="'OCRD_PROFILE' set to invalid value 'some other value'"): config.OCRD_PROFILE + +def test_defaults(): + default = config.OCRD_MAX_PROCESSOR_CACHE + print(type(default)) + config.OCRD_MAX_PROCESSOR_CACHE = 2 + assert config.OCRD_MAX_PROCESSOR_CACHE == 2 + config.reset_defaults() + assert config.OCRD_MAX_PROCESSOR_CACHE == default From 8911c3be2947870a0193a688fa3a6cf72a9751bc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 15:17:14 +0200 Subject: [PATCH 167/228] Processor: improve processing log messages --- src/ocrd/processor/base.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 859b5d4f79..a72e4dd3da 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -470,7 +470,7 @@ def process_workspace(self, workspace: Workspace) -> None: max_workers=max_workers or 1, thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" ) - self._base_logger.debug("started executor %s", str(executor)) + self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) tasks = {} for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): @@ -478,7 +478,7 @@ def process_workspace(self, workspace: Workspace) 
-> None: page_id = next(input_file.pageId for input_file in input_file_tuple if input_file) - self._base_logger.info(f"processing page {page_id}") + self._base_logger.info(f"preparing page {page_id}") for i, input_file in enumerate(input_file_tuple): if input_file is None: # file/page not found in this file grp @@ -521,9 +521,10 @@ def process_workspace(self, workspace: Workspace) -> None: # broad coverage of output failures (including TimeoutError) except (Exception, TimeoutError) as err: # FIXME: add re-usable/actionable logging - self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") if config.OCRD_MISSING_OUTPUT == 'ABORT': + self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") raise err + self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") if config.OCRD_MISSING_OUTPUT == 'SKIP': nr_skipped += 1 continue @@ -587,6 +588,7 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) assert isinstance(input_files[0], get_args(OcrdFileType)) page_id = input_files[0].pageId + self._base_logger.info("processing page %s", page_id) for i, input_file in enumerate(input_files): assert isinstance(input_file, get_args(OcrdFileType)) self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") From 98d97fc5b2afa7917ffc04335937ccb4fdbbc984 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 15:18:00 +0200 Subject: [PATCH 168/228] ocrd.cli doc: don't rewrap description lists --- src/ocrd/cli/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index fee3c47d88..9e8a37b8bf 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -61,11 +61,11 @@ def get_help(self, ctx): \b {config.describe('OCRD_DOWNLOAD_INPUT')} \b 
-{config.describe('OCRD_MISSING_INPUT')} +{config.describe('OCRD_MISSING_INPUT', wrap_text=False)} \b -{config.describe('OCRD_MISSING_OUTPUT')} +{config.describe('OCRD_MISSING_OUTPUT', wrap_text=False)} \b -{config.describe('OCRD_EXISTING_OUTPUT')} +{config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False)} \b {config.describe('OCRD_METS_CACHING')} \b From cb758e8dfc97aa5a41f5ee08fae3e459c4c283cc Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 30 Aug 2024 15:42:07 +0200 Subject: [PATCH 169/228] :package: v3.0.0b3 --- CHANGELOG.md | 7 +++++++ VERSION | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ccb31bc1ac..115d68334a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ Change Log Versioned according to [Semantic Versioning](http://semver.org/). +## [3.0.0b3] - 2024-08-30 + +Added: + + * `OcrdConfig.reset_defaults` to reset config variables to their defaults + ## [3.0.0b2] - 2024-08-30 Added: @@ -2244,6 +2250,7 @@ Fixed Initial Release +[3.0.0b3]: ../../compare/v3.0.0b3..v3.0.0b2 [3.0.0b2]: ../../compare/v3.0.0b2..v3.0.0b1 [3.0.0b1]: ../../compare/v3.0.0b1..v3.0.0a2 [3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1 diff --git a/VERSION b/VERSION index 2aa4d8f0a8..005e92c1e4 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b2 +3.0.0b3 From 1ed38a6a7559bc0c109e8130220de49698796efd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 31 Aug 2024 01:56:38 +0200 Subject: [PATCH 170/228] Processor.metadata_location: find location package prefix (necessary for namespace packages) --- src/ocrd/processor/base.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index a72e4dd3da..a18e53f5af 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -166,11 +166,14 @@ def metadata_location(self) -> Path: (Override if ``ocrd-tool.json`` is not distributed with the Python package.) 
""" - # XXX HACK - module_tokens = self.__module__.split('.') - if module_tokens[0] == 'src': - module_tokens.pop(0) - return resource_filename(module_tokens[0], self.metadata_filename) + module = inspect.getmodule(self) + module_tokens = module.__package__.split('.') + # for namespace packages, we cannot just use the first token + for i in range(len(module_tokens)): + prefix = '.'.join(module_tokens[:i + 1]) + if sys.modules[prefix].__spec__.has_location: + return resource_filename(prefix, self.metadata_filename) + raise Exception("cannot find top-level module prefix for %s", module.__package__) @cached_property def metadata_rawdict(self) -> dict: From 7d98c270110eda161ca7401645aab778a3911542 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 10:13:38 +0200 Subject: [PATCH 171/228] Processor: log when max_workers / max_page_seconds are in effect --- src/ocrd/processor/base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index a18e53f5af..a28643660a 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -458,17 +458,17 @@ def process_workspace(self, workspace: Workspace) -> None: nr_copied = 0 # set up multithreading - if self.max_workers <= 0: - max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES) - else: - max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers)) + max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES) + if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES: + self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers) + max_workers = self.max_workers if max_workers > 1: assert isinstance(workspace.mets, ClientSideOcrdMets), \ "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url" - if self.max_page_seconds <= 0: - max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT) - else: - max_seconds = max(0, 
min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds)) + max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT) + if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT: + self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) + max_seconds = self.max_page_seconds executor = ThreadPoolExecutor( max_workers=max_workers or 1, thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" From 6b23b659f514f3672c7f5c6d29feeb1d85a26435 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 10:14:13 +0200 Subject: [PATCH 172/228] Workspace.reload_mets: fix for METS server case --- src/ocrd/workspace.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 270414ec41..4a99a112c1 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -121,7 +121,10 @@ def reload_mets(self): """ Reload METS from the filesystem. """ - self.mets = OcrdMets(filename=self.mets_target) + if self.is_remote: + self.mets.reload() + else: + self.mets = OcrdMets(filename=self.mets_target) @deprecated_alias(pageId="page_id") @deprecated_alias(ID="file_id") From cac05cd31a4cf8a6aa4d3aef7b9b772ec4de96a6 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 2 Sep 2024 11:35:52 +0200 Subject: [PATCH 173/228] :memo: changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 115d68334a..767dea5c20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ Change Log Versioned according to [Semantic Versioning](http://semver.org/). 
+## Unreleased + +Fixed: + + * `Processor.metadata_location`: `src` workaround respects namespace packages, qurator-spk/eynollah#134 + * `Workspace.reload_mets`: handle ClientSideOcrdMets as well + ## [3.0.0b3] - 2024-08-30 Added: From 0b0d419c3304747f25cb3be509b182e061870be5 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 2 Sep 2024 11:36:21 +0200 Subject: [PATCH 174/228] :package: v3.0.0b4 --- CHANGELOG.md | 2 ++ VERSION | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 767dea5c20..7ec12c8934 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0b4] - 2024-09-02 + Fixed: * `Processor.metadata_location`: `src` workaround respects namespace packages, qurator-spk/eynollah#134 diff --git a/VERSION b/VERSION index 005e92c1e4..9414e12700 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b3 +3.0.0b4 From a34beb8ec01c8af34eb7a58fd7fd8ba4f362dc46 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 14:59:42 +0200 Subject: [PATCH 175/228] OcrdMetsServer.add_file: pass on 'force' kwarg, too --- src/ocrd/mets_server.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 0aa4174d3f..2f7b9842b3 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -284,15 +284,17 @@ def add_file( file_id=ID, page_id=pageId, mimetype=mimetype, url=url, local_filename=local_filename ) + # add force+ignore + kwargs = {**kwargs, **data.dict()} if not self.multiplexing_mode: - r = self.session.request("POST", f"{self.url}/file", data=data.dict()) - if not r: - raise RuntimeError("Add file failed. 
Please check provided parameters") + r = self.session.request("POST", f"{self.url}/file", data=kwargs) + if not r.ok: + raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()}") else: - r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict())) - if "error" in r: - raise RuntimeError(f"Add file failed: Msg: {r['error']}") + r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, kwargs)) + if not r.ok: + raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()[errors]}") return ClientSideOcrdFile( None, fileGrp=file_grp, @@ -505,7 +507,8 @@ async def add_file( page_id: Optional[str] = Form(), mimetype: str = Form(), url: Optional[str] = Form(None), - local_filename: Optional[str] = Form(None) + local_filename: Optional[str] = Form(None), + force: bool = Form(False), ): """ Add a file @@ -517,7 +520,7 @@ async def add_file( ) # Add to workspace kwargs = file_resource.dict() - workspace.add_file(**kwargs) + workspace.add_file(**kwargs, force=force) return file_resource # ------------- # From dfa715db56ce538e3b6dc4f849983e392bff4296 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 15:00:38 +0200 Subject: [PATCH 176/228] test_mets_server: add test for force (overwrite) --- tests/test_mets_server.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 8f94b95645..dc94d6c560 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -55,7 +55,7 @@ def _start_mets_server(*args, **kwargs): p.terminate() rmtree(tmpdir, ignore_errors=True) -def add_file_server(x): +def add_file_server(x, force=False): mets_server_url, directory, i = x workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.add_file( @@ -65,6 +65,7 @@ def add_file_server(x): page_id=f'page{i}', file_id=f'FOO_page{i}_foo{i}', # url=f'url{i}' + force=force ) def 
add_agent_server(x): @@ -123,6 +124,19 @@ def test_mets_server_add_file(start_mets_server): assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == NO_FILES +def test_mets_server_add_file_overwrite(start_mets_server): + mets_server_url, workspace_server = start_mets_server + + add_file_server((mets_server_url, workspace_server.directory, 5)) + + assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1 + + with raises(RuntimeError, match="already exists"): + add_file_server((mets_server_url, workspace_server.directory, 5)) + + add_file_server((mets_server_url, workspace_server.directory, 5), force=True) + assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1 + def test_mets_server_add_agents(start_mets_server): NO_AGENTS = 30 From 9a8c41db32e08795dcd5c1614d654e820911abdb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 15:01:37 +0200 Subject: [PATCH 177/228] test_processor: add test for force (overwrite) w/ METS Server --- tests/data/__init__.py | 2 +- tests/processor/test_processor.py | 35 +++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 1589ae4dbc..11b7b01cc2 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -103,7 +103,7 @@ def process(self): force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) -class DummyProcessorWithOutputPagewise(Processor): +class DummyProcessorWithOutputSleep(Processor): @property def ocrd_tool(self): # make deep copy diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 1faef5be70..4305f0e68e 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -10,7 +10,7 @@ DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, DummyProcessorWithOutputLegacy, - DummyProcessorWithOutputPagewise, + DummyProcessorWithOutputSleep, DummyProcessorWithOutputFailures, IncompleteProcessor ) @@ -266,7 +266,7 @@ 
def test_run_output_timeout(self): config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MISSING_OUTPUT = 'ABORT' config.OCRD_PROCESSING_PAGE_TIMEOUT = 3 - run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", parameter={"sleep": 1}) @@ -275,7 +275,7 @@ def test_run_output_timeout(self): config.OCRD_PROCESSING_PAGE_TIMEOUT = 1 from concurrent.futures import TimeoutError with pytest.raises(TimeoutError) as exc: - run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", parameter={"sleep": 3}) @@ -419,6 +419,33 @@ def ocrd_tool(self): r = self.capture_out_err() assert 'ERROR ocrd.processor.base - Found no file for page phys_0001 in file group GRP1' in r.err +def test_run_output_metsserver(start_mets_server): + mets_server_url, ws = start_mets_server + from ocrd_utils import config + # do not raise for number of failures: + config.OCRD_MAX_MISSING_OUTPUTS = -1 + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 0}, + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 0}, + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.OCRD_EXISTING_OUTPUT = 'ABORT' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputSleep, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + parameter={"sleep": 0}, + 
mets_server_url=mets_server_url) + assert "already exists" in str(exc.value) + # 2s (+ 2s tolerance) instead of 3*3s (+ 2s tolerance) @pytest.mark.timeout(4) def test_run_output_parallel(start_mets_server): @@ -429,7 +456,7 @@ def test_run_output_parallel(start_mets_server): # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MAX_PARALLEL_PAGES = 3 - run_processor(DummyProcessorWithOutputPagewise, workspace=ws, + run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", parameter={"sleep": 2}, From 65ab63c7bde1fa468c63cb1f29a497d3a8f55fcf Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 26 Aug 2024 11:46:14 +0200 Subject: [PATCH 178/228] add typing, extend docs --- src/ocrd/processor/base.py | 4 ++-- src/ocrd/processor/helpers.py | 9 +++++---- tests/data/__init__.py | 13 +++++++------ tests/processor/test_processor.py | 3 ++- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index a28643660a..6931768009 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -16,7 +16,7 @@ import os from os import getcwd from pathlib import Path -from typing import List, Optional, Union, get_args +from typing import Any, List, Optional, Union, get_args import sys import inspect import tarfile @@ -339,7 +339,7 @@ def __init__( self._finalizer = weakref.finalize(self, self.shutdown) # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', - deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process'))) + deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process'))) def show_help(self, subcommand=None): """ diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py 
index a675ff129e..22837e2120 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -234,10 +234,10 @@ def get_cached_processor(parameter: dict, processor_class): def get_processor( processor_class, parameter: Optional[dict] = None, - workspace: Workspace = None, - page_id: str = None, - input_file_grp: List[str] = None, - output_file_grp: List[str] = None, + workspace: Optional[Workspace] = None, + page_id: Optional[str] = None, + input_file_grp: Optional[List[str]] = None, + output_file_grp: Optional[List[str]] = None, instance_caching: bool = False, ): if processor_class: @@ -258,6 +258,7 @@ def get_processor( else: # avoid passing workspace already (deprecated chdir behaviour) processor = processor_class(None, parameter=parameter) + assert processor # set current processing parameters processor.workspace = workspace processor.page_id = page_id diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 11b7b01cc2..4dcf29fa02 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,3 +1,4 @@ +from functools import cached_property import json import os from time import sleep @@ -72,15 +73,15 @@ def __init__(self, *args, **kwargs): def process(self): pass class DummyProcessorWithOutput(Processor): - @property + @cached_property def ocrd_tool(self): return DUMMY_TOOL - @property + @cached_property def version(self): return '0.0.1' - @property + @cached_property def executable(self): return 'ocrd-test' @@ -128,15 +129,15 @@ def process_page_pcgts(self, pcgts, page_id=None): return OcrdPageResult(pcgts) class DummyProcessorWithOutputFailures(Processor): - @property + @cached_property def ocrd_tool(self): return DUMMY_TOOL - @property + @cached_property def version(self): return '0.0.1' - @property + @cached_property def executable(self): return 'ocrd-test' diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 4305f0e68e..1497927a08 100644 --- a/tests/processor/test_processor.py +++ 
b/tests/processor/test_processor.py @@ -1,3 +1,4 @@ +from functools import cached_property import json from contextlib import ExitStack @@ -188,7 +189,7 @@ def test_params_preset_resolve(self): def test_params(self): class ParamTestProcessor(Processor): - @property + @cached_property def ocrd_tool(self): return {} proc = ParamTestProcessor(None) From 73a395e40ce4aed9007a6959af56476503dcb008 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Sep 2024 22:19:23 +0200 Subject: [PATCH 179/228] Processor.verify: revert 5819c816 (we still have no defaults in json loaded from v2) --- src/ocrd/processor/base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 6931768009..02595f4c21 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -374,10 +374,13 @@ def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], assert len(grps) >= minimum, msg % (len(grps), str(spec)) if maximum > 0: assert len(grps) <= maximum, msg % (len(grps), str(spec)) - assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'], - "Unexpected number of input file groups %d vs %s") - assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], - "Unexpected number of output file groups %d vs %s") + # FIXME: enforce unconditionally as soon as grace period for deprecation is over + if 'input_file_grp_cardinality' in self.ocrd_tool: + assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'], + "Unexpected number of input file groups %d vs %s") + if 'output_file_grp_cardinality' in self.ocrd_tool: + assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], + "Unexpected number of output file groups %d vs %s") for input_file_grp in input_file_grps: assert input_file_grp in self.workspace.mets.file_groups # keep this for backwards 
compatibility: From 3382ad985f911aadcfb1a1a3b62de8de25d424a0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Sep 2024 22:21:52 +0200 Subject: [PATCH 180/228] Processor.process_page_file / OcrdPageResultImage: allow None instead of AlternativeImageType --- src/ocrd/processor/base.py | 2 ++ src/ocrd/processor/ocrd_page_result.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 02595f4c21..94e8cce547 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -619,6 +619,8 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: image_result.alternative_image.set_imageHeight(image_result.pil.height) elif isinstance(image_result.alternative_image, AlternativeImageType): image_result.alternative_image.set_filename(image_file_path) + elif image_result.alternative_image is None: + pass # do not reference in PAGE result else: raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type " f"{type(image_result.alternative_image)}") diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py index dcd8ccd44d..5f21a72f57 100644 --- a/src/ocrd/processor/ocrd_page_result.py +++ b/src/ocrd/processor/ocrd_page_result.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import List, Union +from typing import List, Union, Optional from ocrd_models.ocrd_page import OcrdPage from PIL.Image import Image @@ -9,7 +9,7 @@ class OcrdPageResultImage(): pil : Image file_id_suffix : str - alternative_image : Union[AlternativeImageType, PageType] + alternative_image : Optional[Union[AlternativeImageType, PageType]] @dataclass class OcrdPageResult(): From cad477723599ef5ae271e73e5f39f91fef8ae1e8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:59:51 +0200 Subject: [PATCH 181/228] PcGts.Page.id / make_xml_id: replace '/' with '_' --- src/ocrd_utils/str.py | 3 
++- tests/model/test_ocrd_page.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index 6a973fac73..13d03cc5b8 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -108,10 +108,11 @@ def make_xml_id(idstr: str) -> str: ret = idstr if not REGEX_FILE_ID.fullmatch(ret): ret = ret.replace(':', '_') + ret = ret.replace('/', '_') ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret) ret = re.sub(r'[^\w.-]', r'', ret) return ret - + def nth_url_segment(url, n=-1): """ Return the last /-delimited segment of a URL-like string diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 7dc130809f..97335775d6 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -460,7 +460,7 @@ def test_id(): # TODO: is this *really* desired? # I would expect for a single Page-Element the ID is like from the top-level-Pgts-Container, not like a fileName - assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif' + assert pcgts.get_Page().id == 'OCR-D-IMG_INPUT_0017.tif' if __name__ == '__main__': From 10b2abc0b0d7cd4125c5ecb66f66306b05119873 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:35:07 +0200 Subject: [PATCH 182/228] ocrd.cli.ocrd-tool resolve-resource: fix (forgot to print result) --- src/ocrd/cli/ocrd_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index fa815daeb9..3ceaba40c5 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -125,7 +125,7 @@ def ocrd_tool_tool_list_resources(ctx): @click.argument('res_name') @pass_ocrd_tool def ocrd_tool_tool_resolve_resource(ctx, res_name): - ctx.processor(None).resolve_resource(res_name) + print(ctx.processor(None).resolve_resource(res_name)) @ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource") @click.argument('res_name') From bd644441605cba4ae5cb433aba7b92bc3c475ff3 Mon Sep 
17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:10:12 +0200 Subject: [PATCH 183/228] processor CLI: delegate --resolve-resource, too --- src/ocrd/decorators/ocrd_cli_options.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index 944f606458..a401264ed2 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -43,6 +43,7 @@ def cli(mets_url): option('--address', type=ServerAddressParamType()), option('--queue', type=QueueServerParamType()), option('--database', type=DatabaseParamType()), + option('-R', '--resolve-resource'), option('-C', '--show-resource'), option('-L', '--list-resources', is_flag=True, default=False), option('-J', '--dump-json', is_flag=True, default=False), From 71e9841ad5a1c6e73c83a605e9f9a26a9b2e2b25 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 16:32:55 +0200 Subject: [PATCH 184/228] METS Server: also export+delegate physical_pages --- src/ocrd/mets_server.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 2f7b9842b3..101727e064 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -88,6 +88,14 @@ def create(file_groups: List[str]): return OcrdFileGroupListModel(file_groups=file_groups) +class OcrdPageListModel(BaseModel): + physical_pages: List[str] = Field() + + @staticmethod + def create(physical_pages: List[str]): + return OcrdPageListModel(physical_pages=physical_pages) + + class OcrdAgentListModel(BaseModel): agents: List[OcrdAgentModel] = Field() @@ -210,6 +218,17 @@ def workspace_path(self): ).json()["text"] return self.ws_dir_path + @property + def physical_pages(self) -> List[str]: + if not self.multiplexing_mode: + return self.session.request("GET", f"{self.url}/physical_pages").json()["physical_pages"] + else: + return self.session.request( + "POST", + 
self.url, + json=MpxReq.physical_pages(self.ws_dir_path) + ).json()["physical_pages"] + @property def file_groups(self): if not self.multiplexing_mode: @@ -349,6 +368,11 @@ def workspace_path(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( ws_dir_path, method_type="GET", response_type="text", request_url="workspace_path", request_data={}) + @staticmethod + def physical_pages(ws_dir_path: str) -> Dict: + return MpxReq.__args_wrapper( + ws_dir_path, method_type="GET", response_type="dict", request_url="physical_pages", request_data={}) + @staticmethod def file_groups(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( @@ -468,6 +492,10 @@ async def unique_identifier(): async def workspace_path(): return Response(content=workspace.directory, media_type="text/plain") + @app.get(path='/physical_pages', response_model=OcrdPageListModel) + async def physical_pages(): + return {'physical_pages': workspace.mets.physical_pages} + @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): return {'file_groups': workspace.mets.file_groups} From 01ccdf152456e261a9334d2bebb9f1703ff53477 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:57:08 +0200 Subject: [PATCH 185/228] ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) --- src/ocrd/cli/workspace.py | 85 +++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 47 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 3aece34933..ca59916cdd 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -36,6 +36,17 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url) self.automatic_backup = automatic_backup + def workspace(self): + return Workspace( + self.resolver, + directory=self.directory, + mets_basename=self.mets_basename, + 
automatic_backup=self.automatic_backup, + mets_server_url=self.mets_server_url, + ) + def backup_manager(self): + return WorkspaceBackupManager(self.workspace()) + pass_workspace = click.make_pass_decorator(WorkspaceCtx) @@ -138,6 +149,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir + assert not ctx.mets_server_url workspace = ctx.resolver.workspace_from_url( mets_url, dst_dir=ctx.directory, @@ -173,10 +185,11 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory + assert not ctx.mets_server_url workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, - clobber_mets=clobber_mets + clobber_mets=clobber_mets, ) workspace.save_mets() print(workspace.directory) @@ -200,13 +213,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ Add a file or http(s) URL FNAME to METS in a workspace. If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace. 
""" - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - automatic_backup=ctx.automatic_backup, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() log = getLogger('ocrd.cli.workspace.add') if not mimetype: @@ -313,13 +320,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi """ log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - automatic_backup=ctx.automatic_backup, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() try: pat = re.compile(regex) @@ -455,12 +456,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl output_field = [snake_to_camel.get(x, x) for x in output_field] modified_mets = False ret = [] - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( file_id=file_id, @@ -510,7 +506,7 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) workspace.save_mets() @@ -528,7 +524,7 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). 
""" - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -549,7 +545,7 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) workspace.save_mets() @@ -571,7 +567,7 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( file_id=file_id, @@ -608,8 +604,7 @@ def clean(ctx, dry_run, directories, path_glob): If no PATH_GLOB are specified, then all files and directories may match. 
""" - log = getLogger('ocrd.cli.workspace.clean') - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() allowed_files = [normpath(f.local_filename) for f in workspace.find_files(local_only=True)] allowed_files.append(relpath(workspace.mets_target, start=workspace.directory)) allowed_dirs = set(dirname(path) for path in allowed_files) @@ -627,7 +622,7 @@ def clean(ctx, dry_run, directories, path_glob): if normpath(path) in allowed_files: continue if dry_run: - log.info('unlink(%s)' % path) + ctx.log.info('unlink(%s)' % path) else: unlink(path) if not directories: @@ -637,7 +632,7 @@ def clean(ctx, dry_run, directories, path_glob): if normpath(path) in allowed_dirs: continue if dry_run: - log.info('rmdir(%s)' % path) + ctx.log.info('rmdir(%s)' % path) else: rmdir(path) @@ -651,7 +646,7 @@ def list_groups(ctx): """ List fileGrp USE attributes """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() print("\n".join(workspace.mets.file_groups)) # ---------------------------------------------------------------------- @@ -677,7 +672,7 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) 
""" - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() find_kwargs = {} if page_id_range and 'ID' in output_field: find_kwargs['pageId'] = page_id_range @@ -724,7 +719,7 @@ def get_id(ctx): """ Get METS id if any """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() ID = workspace.mets.unique_identifier if ID: print(ID) @@ -744,7 +739,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin Otherwise will create a new {{ ID }}. """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() workspace.mets.unique_identifier = id workspace.save_mets() @@ -767,7 +762,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() except Exception as err: @@ -805,7 +800,7 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name)) workspace.merge( other_workspace, @@ -829,11 +824,12 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa # ---------------------------------------------------------------------- @workspace_cli.group('backup') -@click.pass_context 
+@pass_workspace def workspace_backup_cli(ctx): # pylint: disable=unused-argument """ Backing and restoring workspaces - dev edition """ + assert not ctx.mets_server_url, "Workspace backups currently not interoperable with METS Server" @workspace_backup_cli.command('add') @pass_workspace @@ -841,7 +837,7 @@ def workspace_backup_add(ctx): """ Create a new backup """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.add() @workspace_backup_cli.command('list') @@ -850,7 +846,7 @@ def workspace_backup_list(ctx): """ List backups """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() for b in backup_manager.list(): print(b) @@ -862,7 +858,7 @@ def workspace_backup_restore(ctx, choose_first, bak): """ Restore backup BAK """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.restore(bak, choose_first) @workspace_backup_cli.command('undo') @@ -871,7 +867,7 @@ def workspace_backup_undo(ctx): """ Restore the last backup """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.undo() @@ -888,13 +884,8 @@ def workspace_serve_cli(ctx): # pylint: disable=unused-argument @workspace_serve_cli.command('stop') @pass_workspace def workspace_serve_stop(ctx): # pylint: disable=unused-argument - """Stop the METS server""" - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - 
mets_server_url=ctx.mets_server_url, - ) + """Stop the METS server (saving changes to disk)""" + workspace = ctx.workspace() workspace.mets.stop() @workspace_serve_cli.command('start') From 3301f9c64071d0fe7a8b99038b8080263c2bb6a1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:57:32 +0200 Subject: [PATCH 186/228] ocrd.cli.workspace server: add 'reload' and 'save' --- src/ocrd/cli/workspace.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index ca59916cdd..ca4e8629db 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -888,6 +888,20 @@ def workspace_serve_stop(ctx): # pylint: disable=unused-argument workspace = ctx.workspace() workspace.mets.stop() +@workspace_serve_cli.command('reload') +@pass_workspace +def workspace_serve_reload(ctx): # pylint: disable=unused-argument + """Reload the METS server from disk""" + workspace = ctx.workspace() + workspace.mets.reload() + +@workspace_serve_cli.command('save') +@pass_workspace +def workspace_serve_save(ctx): # pylint: disable=unused-argument + """Save the METS changes to disk""" + workspace = ctx.workspace() + workspace.mets.save() + @workspace_serve_cli.command('start') @pass_workspace def workspace_serve_start(ctx): # pylint: disable=unused-argument From dc2c758e80318b1f230ad1ce80f5c38ac97d425e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:32:32 +0200 Subject: [PATCH 187/228] ocrd.cli.bashlib input-files: pass on --mets-server-url, too --- src/ocrd/cli/bashlib.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index d46c81ee46..b6817abe91 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -76,10 +76,10 @@ def bashlib_constants(name): @click.option('--ocrd-tool', help="path to ocrd-tool.json of processor to feed", default=None) @click.option('--executable', help="name of 
processor executable in ocrd-tool.json", default=None) @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME) -@click.option('-w', '--working-dir', help="Working Directory") +@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server', default=None) +@click.option('-d', '--working-dir', help="Working Directory") @click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None) @click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None) -# repeat some other processor options for convenience (will be ignored here) @click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist\n" "(with '--page-id', remove only those).\n" @@ -126,9 +126,10 @@ def metadata_location(self): def executable(self): # needed for ocrd_tool lookup return executable + processor_class = FullBashlibProcessor else: # we have no true metadata file, so fill in just to make it work - class FullBashlibProcessor(BashlibProcessor): + class UnknownBashlibProcessor(BashlibProcessor): @property def ocrd_tool(self): # needed to satisfy the validator @@ -142,5 +143,6 @@ def ocrd_tool(self): def version(self): # needed to satisfy the validator and wrapper return '1.0' + processor_class = UnknownBashlibProcessor - ocrd_cli_wrap_processor(FullBashlibProcessor, **kwargs) + ocrd_cli_wrap_processor(processor_class, **kwargs) From 42af6a33ccd6b72899606b4c69789951a1cf445a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:36:03 +0200 Subject: [PATCH 188/228] ocrd.cli.validate tasks: pass on --mets-server-url, too --- src/ocrd/cli/validate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py index 61d26988a4..a1ec8fafd6 100644 --- a/src/ocrd/cli/validate.py +++ 
b/src/ocrd/cli/validate.py @@ -102,16 +102,19 @@ def validate_page(page, **kwargs): @validate_cli.command('tasks') @click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. If omitted, only validate syntax') @click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME, help='Basename of the METS file, used in conjunction with --workspace') +@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server') @click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.') @click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.argument('tasks', nargs=-1, required=True) -def validate_process(tasks, workspace, mets_basename, overwrite, page_id): +def validate_process(tasks, workspace, mets_basename, mets_server_url, overwrite, page_id): ''' Validate a sequence of tasks passable to `ocrd process` ''' if workspace: - _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], - Workspace(Resolver(), directory=workspace, mets_basename=mets_basename), page_id=page_id, overwrite=overwrite)) + _inform_of_result(validate_tasks( + [ProcessorTask.parse(t) for t in tasks], + Workspace(Resolver(), directory=workspace, mets_basename=mets_basename, mets_server_url=mets_server_url), + page_id=page_id, overwrite=overwrite)) else: for t in [ProcessorTask.parse(t) for t in tasks]: _inform_of_result(t.validate()) From 7ea8d57e688621810cf6d04e261edcfd30739d75 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:48:47 +0200 Subject: [PATCH 189/228] Processor.process_workspace(): do not show NotImplementedError context if fallback process() raises anything itself --- src/ocrd/processor/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 94e8cce547..fac9825bf8 100644 --- 
a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -549,7 +549,11 @@ def process_workspace(self, workspace: Workspace) -> None: except NotImplementedError: # fall back to deprecated method - self.process() + try: + self.process() + except Exception as err: + # suppress the NotImplementedError context + raise err from None def _copy_page_file(self, input_file : OcrdFileType) -> None: """ From 975125644da4be0a71f55016dab4a066da2d31b1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:43:03 +0200 Subject: [PATCH 190/228] Processor.verify: check output fileGrps as well (or OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP or disjoint --page-id) --- src/ocrd/processor/base.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index fac9825bf8..d669d29f68 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -358,6 +358,7 @@ def verify(self): """ Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements. 
""" + # verify input and output file groups in parameters assert self.input_file_grp is not None assert self.output_file_grp is not None input_file_grps = self.input_file_grp.split(',') @@ -381,8 +382,16 @@ def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], if 'output_file_grp_cardinality' in self.ocrd_tool: assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'], "Unexpected number of output file groups %d vs %s") + # verify input and output file groups in METS for input_file_grp in input_file_grps: - assert input_file_grp in self.workspace.mets.file_groups + assert input_file_grp in self.workspace.mets.file_groups, \ + f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}" + for output_file_grp in output_file_grps: + assert output_file_grp not in self.workspace.mets.file_groups \ + or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] \ + or not any(self.workspace.mets.find_files( + pageId=self.page_id, fileGrp=output_file_grp)), \ + f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}" # keep this for backwards compatibility: return True From f66753ae1f2f35fe3a19642369c9cb96aba49058 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:47:14 +0200 Subject: [PATCH 191/228] run_processor: be robust if ocrd_tool is missing steps --- src/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 22837e2120..2cbbbd97e1 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -89,7 +89,7 @@ def run_processor( ocrd_tool = processor.ocrd_tool name = '%s v%s' % (ocrd_tool['executable'], processor.version) - otherrole = ocrd_tool['steps'][0] + otherrole = ocrd_tool.get('steps', [''])[0] logProfile = getLogger('ocrd.process.profile') log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) 
t0_wall = perf_counter() From eb12a809593e17f9329fc36b55cf9e1f99866e45 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:38:11 +0200 Subject: [PATCH 192/228] lib.bash: fix errexit --- src/ocrd/lib.bash | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 476b410966..b68829abef 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -27,8 +27,8 @@ ocrd__log () { ## Ensure minimum version # ht https://stackoverflow.com/posts/4025065 ocrd__minversion () { - local minversion_raw="$1" set -e + local minversion_raw="$1" local version_raw=$(ocrd --version|sed 's/ocrd, version //') local version_mmp=$(echo "$version_raw" | grep -Eo '([0-9]+\.?){3}') local version_prerelease_suffix="${version_raw#$version_mmp}" @@ -123,6 +123,7 @@ ocrd__usage () { ## declare -A ocrd__argv=() ## ``` ocrd__parse_argv () { + set -e # if [[ -n "$ZSH_VERSION" ]];then # print -r -- ${+ocrd__argv} ${(t)ocrd__argv} @@ -140,6 +141,7 @@ ocrd__parse_argv () { exit 1 fi + ocrd__argv[debug]=false ocrd__argv[overwrite]=false ocrd__argv[profile]=false ocrd__argv[profile_file]= @@ -170,6 +172,7 @@ ocrd__parse_argv () { -w|--working-dir) ocrd__argv[working_dir]=$(realpath "$2") ; shift ;; -m|--mets) ocrd__argv[mets_file]=$(realpath "$2") ; shift ;; -U|--mets-server-url) ocrd__argv[mets_server_url]="$2" ; shift ;; + --debug) ocrd__argv[debug]=true ;; --overwrite) ocrd__argv[overwrite]=true ;; --profile) ocrd__argv[profile]=true ;; --profile-file) ocrd__argv[profile_file]=$(realpath "$2") ; shift ;; @@ -265,6 +268,7 @@ $params_parsed" } ocrd__wrap () { + set -e declare -gx OCRD_TOOL_JSON="$1" declare -gx OCRD_TOOL_NAME="$2" From 3355ea49f1364483879fad8d0a5b6748ec0f44c7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:41:25 +0200 Subject: [PATCH 193/228] lib.bash input-files: pass on --mets-server-url, --overwrite, and parameters (necessary for required params) --- src/ocrd/lib.bash | 28 
+++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index b68829abef..00c4936c8c 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -136,6 +136,10 @@ ocrd__parse_argv () { ocrd__raise "Must set \$params (declare -A params)" fi + if ! declare -p "params_json" >/dev/null 2>/dev/null ;then + ocrd__raise "Must set \$params_json (declare params_json)" + fi + if [[ $# = 0 ]];then ocrd__usage exit 1 @@ -264,6 +268,7 @@ ocrd__parse_argv () { $params_parsed" } eval "$params_parsed" + params_json="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params --json "${__parameters[@]}" "${__parameter_overrides[@]}")" } @@ -276,6 +281,7 @@ ocrd__wrap () { shift declare -Agx params params=() + declare -g params_json declare -Agx ocrd__argv ocrd__argv=() @@ -297,22 +303,26 @@ ocrd__wrap () { ocrd__parse_argv "$@" - i=0 - declare -ag ocrd__files=() - while read line; do - eval declare -Ag "ocrd__file$i=( $line )" - eval "ocrd__files[$i]=ocrd__file$i" - let ++i - done < <(ocrd bashlib input-files \ + declare -ag ocrd__files + IFS=$'\n' + ocrd__files=( $(ocrd bashlib input-files \ --ocrd-tool $OCRD_TOOL_JSON \ --executable $OCRD_TOOL_NAME \ + $(if [[ ${ocrd__argv[debug]} = true ]]; then echo --debug; fi) \ + $(if [[ ${ocrd__argv[overwrite]} = true ]]; then echo --overwrite; fi) \ -m "${ocrd__argv[mets_file]}" \ + -d "${ocrd__argv[working_dir]}" \ + ${ocrd__argv[mets_server_url]:+-U} ${ocrd__argv[mets_server_url]:-} \ + -p "$params_json" \ -I "${ocrd__argv[input_file_grp]}" \ -O "${ocrd__argv[output_file_grp]}" \ - ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-}) + ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-}) ) + IFS=$' \t\n' } ## usage: pageId=$(ocrd__input_file 3 pageId) ocrd__input_file() { - eval echo "\${${ocrd__files[$1]}[$2]}" + declare -A input_file + eval input_file=( "${ocrd__files[$1]}" ) + eval echo "${input_file[$2]}" } From f05f840b5ae68ffeda99444a217828e2a2ef9904 
Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:44:04 +0200 Subject: [PATCH 194/228] lib.bash input-files: do not try to validate tasks here (impossible to get right with required parameters, and now covered by wrapped Processor.verify() already) --- src/ocrd/lib.bash | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 00c4936c8c..52bde30258 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -249,17 +249,6 @@ ocrd__parse_argv () { trap showtime DEBUG fi - # check fileGrps - local _valopts=( --workspace "${ocrd__argv[working_dir]}" --mets-basename "$(basename ${ocrd__argv[mets_file]})" ) - if [[ ${ocrd__argv[overwrite]} = true ]]; then - _valopts+=( --overwrite ) - fi - if [[ -n "${ocrd__argv[page_id]:-}" ]]; then - _valopts+=( --page-id "${ocrd__argv[page_id]}" ) - fi - _valopts+=( "${OCRD_TOOL_NAME#ocrd-} -I ${ocrd__argv[input_file_grp]} -O ${ocrd__argv[output_file_grp]} ${__parameters[*]@Q} ${__parameter_overrides[*]@Q}" ) - ocrd validate tasks "${_valopts[@]}" || exit $? 
- # check parameters local params_parsed retval params_parsed="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params "${__parameters[@]}" "${__parameter_overrides[@]}")" || { From b5c11919c03db90e302a160ff9f28229e55f12dc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:46:31 +0200 Subject: [PATCH 195/228] Processor / Workspace.add_file: always force if config.OCRD_EXISTING_OUTPUT==OVERWRITE --- src/ocrd/processor/base.py | 3 --- src/ocrd/processor/builtin/dummy_processor.py | 2 -- src/ocrd/workspace.py | 3 +++ 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index d669d29f68..26ea532d16 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -590,7 +590,6 @@ def _copy_page_file(self, input_file : OcrdFileType) -> None: local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(input_pcgts), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: @@ -643,7 +642,6 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: self.output_file_grp, page_id=page_id, file_path=image_file_path, - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) @@ -654,7 +652,6 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(result.pcgts), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index a5f217a155..72a260968f 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ 
b/src/ocrd/processor/builtin/dummy_processor.py @@ -47,7 +47,6 @@ def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: mimetype=input_file.mimetype, local_filename=local_filename, content=f.read(), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) @@ -62,7 +61,6 @@ def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: local_filename=join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(pcgts), - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) else: if self.parameter['copy_files']: diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 4a99a112c1..3cbc58c78c 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -19,6 +19,7 @@ from ocrd_modelfactory import exif_from_filename, page_from_file from ocrd_utils import ( atomic_write, + config, getLogger, image_from_polygon, coordinates_of_segment, @@ -427,6 +428,8 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi kwargs["pageId"] = kwargs.pop("page_id") if "file_id" in kwargs: kwargs["ID"] = kwargs.pop("file_id") + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + kwargs["force"] = True ret = self.mets.add_file(file_grp, **kwargs) From cbe465aabb5a86c07a9120ae07c0721d31b9779f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 01:02:19 +0200 Subject: [PATCH 196/228] test processors: no need for 'force' kwarg anymore --- tests/data/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 4dcf29fa02..56779a6119 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -101,7 +101,6 @@ def process(self): mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), content='CONTENT', - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) class DummyProcessorWithOutputSleep(Processor): @@ -157,7 +156,6 @@ def 
process_page_file(self, input_file): local_filename=os.path.join(self.output_file_grp, output_file_id), mimetype=input_file.mimetype, content='CONTENT', - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) class DummyProcessorWithOutputLegacy(Processor): @@ -181,7 +179,6 @@ def process(self): mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), content='CONTENT', - force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', ) class IncompleteProcessor(Processor): From 3e214cab44cadc307062bc8d4c501863f7f64408 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 01:03:43 +0200 Subject: [PATCH 197/228] tests: make sure ocrd_utils.config gets reset whenever changing it globally --- tests/processor/test_processor.py | 12 ++++++------ tests/test_decorators.py | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 1497927a08..33a9548811 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -17,7 +17,7 @@ ) from tests.test_mets_server import fixture_start_mets_server -from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging +from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging, config from ocrd.resolver import Resolver from ocrd.processor import Processor, run_processor, run_cli, NonUniqueInputFile from ocrd.processor.helpers import get_processor @@ -39,6 +39,10 @@ def setUp(self): self.workspace = self.resolver.workspace_from_url('mets.xml') self.addCleanup(stack.pop_all().close) + def tearDown(self): + super().tearDown() + config.reset_defaults() + def test_incomplete_processor(self): proc = IncompleteProcessor(None) proc.input_file_grp = 'OCR-D-IMG' @@ -230,7 +234,6 @@ def test_run_output_legacy(self): def test_run_output_missing(self): ws = self.workspace - from ocrd_utils import config # do not raise for number of failures: 
config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MISSING_OUTPUT = 'SKIP' @@ -240,6 +243,7 @@ def test_run_output_missing(self): # only half succeed assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) // 2 config.OCRD_MISSING_OUTPUT = 'ABORT' + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutputFailures, workspace=ws, input_file_grp="OCR-D-IMG", @@ -262,7 +266,6 @@ def test_run_output_missing(self): def test_run_output_timeout(self): ws = self.workspace - from ocrd_utils import config # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MISSING_OUTPUT = 'ABORT' @@ -286,7 +289,6 @@ def test_run_output_overwrite(self): ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') - from ocrd_utils import config config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') config.OCRD_EXISTING_OUTPUT = 'ABORT' @@ -422,7 +424,6 @@ def ocrd_tool(self): def test_run_output_metsserver(start_mets_server): mets_server_url, ws = start_mets_server - from ocrd_utils import config # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 run_processor(DummyProcessorWithOutputSleep, workspace=ws, @@ -451,7 +452,6 @@ def test_run_output_metsserver(start_mets_server): @pytest.mark.timeout(4) def test_run_output_parallel(start_mets_server): mets_server_url, ws = start_mets_server - from ocrd_utils import config # do not raise for single-page timeout config.OCRD_PROCESSING_PAGE_TIMEOUT = -1 # do not raise for number of failures: diff --git a/tests/test_decorators.py b/tests/test_decorators.py index df8d6422be..c36577020a 100644 --- a/tests/test_decorators.py +++ 
b/tests/test_decorators.py @@ -15,7 +15,7 @@ ocrd_loglevel, ocrd_cli_wrap_processor, ) # pylint: disable=protected-access -from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files +from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files, config @click.command() @ocrd_cli_options @@ -45,6 +45,10 @@ def setUp(self): super().setUp() disableLogging() + def tearDown(self): + super().tearDown() + config.reset_defaults() + def test_minimal(self): exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) print(out, err) From c549c42aef193589262863ba0566bfb311a40080 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:25:37 +0200 Subject: [PATCH 198/228] OcrdPage: add PageType.get_ReadingOrderGroups() --- src/ocrd_page_user_methods.py | 1 + .../get_ReadingOrderGroups.py | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 src/ocrd_page_user_methods/get_ReadingOrderGroups.py diff --git a/src/ocrd_page_user_methods.py b/src/ocrd_page_user_methods.py index 8a2332e6e5..fe22dd89ab 100644 --- a/src/ocrd_page_user_methods.py +++ b/src/ocrd_page_user_methods.py @@ -116,6 +116,7 @@ def _add_method(class_re, method_name, file_name=None): _add_method(r'^(PageType)$', 'set_Border'), _add_method(r'^(CoordsType)$', 'set_points'), _add_method(r'^(PageType)$', 'get_AllTextLines'), + _add_method(r'^(PageType)$', 'get_ReadingOrderGroups'), # for some reason, pagecontent.xsd does not declare @orientation at the abstract/base RegionType: _add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType|ImageRegionType|TextRegionType)$', 'set_orientation'), ) diff --git a/src/ocrd_page_user_methods/get_ReadingOrderGroups.py b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py new file 
mode 100644 index 0000000000..e7d6c02b77 --- /dev/null +++ b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py @@ -0,0 +1,33 @@ +def get_ReadingOrderGroups(self) -> dict: + """ + Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef + (i.e. segment `@id`) to its referring group object (i.e one of + + \b + - :py:class:`.RegionRefType` + - :py:class:`.RegionRefIndexedType` + - :py:class:`.OrderedGroupType` + - :py:class:`.OrderedGroupIndexedType` + - :py:class:`.UnoderedGroupType` + - :py:class:`.UnoderedGroupIndexedType` + """ + def get_groupdict(group): + regionrefs = list() + if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)): + regionrefs = (group.get_RegionRefIndexed() + + group.get_OrderedGroupIndexed() + + group.get_UnorderedGroupIndexed()) + if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)): + regionrefs = (group.get_RegionRef() + + group.get_OrderedGroup() + + group.get_UnorderedGroup()) + refdict = {} + for elem in regionrefs: + refdict[elem.get_regionRef()] = elem + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + refdict = {**refdict, **get_groupdict(elem)} + return refdict + ro = self.get_ReadingOrder() + if ro is None: + return {} + return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup()) From 53b880f0cdf5166bd6101de95213504791082858 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:25:58 +0200 Subject: [PATCH 199/228] update OcrdPage from generateds --- src/ocrd_models/ocrd_page_generateds.py | 55 ++++++++++++++++++++----- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/src/ocrd_models/ocrd_page_generateds.py b/src/ocrd_models/ocrd_page_generateds.py index 6fef4c8635..f2b7c0551e 100644 --- a/src/ocrd_models/ocrd_page_generateds.py +++ b/src/ocrd_models/ocrd_page_generateds.py @@ -2,30 +2,28 @@ # -*- coding: utf-8 -*- # -# Generated Wed Nov 3 12:30:32 2021 by generateDS.py version 2.35.20. 
-# Python 3.6.9 (default, Jan 26 2021, 15:33:00) [GCC 8.4.0] +# Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20. +# Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0] # # Command line options: # ('-f', '') # ('--root-element', 'PcGts') -# ('-o', 'ocrd_models/ocrd_models/ocrd_page_generateds.py') +# ('-o', 'src/ocrd_models/ocrd_page_generateds.py') # ('--silence', '') # ('--export', 'write etree') # ('--disable-generatedssuper-lookup', '') -# ('--user-methods', 'ocrd_models/ocrd_page_user_methods.py') +# ('--user-methods', 'src/ocrd_page_user_methods.py') # # Command line arguments: -# ocrd_validators/ocrd_validators/page.xsd +# src/ocrd_validators/page.xsd # # Command line: -# /home/kba/monorepo/ocrd_all/venv/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" ocrd_validators/ocrd_validators/page.xsd +# /data/ocr-d/ocrd_all/venv38/bin/generateDS -f --root-element="PcGts" -o "src/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="src/ocrd_page_user_methods.py" src/ocrd_validators/page.xsd # # Current working directory (os.getcwd()): # core # -# type: ignore - from itertools import zip_longest import os import sys @@ -223,7 +221,7 @@ def gds_validate_integer_list( try: int(value) except (TypeError, ValueError): - raise_parse_error(node, 'Requires sequence of integer values') + raise_parse_error(node, 'Requires sequence of integer valuess') return values def gds_format_float(self, input_data, input_name=''): return ('%.15f' % input_data).rstrip('0') @@ -1230,9 +1228,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) 
def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True): """ Get all the ``pc:AlternativeImage/@filename`` paths referenced in the PAGE-XML document. @@ -3116,9 +3115,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring def _region_class(self, x): # pylint: disable=unused-argument return x.__class__.__name__.replace('RegionType', '') @@ -3314,6 +3314,39 @@ def get_AllTextLines(self, region_order='document', respect_textline_order=True) ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines)) return ret + def get_ReadingOrderGroups(self) -> dict: + """ + Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef + (i.e. segment `@id`) to its referring group object (i.e one of + + \b + - :py:class:`.RegionRefType` + - :py:class:`.RegionRefIndexedType` + - :py:class:`.OrderedGroupType` + - :py:class:`.OrderedGroupIndexedType` + - :py:class:`.UnoderedGroupType` + - :py:class:`.UnoderedGroupIndexedType` + """ + def get_groupdict(group): + regionrefs = list() + if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)): + regionrefs = (group.get_RegionRefIndexed() + + group.get_OrderedGroupIndexed() + + group.get_UnorderedGroupIndexed()) + if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)): + regionrefs = (group.get_RegionRef() + + group.get_OrderedGroup() + + group.get_UnorderedGroup()) + refdict = {} + for elem in regionrefs: + refdict[elem.get_regionRef()] = elem + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + refdict = {**refdict, **get_groupdict(elem)} + return refdict + ro = self.get_ReadingOrder() + if ro is None: + return {} + return get_groupdict(ro.get_OrderedGroup() or 
ro.get_UnorderedGroup()) def set_orientation(self, orientation): """ Set deskewing angle to given `orientation` number. From 687b06f90784fcf9eac510ecc3442ea8d8c08bb3 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 16 Sep 2024 13:29:26 +0200 Subject: [PATCH 200/228] :package: v3.0.0b5 --- CHANGELOG.md | 29 +++++++++++++++++++++++++++++ VERSION | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ec12c8934..bbb91c0782 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,35 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0b5] - 2024-09-16 + +TODO + - update OcrdPage from generateds (HEAD -> new-processor-api, bertsky/new-processor-api) + - OcrdPage: add PageType.get_ReadingOrderGroups() + - tests: make sure ocrd_utils.config gets reset whenever changing it globally + - test processors: no need for 'force' kwarg anymore + - Processor / Workspace.add_file: always force if config.OCRD_EXISTING_OUTPUT==OVERWRITE + - lib.bash input-files: do not try to validate tasks here (impossible to get right with required parameters, and now covered by wrapped Processor.verify() already) + - lib.bash input-files: pass on --mets-server-url, --overwrite, and parameters (necessary for required params) + - lib.bash: fix errexit + - run_processor: be robust if ocrd_tool is missing steps + - Processor.verify: check output fileGrps as well (or OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP or disjoint --page-id) + - Processor.process_workspace(): do not show NotImplementedError context if fallback process() raises anything itself + - ocrd.cli.validate tasks: pass on --mets-server-url, too + - ocrd.cli.bashlib input-files: pass on --mets-server-url, too + - ocrd.cli.workspace server: add 'reload' and 'save' + - ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) + - METS Server: also export+delegate physical_pages + - processor CLI: delegate --resolve-resource, too + - 
ocrd.cli.ocrd-tool resolve-resource: fix (forgot to print result) + - PcGts.Page.id / make_xml_id: replace '/' with '_' + - Processor.process_page_file / OcrdPageResultImage: allow None instead of AlternativeImageType + - Processor.verify: revert 5819c816 (we still have no defaults in json loaded from v2) + - typing, extend docs + - test_processor: add test for force (overwrite) w/ METS Server + - test_mets_server: add test for force (overwrite) + - OcrdMetsServer.add_file: pass on 'force' kwarg, too + ## [3.0.0b4] - 2024-09-02 Fixed: diff --git a/VERSION b/VERSION index 9414e12700..09fb39d267 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b4 +3.0.0b5 From a43098e9ee01a15a753ace19a8eddcdff4849352 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:27:50 +0200 Subject: [PATCH 201/228] :memo: improve b5 changelog --- CHANGELOG.md | 53 ++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbb91c0782..abbfd5a4d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,32 +7,31 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## [3.0.0b5] - 2024-09-16 -TODO - - update OcrdPage from generateds (HEAD -> new-processor-api, bertsky/new-processor-api) - - OcrdPage: add PageType.get_ReadingOrderGroups() - - tests: make sure ocrd_utils.config gets reset whenever changing it globally - - test processors: no need for 'force' kwarg anymore - - Processor / Workspace.add_file: always force if config.OCRD_EXISTING_OUTPUT==OVERWRITE - - lib.bash input-files: do not try to validate tasks here (impossible to get right with required parameters, and now covered by wrapped Processor.verify() already) - - lib.bash input-files: pass on --mets-server-url, --overwrite, and parameters (necessary for required params) - - lib.bash: fix errexit - - run_processor: be robust if ocrd_tool is missing steps - - Processor.verify: check output fileGrps as well (or OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP or disjoint --page-id) - - Processor.process_workspace(): do not show NotImplementedError context if fallback process() raises anything itself - - ocrd.cli.validate tasks: pass on --mets-server-url, too - - ocrd.cli.bashlib input-files: pass on --mets-server-url, too - - ocrd.cli.workspace server: add 'reload' and 'save' - - ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) - - METS Server: also export+delegate physical_pages - - processor CLI: delegate --resolve-resource, too - - ocrd.cli.ocrd-tool resolve-resource: fix (forgot to print result) - - PcGts.Page.id / make_xml_id: replace '/' with '_' - - Processor.process_page_file / OcrdPageResultImage: allow None instead of AlternativeImageType - - Processor.verify: revert 5819c816 (we still have no defaults in json loaded from v2) - - typing, extend docs - - test_processor: add test for force (overwrite) w/ METS Server - - test_mets_server: add test for force (overwrite) - - OcrdMetsServer.add_file: pass on 'force' kwarg, too +Fixed: + - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally + - 
`OcrdMetsServer.add_file`: pass on `force` kwarg + - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` + - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` + - `ocrd.cli.bashlib "input-files"`: pass on `--mets-server-url` + - `lib.bash input-files`: pass on `--mets-server-url`, `--overwrite`, and parameters + - `lib.bash`: fix `errexit` handling + - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result + +Changed: + - :fire: `Processor` / `Workspace.add_file`: always `force` if `OCRD_EXISTING_OUTPUT==OVERWRITE` + - :fire: `Processor.verify`: revert 3.0.0b1 enforcing cardinality checks (stay backwards compatible) + - :fire: `Processor.verify`: check output fileGrps, too + (must not exist unless `OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP` or disjoint `--page-id` range) + - lib.bash `input-files`: do not try to validate tasks here (now covered by `Processor.verify()`) + - `run_processor`: be robust if `ocrd_tool` is missing `steps` + - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + +Added: + - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` + - METS Server: export and delegate `physical_pages` + - processor CLI: delegate `--resolve-resource`, too + - `Processor.process_page_file` / `OcrdPageResultImage`: allow `None` besides `AlternativeImageType` ## [3.0.0b4] - 2024-09-02 @@ -2288,6 +2287,8 @@ Fixed Initial Release +[3.0.0b5]: ../../compare/v3.0.0b5..v3.0.0b4 +[3.0.0b4]: ../../compare/v3.0.0b4..v3.0.0b3 [3.0.0b3]: ../../compare/v3.0.0b3..v3.0.0b2 [3.0.0b2]: ../../compare/v3.0.0b2..v3.0.0b1 [3.0.0b1]: ../../compare/v3.0.0b1..v3.0.0a2 From d2cb0fb663c15c6179bbcf05477051f3d7737149 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:55:41 +0200 Subject: [PATCH 202/228] ocrd.cli.workspace: assert non-server in cmds mutating METS --- 
src/ocrd/cli/workspace.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index ca4e8629db..05b37b6bcc 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -149,7 +149,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_url( mets_url, dst_dir=ctx.directory, @@ -185,7 +186,8 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, @@ -506,6 +508,8 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) @@ -524,6 +528,8 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). 
""" + assert not ctx.mets_server_url, \ + f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -545,6 +551,8 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) @@ -567,6 +575,8 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( @@ -762,6 +772,8 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: + assert not ctx.mets_server_url, \ + f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() @@ -800,6 +812,8 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) + assert not ctx.mets_server_url, \ + f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), 
mets_basename=str(mets_path.name)) workspace.merge( From f678dca0e42b66d5742209ffb692103fa7f15528 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:35:07 +0200 Subject: [PATCH 203/228] OcrdMets.get_physical_pages: cover return_divs w/o for_fileIds for_pageIds --- src/ocrd_models/ocrd_mets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index c3fb11f600..de068567e2 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -599,7 +599,16 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI If return_divs is set, returns div memory objects instead of strings of ids """ if for_fileIds is None and for_pageIds is None: + if return_divs: + if self._cache_flag: + return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values()) + + return [x for x in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS)] + return self.physical_pages + # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] From 9064db01380cfca0327320cfcfa7c0fd02e2cb21 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:37:03 +0200 Subject: [PATCH 204/228] ocrd.cli.workspace: use physical_pages if possible, fix default output_field --- src/ocrd/cli/workspace.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 05b37b6bcc..77797b3037 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -683,19 +683,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page will be interpreted as a regular expression.) 
""" workspace = ctx.workspace() - find_kwargs = {} - if page_id_range and 'ID' in output_field: - find_kwargs['pageId'] = page_id_range - page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId}) ret = [] - - if output_field == ['ID']: - ret = [[x] for x in page_ids] - else: - for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)): + if page_id_range or list(output_field) != ['ID']: + for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)): ret.append([]) for k in output_field: ret[i].append(page_div.get(k, 'None')) + else: + for page_id in workspace.mets.physical_pages: + ret.append([page_id]) if numeric_range: start, end = map(int, numeric_range.split('..')) From 9530fcd346357d23f6e914534f87436c206fa038 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:44:45 +0200 Subject: [PATCH 205/228] Processor.process_page_file: avoid process_page_pcgts() if OCRD_EXISTING_OUTPUT!=OVERWRITE --- src/ocrd/processor/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 26ea532d16..28cbaf7269 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -618,6 +618,12 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: # not PAGE and not an image to generate PAGE for self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': + # short-cut avoiding useless computation: + raise FileExistsError( + f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set" + ) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image_result in 
result.images: image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' From 31a8474e884812eae614916fcb3e878aa443995a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Oct 2024 16:34:39 +0000 Subject: [PATCH 206/228] ocrd_utils.initLogging: also add handler to root logger (to be consistent with file config and prevent imported libraries from initing logging first), but disable propagation for ocrd loggers (to avoid duplication) --- src/ocrd_utils/logging.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 181805118d..dfac74988b 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -48,6 +48,7 @@ # These are the loggers we add handlers to ROOT_OCRD_LOGGERS = [ + '', 'ocrd', 'ocrd_network' ] @@ -191,7 +192,10 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L ocrd_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT)) ocrd_handler.setLevel(logging.DEBUG) for logger_name in ROOT_OCRD_LOGGERS: - logging.getLogger(logger_name).addHandler(ocrd_handler) + logger = logging.getLogger(logger_name) + logger.addHandler(ocrd_handler) + if logger_name: + logger.propagate = False # avoid duplication (from root handler) for logger_name, logger_level in LOGGING_DEFAULTS.items(): logging.getLogger(logger_name).setLevel(logger_level) _initialized_flag = True @@ -210,7 +214,7 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS + ['']: + for logger_name in ROOT_OCRD_LOGGERS: for handler in logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) for logger_name in LOGGING_DEFAULTS: From d7049b1bffb185723124028882ac0e5d88bfabba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 10 Oct 2024 
01:03:46 +0000 Subject: [PATCH 207/228] CLI decorator: only import ocrd_network when needed --- src/ocrd/decorators/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index f52a13575b..f659bf58a0 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -13,7 +13,6 @@ redirect_stderr_and_stdout_to_file, ) from ocrd_validators import WorkspaceValidator -from ocrd_network import ProcessingWorker, ProcessorServer, AgentType from ..resolver import Resolver from ..processor.base import ResourceNotFoundError, run_processor @@ -23,8 +22,6 @@ from .ocrd_cli_options import ocrd_cli_options from .mets_find_options import mets_find_options -SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER] - def ocrd_cli_wrap_processor( processorClass, @@ -88,11 +85,9 @@ def ocrd_cli_wrap_processor( if list_resources: processor.list_resources() sys.exit() - if subcommand: + if subcommand or address or queue or database: # Used for checking/starting network agents for the WebAPI architecture check_and_run_network_agent(processorClass, subcommand, address, database, queue) - elif address or queue or database: - raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") # from here: single-run processing context initLogging() @@ -162,6 +157,11 @@ def goexit(): def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): """ """ + from ocrd_network import ProcessingWorker, ProcessorServer, AgentType + SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER] + + if not subcommand: + raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") if subcommand not in SUBCOMMANDS: raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}") From 
a9d49c1df906af98f618dbf99b01b2fb9900452b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 10 Oct 2024 14:28:41 +0000 Subject: [PATCH 208/228] =?UTF-8?q?Processor=20w/=20OCRD=5FMAX=5FPARALLEL?= =?UTF-8?q?=5FPAGES:=20ThreadPoolExecutor=E2=86=92ProcessPoolExecutor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/ocrd/processor/base.py | 174 ++++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 79 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 28cbaf7269..8ea53246d8 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -23,7 +23,8 @@ import io import weakref from frozendict import frozendict -from concurrent.futures import ThreadPoolExecutor, TimeoutError +from concurrent.futures import ProcessPoolExecutor, TimeoutError +import multiprocessing as mp from click import wrap_text from deprecated import deprecated @@ -465,11 +466,7 @@ def process_workspace(self, workspace: Workspace) -> None: self.workspace = workspace self.verify() try: - nr_succeeded = 0 - nr_skipped = 0 - nr_copied = 0 - - # set up multithreading + # set up multitasking max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES) if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES: self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers) @@ -481,80 +478,17 @@ def process_workspace(self, workspace: Workspace) -> None: if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - executor = ThreadPoolExecutor( - max_workers=max_workers or 1, - thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" - ) - self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) - tasks = {} - - for 
input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) - page_id = next(input_file.pageId - for input_file in input_file_tuple - if input_file) - self._base_logger.info(f"preparing page {page_id}") - for i, input_file in enumerate(input_file_tuple): - if input_file is None: - # file/page not found in this file grp - continue - input_files[i] = input_file - if not self.download: - continue - try: - input_files[i] = self.workspace.download_file(input_file) - except (ValueError, FileNotFoundError, HTTPError) as e: - self._base_logger.error(repr(e)) - self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") - # process page - tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) - self._base_logger.debug("submitted %d processing tasks", len(tasks)) - - for task in tasks: - # wait for results, handle errors - page_id, input_files = tasks[task] - # FIXME: differentiate error cases in various ways: - # - ResourceNotFoundError → use ResourceManager to download (once), then retry - # - transient (I/O or OOM) error → maybe sleep, retry - # - persistent (data) error → skip / dummy / raise - try: - self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - task.result(timeout=max_seconds or None) - nr_succeeded += 1 - # exclude NotImplementedError, so we can try process() below - except NotImplementedError: - raise - # handle input failures separately - except FileExistsError as err: - if config.OCRD_EXISTING_OUTPUT == 'ABORT': - raise err - if config.OCRD_EXISTING_OUTPUT == 'SKIP': - continue - if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': - # too late here, must not happen - raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") - # broad coverage of output failures (including TimeoutError) - except (Exception, TimeoutError) as err: - # FIXME: 
add re-usable/actionable logging - if config.OCRD_MISSING_OUTPUT == 'ABORT': - self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - raise err - self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - if config.OCRD_MISSING_OUTPUT == 'SKIP': - nr_skipped += 1 - continue - if config.OCRD_MISSING_OUTPUT == 'COPY': - self._copy_page_file(input_files[0]) - nr_copied += 1 - else: - desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) - raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") - if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with skipped output ({nr_skipped})") - if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with fallback output ({nr_skipped})") - executor.shutdown() + with ProcessPoolExecutor( + max_workers=max_workers or 1, + # only forking method avoids pickling + mp_context=mp.get_context('fork'), + # share processor instance as global to avoid pickling + initializer=_page_worker_set_ctxt, + initargs=(self,), + ) as executor: + self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) + self._process_workspace_run(executor, max_workers, max_seconds) except NotImplementedError: # fall back to deprecated method @@ -564,6 +498,80 @@ def process_workspace(self, workspace: Workspace) -> None: # suppress the NotImplementedError context raise err from None + def _process_workspace_run(self, executor, max_workers, max_seconds): + nr_succeeded = 0 + nr_skipped = 0 + nr_copied = 0 + + tasks = {} + for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): + input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if 
input_file) + self._base_logger.info(f"preparing page {page_id}") + for i, input_file in enumerate(input_file_tuple): + if input_file is None: + # file/page not found in this file grp + continue + input_files[i] = input_file + if not self.download: + continue + try: + input_files[i] = self.workspace.download_file(input_file) + except (ValueError, FileNotFoundError, HTTPError) as e: + self._base_logger.error(repr(e)) + self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") + # process page + #tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) + tasks[executor.submit(_page_worker, *input_files)] = (page_id, input_files) + self._base_logger.debug("submitted %d processing tasks", len(tasks)) + + for task in tasks: + # wait for results, handle errors + page_id, input_files = tasks[task] + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) + task.result(timeout=max_seconds or None) + nr_succeeded += 1 + # exclude NotImplementedError, so we can try process() below + except NotImplementedError: + raise + # handle input failures separately + except FileExistsError as err: + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + continue + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # broad coverage of output failures (including TimeoutError) + except (Exception, TimeoutError) as err: + # FIXME: add re-usable/actionable logging + if config.OCRD_MISSING_OUTPUT == 'ABORT': + self._base_logger.error(f"Failure on page {page_id}: {str(err) or 
err.__class__.__name__}") + raise err + self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + if config.OCRD_MISSING_OUTPUT == 'SKIP': + nr_skipped += 1 + continue + if config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + nr_copied += 1 + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") + + if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with skipped output ({nr_skipped})") + if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with fallback output ({nr_skipped})") + def _copy_page_file(self, input_file : OcrdFileType) -> None: """ Copy the given ``input_file`` of the :py:data:`workspace`, @@ -940,6 +948,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ifts.append(tuple(ifiles)) return ifts +_page_worker_processor = None +def _page_worker_set_ctxt(processor): + global _page_worker_processor + _page_worker_processor = processor + +def _page_worker(*input_files): + _page_worker_processor.process_page_file(*input_files) + def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): """Generate a string describing the full CLI of this processor including params. 
From 588c91df826951d29b24f1e1677cced3a55b2153 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Oct 2024 08:44:56 +0000 Subject: [PATCH 209/228] Processor.process_workspace: apply timeout on process_page_file worker itself (rather than future query) --- src/ocrd/processor/base.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 8ea53246d8..ce6b3e4949 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -25,6 +25,8 @@ from frozendict import frozendict from concurrent.futures import ProcessPoolExecutor, TimeoutError import multiprocessing as mp +from threading import Timer +from _thread import interrupt_main from click import wrap_text from deprecated import deprecated @@ -524,7 +526,7 @@ def _process_workspace_run(self, executor, max_workers, max_seconds): self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") # process page #tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) - tasks[executor.submit(_page_worker, *input_files)] = (page_id, input_files) + tasks[executor.submit(_page_worker, max_seconds, *input_files)] = (page_id, input_files) self._base_logger.debug("submitted %d processing tasks", len(tasks)) for task in tasks: @@ -536,7 +538,12 @@ def _process_workspace_run(self, executor, max_workers, max_seconds): # - persistent (data) error → skip / dummy / raise try: self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - task.result(timeout=max_seconds or None) + # timeout kwarg on future is useless: it only raises TimeoutError here, + # but does not stop the running process/thread, and executor offers nothing + # to that effect: + # task.result(timeout=max_seconds or None) + # so we instead apply the timeout within the worker function + task.result() nr_succeeded += 1 # exclude 
NotImplementedError, so we can try process() below except NotImplementedError: @@ -551,7 +558,7 @@ def _process_workspace_run(self, executor, max_workers, max_seconds): # too late here, must not happen raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") # broad coverage of output failures (including TimeoutError) - except (Exception, TimeoutError) as err: + except Exception as err: # FIXME: add re-usable/actionable logging if config.OCRD_MISSING_OUTPUT == 'ABORT': self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") @@ -953,8 +960,21 @@ def _page_worker_set_ctxt(processor): global _page_worker_processor _page_worker_processor = processor -def _page_worker(*input_files): - _page_worker_processor.process_page_file(*input_files) +def _page_worker(timeout, *input_files): + page_id = next((file.pageId for file in input_files + if hasattr(file, 'pageId')), "") + if timeout > 0: + timer = Timer(timeout, interrupt_main) + timer.start() + try: + _page_worker_processor.process_page_file(*input_files) + _page_worker_processor.logger.debug("page worker completed for page %s", page_id) + except KeyboardInterrupt: + _page_worker_processor.logger.debug("page worker timed out for page %s", page_id) + raise TimeoutError() + finally: + if timeout > 0: + timer.cancel() def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): """Generate a string describing the full CLI of this processor including params. 
From d126bdce4ef81c148c1bae4718d000082f863704 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Oct 2024 08:46:21 +0000 Subject: [PATCH 210/228] =?UTF-8?q?Processor=20w/=20OCRD=5FMAX=5FPARALLEL?= =?UTF-8?q?=5FPAGES:=20concurrent.futures=E2=86=92loky?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + src/ocrd/processor/base.py | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index e78c186618..05d4e9aa44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ httpx>=0.22.0 importlib_metadata ; python_version < '3.8' importlib_resources ; python_version < '3.10' jsonschema>=4 +loky lxml memory-profiler >= 0.58.0 # XXX explicitly do not restrict the numpy version because different diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index ce6b3e4949..b6a41d6b5f 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -23,7 +23,9 @@ import io import weakref from frozendict import frozendict -from concurrent.futures import ProcessPoolExecutor, TimeoutError +# concurrent.futures is buggy in py38, +# this is where the fixes came from: +from loky import ProcessPoolExecutor import multiprocessing as mp from threading import Timer from _thread import interrupt_main @@ -481,16 +483,19 @@ def process_workspace(self, workspace: Workspace) -> None: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - with ProcessPoolExecutor( - max_workers=max_workers or 1, - # only forking method avoids pickling - mp_context=mp.get_context('fork'), - # share processor instance as global to avoid pickling - initializer=_page_worker_set_ctxt, - initargs=(self,), - ) as executor: + executor = ProcessPoolExecutor( + max_workers=max_workers or 1, + # only forking method avoids pickling + 
context=mp.get_context('fork'), + # share processor instance as global to avoid pickling + initializer=_page_worker_set_ctxt, + initargs=(self,), + ) + try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) self._process_workspace_run(executor, max_workers, max_seconds) + finally: + executor.shutdown(kill_workers=True) except NotImplementedError: # fall back to deprecated method From afa7f30a6bf212fece28ebc354da726a658ba121 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Oct 2024 00:23:06 +0000 Subject: [PATCH 211/228] Processor w/o OCRD_MAX_PARALLEL_PAGES: dummy instead of executor --- src/ocrd/processor/base.py | 46 ++++++++++++++++++++++++++++--- tests/processor/test_processor.py | 1 - 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index b6a41d6b5f..7ff271ecab 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -483,7 +483,29 @@ def process_workspace(self, workspace: Workspace) -> None: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - executor = ProcessPoolExecutor( + class DummyExecutor: + """ + Mimics some of ProcessPoolExecutor but runs everything + immediately in this process. 
+ """ + class DummyFuture: + def __init__(self, fn, *args, **kwargs): + self.fn = fn + self.args = args + self.kwargs = kwargs + def result(self): + return self.fn(*self.args, **self.kwargs) + def __init__(self, initializer=None, initargs=(), **kwargs): + initializer(*initargs) + def shutdown(self, **kwargs): + pass + def submit(self, fn, *args, **kwargs): + return DummyExecutor.DummyFuture(fn, *args, **kwargs) + if max_workers > 1: + executor_cls = ProcessPoolExecutor + else: + executor_cls = DummyExecutor + executor = executor_cls( max_workers=max_workers or 1, # only forking method avoids pickling context=mp.get_context('fork'), @@ -493,7 +515,7 @@ def process_workspace(self, workspace: Workspace) -> None: ) try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) - self._process_workspace_run(executor, max_workers, max_seconds) + self._process_workspace_run(executor, max_seconds) finally: executor.shutdown(kill_workers=True) @@ -505,7 +527,7 @@ def process_workspace(self, workspace: Workspace) -> None: # suppress the NotImplementedError context raise err from None - def _process_workspace_run(self, executor, max_workers, max_seconds): + def _process_workspace_run(self, executor, max_seconds): nr_succeeded = 0 nr_skipped = 0 nr_copied = 0 @@ -961,11 +983,27 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): return ifts _page_worker_processor = None +""" +This global binding for the processor is required to avoid +squeezing the processor through a mp.Queue (which is impossible +due to unpicklable attributes like .workspace.mets._tree anyway) +when calling Processor.process_page_file as page worker processes +in Processor.process_workspace. Forking allows inheriting global +objects, and with the METS Server we do not mutate the local +processor instance anyway. 
+""" def _page_worker_set_ctxt(processor): + """ + Overwrites `ocrd.processor.base._page_worker_processor` instance + for sharing with subprocesses in ProcessPoolExecutor initializer. + """ global _page_worker_processor _page_worker_processor = processor - def _page_worker(timeout, *input_files): + """ + Wraps a `Processor.process_page_file` call as payload (call target) + of the ProcessPoolExecutor workers, but also enforces the given timeout. + """ page_id = next((file.pageId for file in input_files if hasattr(file, 'pageId')), "") if timeout > 0: diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 33a9548811..5844cb8774 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -277,7 +277,6 @@ def test_run_output_timeout(self): assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' config.OCRD_PROCESSING_PAGE_TIMEOUT = 1 - from concurrent.futures import TimeoutError with pytest.raises(TimeoutError) as exc: run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", From 58217018d8bcd85df5dc4e3e03eb62a0d9255690 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Oct 2024 01:27:58 +0000 Subject: [PATCH 212/228] ocrd.process.profile logger: account for subprocess CPU time, too --- src/ocrd/processor/helpers.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 2cbbbd97e1..757f7ac045 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -2,6 +2,7 @@ Helper methods for running and documenting processors """ from time import perf_counter, process_time +from os import times from functools import lru_cache import json import inspect @@ -94,6 +95,7 @@ def run_processor( log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) 
t0_wall = perf_counter() t0_cpu = process_time() + t0_os = times() if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']): backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel @@ -123,7 +125,13 @@ def run_processor( t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu - logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']" % ( + t1_os = times() + # add CPU time from child processes (page worker etc) + t1_cpu += t1_os.children_user - t0_os.children_user + t1_cpu += t1_os.children_system - t0_os.children_system + logProfile.info( + "Executing processor '%s' took %fs (wall) %fs (CPU)( " + "[--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']", ocrd_tool['executable'], t1_wall, t1_cpu, @@ -131,7 +139,7 @@ def run_processor( processor.output_file_grp or '', json.dumps(processor.parameter) or '', processor.page_id or '' - )) + ) workspace.mets.add_agent( name=name, _type='OTHER', From 53b1854e139f66e3061d2e4feae5411c9b8d092a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 21 Oct 2024 12:47:33 +0000 Subject: [PATCH 213/228] Processor.process_workspace: improve reporting, raise early if too many failures already (rate will be too low) --- src/ocrd/processor/base.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 7ff271ecab..46b07c7161 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -22,6 +22,7 @@ import tarfile import io import weakref +from collections import defaultdict from frozendict import frozendict # concurrent.futures is buggy in py38, # this is where the fixes came from: @@ -528,9 +529,10 @@ def submit(self, fn, *args, **kwargs): raise err from None def _process_workspace_run(self, 
executor, max_seconds): + # aggregate info for logging: nr_succeeded = 0 - nr_skipped = 0 - nr_copied = 0 + nr_failed = 0 + nr_errors = defaultdict(int) # count causes tasks = {} for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): @@ -572,8 +574,8 @@ def _process_workspace_run(self, executor, max_seconds): # so we instead apply the timeout within the worker function task.result() nr_succeeded += 1 - # exclude NotImplementedError, so we can try process() below except NotImplementedError: + # exclude NotImplementedError, so we can try process() below raise # handle input failures separately except FileExistsError as err: @@ -587,24 +589,35 @@ def _process_workspace_run(self, executor, max_seconds): # broad coverage of output failures (including TimeoutError) except Exception as err: # FIXME: add re-usable/actionable logging + nr_errors[err.__class__.__name__] += 1 + nr_failed += 1 if config.OCRD_MISSING_OUTPUT == 'ABORT': self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") raise err self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") if config.OCRD_MISSING_OUTPUT == 'SKIP': - nr_skipped += 1 + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: + # already irredeemably many failures, stop short + raise Exception(f"too many failures with skipped output ({nr_failed} of {nr_failed+nr_succeeded})") continue if config.OCRD_MISSING_OUTPUT == 'COPY': + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: + # already irredeemably many failures, stop short + raise Exception(f"too many failures with fallback-copied output ({nr_failed} of {nr_failed+nr_succeeded})") self._copy_page_file(input_files[0]) - nr_copied += 1 else: desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) raise ValueError(f"unknown configuration value 
{config.OCRD_MISSING_OUTPUT} - {desc}") - if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with skipped output ({nr_skipped})") - if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with fallback output ({nr_skipped})") + if nr_failed > 0: + nr_all = nr_succeeded + nr_failed + if config.OCRD_MISSING_OUTPUT == 'SKIP': + reason = "skipped" + if config.OCRD_MISSING_OUTPUT == 'COPY': + reason = "fallback-copied" + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all})") + self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(dict(nr_errors))) def _copy_page_file(self, input_file : OcrdFileType) -> None: """ From 4d66e3702dfdd1063307ab09c33126ddc2f930a2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 23 Oct 2024 22:12:57 +0000 Subject: [PATCH 214/228] Processor: refactor process_workspace into overridable subfuncs --- repo/spec | 2 +- src/ocrd/processor/base.py | 299 +++++++++++++++++++++++++------------ 2 files changed, 201 insertions(+), 100 deletions(-) diff --git a/repo/spec b/repo/spec index df2a07e3fd..506b33936d 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 +Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 46b07c7161..85a0dea212 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -16,7 +16,7 @@ import os from os import getcwd from pathlib import Path -from typing import Any, List, Optional, Union, get_args +from typing import Any, Dict, List, Optional, Tuple, Union, get_args import sys import inspect import tarfile @@ -26,7 +26,7 @@ from frozendict import frozendict # concurrent.futures is 
buggy in py38, # this is where the fixes came from: -from loky import ProcessPoolExecutor +from loky import Future, ProcessPoolExecutor import multiprocessing as mp from threading import Timer from _thread import interrupt_main @@ -111,6 +111,31 @@ def __init__(self, fileGrp, pageId, mimetype): f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") super().__init__(self.message) +class DummyFuture: + """ + Mimics some of `concurrent.futures.Future` but runs immediately. + """ + def __init__(self, fn, *args, **kwargs): + self.fn = fn + self.args = args + self.kwargs = kwargs + def result(self): + return self.fn(*self.args, **self.kwargs) +class DummyExecutor: + """ + Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs + everything immediately in this process. + """ + def __init__(self, initializer=None, initargs=(), **kwargs): + initializer(*initargs) + def shutdown(self, **kwargs): + pass + def submit(self, fn, *args, **kwargs) -> DummyFuture: + return DummyFuture(fn, *args, **kwargs) + +TFuture = Union[DummyFuture, Future] +TExecutor = Union[DummyExecutor, ProcessPoolExecutor] + class Processor(): """ A processor is a tool that implements the uniform OCR-D @@ -462,6 +487,9 @@ def process_workspace(self, workspace: Workspace) -> None: for the given :py:data:`page_id` (or all pages) under the given :py:data:`parameter`. + Delegates to :py:meth:`.process_workspace_submit_tasks` + and :py:meth:`.process_workspace_handle_tasks`. + (This will iterate over pages and files, calling :py:meth:`.process_page_file` and handling exceptions. It should be overridden by subclasses to handle cases @@ -484,24 +512,6 @@ def process_workspace(self, workspace: Workspace) -> None: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - class DummyExecutor: - """ - Mimics some of ProcessPoolExecutor but runs everything - immediately in this process. 
- """ - class DummyFuture: - def __init__(self, fn, *args, **kwargs): - self.fn = fn - self.args = args - self.kwargs = kwargs - def result(self): - return self.fn(*self.args, **self.kwargs) - def __init__(self, initializer=None, initargs=(), **kwargs): - initializer(*initargs) - def shutdown(self, **kwargs): - pass - def submit(self, fn, *args, **kwargs): - return DummyExecutor.DummyFuture(fn, *args, **kwargs) if max_workers > 1: executor_cls = ProcessPoolExecutor else: @@ -516,7 +526,8 @@ def submit(self, fn, *args, **kwargs): ) try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) - self._process_workspace_run(executor, max_seconds) + tasks = self.process_workspace_submit_tasks(executor, max_seconds) + stats = self.process_workspace_handle_tasks(tasks) finally: executor.shutdown(kill_workers=True) @@ -528,96 +539,186 @@ def submit(self, fn, *args, **kwargs): # suppress the NotImplementedError context raise err from None - def _process_workspace_run(self, executor, max_seconds): - # aggregate info for logging: - nr_succeeded = 0 - nr_failed = 0 - nr_errors = defaultdict(int) # count causes - + def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]: + """ + Look up all input files of the given ``workspace`` + from the given :py:data:`input_file_grp` + for the given :py:data:`page_id` (or all pages), + and schedules calling :py:meth:`.process_page_file` + on them for each page via `executor` (enforcing + a per-page time limit of `max_seconds`). + + When running with `OCRD_MAX_PARALLEL_PAGES>1` and + the workspace via METS Server, the executor will fork + this many worker parallel subprocesses each processing + one page at a time. (Interprocess communication is + done via task and result queues.) + + Otherwise, tasks are run sequentially in the + current process. 
+ + Delegates to :py:meth:`.zip_input_files` to get + the input files for each page, and then calls + :py:meth:`.process_workspace_submit_page_task`. + + Returns a dict mapping the per-page tasks + (i.e. futures submitted to the executor) + to their corresponding pageId and input files. + """ tasks = {} for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) - page_id = next(input_file.pageId - for input_file in input_file_tuple - if input_file) - self._base_logger.info(f"preparing page {page_id}") - for i, input_file in enumerate(input_file_tuple): - if input_file is None: - # file/page not found in this file grp - continue - input_files[i] = input_file - if not self.download: - continue - try: - input_files[i] = self.workspace.download_file(input_file) - except (ValueError, FileNotFoundError, HTTPError) as e: - self._base_logger.error(repr(e)) - self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") - # process page - #tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) - tasks[executor.submit(_page_worker, max_seconds, *input_files)] = (page_id, input_files) + task, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, input_file_tuple) + tasks[task] = (page_id, input_files) self._base_logger.debug("submitted %d processing tasks", len(tasks)) + return tasks + def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]: + """ + Ensure all input files for a single page are + downloaded to the workspace, then schedule + :py:meth:`.process_process_file` to be run on + them via `executor` (enforcing a per-page time + limit of `max_seconds`). 
+ + Delegates to :py:meth:`.process_page_file` + (wrapped in :py:func:`_page_worker` to share + the processor instance across forked processes). + + \b + Returns a tuple of: + - the scheduled future object, + - the corresponding pageId, + - the corresponding input files. + """ + input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if input_file) + self._base_logger.info(f"preparing page {page_id}") + for i, input_file in enumerate(input_file_tuple): + if input_file is None: + # file/page not found in this file grp + continue + input_files[i] = input_file + if not self.download: + continue + try: + input_files[i] = self.workspace.download_file(input_file) + except (ValueError, FileNotFoundError, HTTPError) as e: + self._base_logger.error(repr(e)) + self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") + # process page + #executor.submit(self.process_page_file, *input_files) + return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files + + def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]: + """ + Look up scheduled per-page futures one by one, + handle errors (exceptions) and gather results. + + \b + Enforces policies configured by the following + environment variables: + - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite) + - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy) + - `OCRD_MAX_MISSING_OUTPUTS` (abort after all). + + \b + Returns a tuple of: + - the number of successfully processed pages + - the number of failed (i.e. skipped or copied) pages + - a dict of the type and corresponding number of exceptions seen + - the number of total requested pages (i.e. success+fail+existing). + + Delegates to :py:meth:`.process_workspace_handle_page_task` + for each page. 
+ """ + # aggregate info for logging: + nr_succeeded = 0 + nr_failed = 0 + nr_errors = defaultdict(int) # count causes + if config.OCRD_MISSING_OUTPUT == 'SKIP': + reason = "skipped" + elif config.OCRD_MISSING_OUTPUT == 'COPY': + reason = "fallback-copied" for task in tasks: # wait for results, handle errors page_id, input_files = tasks[task] - # FIXME: differentiate error cases in various ways: - # - ResourceNotFoundError → use ResourceManager to download (once), then retry - # - transient (I/O or OOM) error → maybe sleep, retry - # - persistent (data) error → skip / dummy / raise - try: - self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - # timeout kwarg on future is useless: it only raises TimeoutError here, - # but does not stop the running process/thread, and executor offers nothing - # to that effect: - # task.result(timeout=max_seconds or None) - # so we instead apply the timeout within the worker function - task.result() - nr_succeeded += 1 - except NotImplementedError: - # exclude NotImplementedError, so we can try process() below - raise - # handle input failures separately - except FileExistsError as err: - if config.OCRD_EXISTING_OUTPUT == 'ABORT': - raise err - if config.OCRD_EXISTING_OUTPUT == 'SKIP': - continue - if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': - # too late here, must not happen - raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") - # broad coverage of output failures (including TimeoutError) - except Exception as err: - # FIXME: add re-usable/actionable logging - nr_errors[err.__class__.__name__] += 1 + result = self.process_workspace_handle_page_task(page_id, input_files, task) + if isinstance(result, Exception): + nr_errors[result.__class__.__name__] += 1 nr_failed += 1 - if config.OCRD_MISSING_OUTPUT == 'ABORT': - self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - raise err - 
self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - if config.OCRD_MISSING_OUTPUT == 'SKIP': - if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: - # already irredeemably many failures, stop short - raise Exception(f"too many failures with skipped output ({nr_failed} of {nr_failed+nr_succeeded})") - continue - if config.OCRD_MISSING_OUTPUT == 'COPY': - if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: - # already irredeemably many failures, stop short - raise Exception(f"too many failures with fallback-copied output ({nr_failed} of {nr_failed+nr_succeeded})") - self._copy_page_file(input_files[0]) - else: - desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) - raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") - + # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: + # already irredeemably many failures, stop short + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded})") + elif result: + nr_succeeded += 1 + # else skipped - already exists + nr_errors = dict(nr_errors) if nr_failed > 0: nr_all = nr_succeeded + nr_failed - if config.OCRD_MISSING_OUTPUT == 'SKIP': - reason = "skipped" - if config.OCRD_MISSING_OUTPUT == 'COPY': - reason = "fallback-copied" if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all})") - self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(dict(nr_errors))) + self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) + return nr_succeeded, nr_failed, 
nr_errors, len(tasks) + + def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]: + """ + \b + Await a single page result and handle errors (exceptions), + enforcing policies configured by the following + environment variables: + - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite) + - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy) + - `OCRD_MAX_MISSING_OUTPUTS` (abort after all). + + \b + Returns + - true in case of success + - false in case the output already exists + - the exception in case of failure + """ + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id) + # timeout kwarg on future is useless: it only raises TimeoutError here, + # but does not stop the running process/thread, and executor itself + # offers nothing to that effect: + # task.result(timeout=max_seconds or None) + # so we instead applied the timeout within the worker function + task.result() + return True + except NotImplementedError: + # exclude NotImplementedError, so we can try process() below + raise + # handle input failures separately + except FileExistsError as err: + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + return False + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # broad coverage of output failures (including TimeoutError) + except Exception as err: + # FIXME: add re-usable/actionable logging + if config.OCRD_MISSING_OUTPUT == 'ABORT': + self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + raise err + 
self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + if config.OCRD_MISSING_OUTPUT == 'SKIP': + pass + elif config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") + return err def _copy_page_file(self, input_file : OcrdFileType) -> None: """ From 71d6d496fdc42bdc9c7b338b1ce78d593b36555d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 30 Oct 2024 20:31:18 +0000 Subject: [PATCH 215/228] Processor.process_workspace_handle_page_task: do not handler sigint --- src/ocrd/processor/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 85a0dea212..297b34647f 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -704,6 +704,8 @@ def process_workspace_handle_page_task(self, page_id : str, input_files : List[O if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': # too late here, must not happen raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + except KeyboardInterrupt: + raise # broad coverage of output failures (including TimeoutError) except Exception as err: # FIXME: add re-usable/actionable logging @@ -1113,6 +1115,7 @@ def _page_worker_set_ctxt(processor): """ global _page_worker_processor _page_worker_processor = processor + def _page_worker(timeout, *input_files): """ Wraps a `Processor.process_page_file` call as payload (call target) From d2d5290a0fb789979b1ce29690f9e93f64c61c1f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 30 Oct 2024 20:32:22 +0000 Subject: [PATCH 216/228] Processor.process_workspace_handle_tasks: log nr of ignored exceptions in the end --- src/ocrd/processor/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py 
b/src/ocrd/processor/base.py index 297b34647f..87e6731dfa 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -529,7 +529,7 @@ def process_workspace(self, workspace: Workspace) -> None: tasks = self.process_workspace_submit_tasks(executor, max_seconds) stats = self.process_workspace_handle_tasks(tasks) finally: - executor.shutdown(kill_workers=True) + executor.shutdown(kill_workers=True, wait=False) except NotImplementedError: # fall back to deprecated method @@ -651,7 +651,8 @@ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[O # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: # already irredeemably many failures, stop short - raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded})") + nr_errors = dict(nr_errors) + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})") elif result: nr_succeeded += 1 # else skipped - already exists @@ -659,8 +660,8 @@ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[O if nr_failed > 0: nr_all = nr_succeeded + nr_failed if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all})") - self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})") + self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) return nr_succeeded, nr_failed, nr_errors, len(tasks) def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]: 
From 7d1503ebc40d4bd03d6c6e6a9813e8d6279a70a0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 30 Oct 2024 22:47:18 +0100 Subject: [PATCH 217/228] :package: v3.0.0b6 --- CHANGELOG.md | 23 +++++++++++++++++++++++ VERSION | 2 +- repo/spec | 2 +- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index abbfd5a4d8..da422654bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,28 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0b6] - 2024-10-30 + +Fixed: + - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds` + +Changed: + - :fire: `ocrd_utils.initLogging`: also add handler to root logger (as in file config), + but disable message propagation to avoid duplication + - only import `ocrd_network` in `src/ocrd/decorators/__init__.py` once needed + - `Processor.process_page_file`: skip computing `process_page_pcgts` if output already exists, + but `OCRD_EXISTING_OUTPUT!=OVERWRITE` + - :fire: `OCRD_MAX_PARALLEL_PAGES>1`: switch from multithreading to multiprocessing, depend on + `loky` instead of stdlib `concurrent.futures` + - `OCRD_PROCESSING_PAGE_TIMEOUT>0`: actually enforce timeout within worker + - `OCRD_MAX_MISSING_OUTPUTS>0`: abort early if too many failures already, prospectively + - `Processor.process_workspace`: split up into overridable sub-methods: + - `process_workspace_submit_tasks` (iterate input file group and schedule page tasks) + - `process_workspace_submit_page_task` (download input files and submit single page task) + - `process_workspace_handle_tasks` (monitor page tasks and aggregate results) + - `process_workspace_handle_page_task` (await single page task and handle errors) + + ## [3.0.0b5] - 2024-09-16 Fixed: @@ -2287,6 +2309,7 @@ Fixed Initial Release +[3.0.0b6]: ../../compare/v3.0.0b6..v3.0.0b5 [3.0.0b5]: ../../compare/v3.0.0b5..v3.0.0b4 [3.0.0b4]: ../../compare/v3.0.0b4..v3.0.0b3 [3.0.0b3]: 
../../compare/v3.0.0b3..v3.0.0b2 diff --git a/VERSION b/VERSION index 09fb39d267..43662e8c29 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b5 +3.0.0b6 diff --git a/repo/spec b/repo/spec index 506b33936d..df2a07e3fd 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 +Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 From 08a631ccc89401724caf32b3211529abc0a13382 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:27:25 +0000 Subject: [PATCH 218/228] tests: prevent side effects from ocrd_logging --- tests/base.py | 2 -- tests/cli/test_log.py | 11 +++++-- tests/processor/test_processor.py | 32 +++++++++++++------- tests/test_decorators.py | 17 +++++------ tests/test_logging.py | 6 ++++ tests/test_logging_conf.py | 49 +++++++++++++------------------ tests/test_mets_server.py | 28 ++++++++++++------ 7 files changed, 83 insertions(+), 62 deletions(-) diff --git a/tests/base.py b/tests/base.py index 53f393e08d..9eb1f20db8 100644 --- a/tests/base.py +++ b/tests/base.py @@ -26,8 +26,6 @@ class TestCase(VanillaTestCase): def setUp(self): chdir(dirname(realpath(__file__)) + '/..') - disableLogging() - initLogging(builtin_only=True) class CapturingTestCase(TestCase): """ diff --git a/tests/cli/test_log.py b/tests/cli/test_log.py index c63d78c318..3d81e8266b 100644 --- a/tests/cli/test_log.py +++ b/tests/cli/test_log.py @@ -6,8 +6,8 @@ from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory from ocrd.decorators import ocrd_loglevel -from ocrd_utils import setOverrideLogLevel, logging, disableLogging -import logging as python_logging +from ocrd_utils import disableLogging, initLogging +import logging @click.group() @ocrd_loglevel @@ -18,14 +18,19 @@ def mock_ocrd_cli(log_level): class TestLogCli(TestCase): def _get_log_output(self, *args): - disableLogging() code, out, err = self.invoke_cli(mock_ocrd_cli, args) print({'code': code, 'out': out, 'err': 
err}) return err + def setUp(self): + super().setUp() + initLogging() + def tearDown(self): if 'OCRD_TOOL_NAME' in ENV: del(ENV['OCRD_TOOL_NAME']) + super().tearDown() + disableLogging() def test_loglevel(self): assert 'DEBUG ocrd.log_cli - foo' not in self._get_log_output('log', 'debug', 'foo') diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 5844cb8774..06c129c3ca 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -27,21 +27,21 @@ class TestProcessor(TestCase): + def run(self, result=None): + with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as workdir: + with pushd_popd(workdir): + self.resolver = Resolver() + self.workspace = self.resolver.workspace_from_url('mets.xml') + super().run(result=result) + def setUp(self): super().setUp() - # make sure we get an isolated temporary copy of the testdata each time - # as long as we are not using pytest but unittest, we need to manage contexts - # (enterContext is only supported starting with py311) - with ExitStack() as stack: - self.resolver = Resolver() - self.workdir = stack.enter_context(copy_of_directory(assets.path_to('SBB0000F29300010000/data'))) - stack.enter_context(pushd_popd(self.workdir)) - self.workspace = self.resolver.workspace_from_url('mets.xml') - self.addCleanup(stack.pop_all().close) + initLogging() def tearDown(self): super().tearDown() config.reset_defaults() + disableLogging() def test_incomplete_processor(self): proc = IncompleteProcessor(None) @@ -423,6 +423,7 @@ def ocrd_tool(self): def test_run_output_metsserver(start_mets_server): mets_server_url, ws = start_mets_server + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0 # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 run_processor(DummyProcessorWithOutputSleep, workspace=ws, @@ -446,22 +447,33 @@ def test_run_output_metsserver(start_mets_server): parameter={"sleep": 0}, mets_server_url=mets_server_url) assert 
"already exists" in str(exc.value) + config.reset_defaults() # 2s (+ 2s tolerance) instead of 3*3s (+ 2s tolerance) -@pytest.mark.timeout(4) +# fixme: pytest-timeout does not shut down / finalize the fixture properly +# (regardless of method or func_only), so the next test in the suite +# does not execute ("previous item was not torn down properly") +# so we must instead wait for completion and assert on the time spent... +#@pytest.mark.timeout(timeout=4, func_only=True, method="signal") def test_run_output_parallel(start_mets_server): + import time mets_server_url, ws = start_mets_server + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0 # do not raise for single-page timeout config.OCRD_PROCESSING_PAGE_TIMEOUT = -1 # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MAX_PARALLEL_PAGES = 3 + start_time = time.time() run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", parameter={"sleep": 2}, mets_server_url=mets_server_url) + run_time = time.time() - start_time + assert run_time < 3, f"run_processor took {run_time}s" assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.reset_defaults() if __name__ == "__main__": main(__file__) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index c36577020a..561fdc762d 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -41,22 +41,20 @@ def cli_dummy_processor(*args, **kwargs): class TestDecorators(TestCase): - def setUp(self): - super().setUp() - disableLogging() - def tearDown(self): super().tearDown() config.reset_defaults() + disableLogging() def test_minimal(self): - exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) - print(out, err) - assert not exit_code + initLogging() + code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) + assert not code, (out, err) def 
test_loglevel_invalid(self): - code, _, err = self.invoke_cli(cli_with_ocrd_loglevel, ['--log-level', 'foo']) - assert code + initLogging() + code, out, err = self.invoke_cli(cli_with_ocrd_loglevel, ['--log-level', 'foo']) + assert code, (out, err) import click if int(click.__version__[0]) < 8: assert 'invalid choice: foo' in err @@ -67,7 +65,6 @@ def test_loglevel_override(self): if get_logging_config_files(): pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test") import logging - disableLogging() assert logging.getLogger('').getEffectiveLevel() == logging.WARNING assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING initLogging() diff --git a/tests/test_logging.py b/tests/test_logging.py index c2b6913b10..091fc25bee 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -26,16 +26,22 @@ class TestLogging(TestCase): def setUp(self): pass # do not chdir + def tearDown(self): + super().tearDown() + disableLogging() + def test_loglevel_inheritance(self): initLogging(builtin_only=True) ocrd_logger = logging.getLogger('ocrd') assert ocrd_logger.getEffectiveLevel() == logging.INFO some_logger = getLogger('ocrd.foo') + assert some_logger.level == logging.NOTSET assert some_logger.getEffectiveLevel() == logging.INFO setOverrideLogLevel('ERROR') assert ocrd_logger.getEffectiveLevel() == logging.ERROR assert some_logger.getEffectiveLevel() == logging.ERROR another_logger = getLogger('ocrd.bar') + assert another_logger.level == logging.NOTSET assert another_logger.getEffectiveLevel() == logging.ERROR def test_getLevelName(self): diff --git a/tests/test_logging_conf.py b/tests/test_logging_conf.py index f8e0e9e894..0717674103 100644 --- a/tests/test_logging_conf.py +++ b/tests/test_logging_conf.py @@ -21,74 +21,67 @@ # sys.path.append(os.path.dirname(os.path.realpath(__file__)) + '/../ocrd') TEST_ROOT = pathlib.Path(os.path.dirname(os.path.abspath(__file__))).parent -def resetLogging(): - disableLogging() - 
initLogging() - - @pytest.fixture(name="logging_conf") -def _fixture_logging_conf(tmpdir): +def _fixture_logging_conf(tmpdir, capfd): path_logging_conf_orig = os.path.join( str(TEST_ROOT), 'src', 'ocrd_utils', 'ocrd_logging.conf') path_logging_conf_dest = os.path.join(str(tmpdir), 'ocrd_logging.conf') shutil.copy(path_logging_conf_orig, path_logging_conf_dest) - return str(tmpdir) + with pushd_popd(tmpdir): + with capfd.disabled(): + initLogging() + yield str(tmpdir) + disableLogging() -def test_configured_dateformat(logging_conf, capsys): +def test_configured_dateformat(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and produces desired record format""" # arrange - with pushd_popd(logging_conf): - resetLogging() - test_logger = getLogger('') + test_logger = getLogger('ocrd') - # act - test_logger.info("test logger initialized") + # act + test_logger.info("test logger initialized") - log_info_output = capsys.readouterr().err - must_not_match = r"^\d{4}-\d{2}-\d{2}.*" - assert not re.match(must_not_match, log_info_output) - match_pattern = r"^\d{2}:\d{2}:\d{2}.*" - assert re.match(match_pattern, log_info_output) + log_info_output = capfd.readouterr().err + must_not_match = r"^\d{4}-\d{2}-\d{2}.*" + assert not re.match(must_not_match, log_info_output) + match_pattern = r"^\d{2}:\d{2}:\d{2}.*" + assert re.match(match_pattern, log_info_output), log_info_output -def test_configured_tensorflow_logger_present(logging_conf, capsys): +def test_configured_tensorflow_logger_present(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and contains logger tensorflow""" # arrange - os.chdir(logging_conf) - resetLogging() logger_under_test = getLogger('tensorflow') # act info logger_under_test.info("tensorflow logger initialized") - log_info_output = capsys.readouterr().err + log_info_output = capfd.readouterr().err assert not log_info_output # act error logger_under_test.error("tensorflow has error") - log_error_output = capsys.readouterr().err 
+ log_error_output = capfd.readouterr().err assert log_error_output -def test_configured_shapely_logger_present(logging_conf, capsys): +def test_configured_shapely_logger_present(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and contains logger shapely.geos""" # arrange - os.chdir(logging_conf) - resetLogging() logger_under_test = getLogger('shapely.geos') # act info logger_under_test.info("shapely.geos logger initialized") - log_info_output = capsys.readouterr().err + log_info_output = capfd.readouterr().err assert not log_info_output # act error logger_under_test.error("shapely alert") - log_error_output = capsys.readouterr().err + log_error_output = capfd.readouterr().err assert log_error_output if __name__ == '__main__': diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index dc94d6c560..3bb96535c0 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -22,20 +22,17 @@ from requests.exceptions import ConnectionError from ocrd import Resolver, OcrdMetsServer, Workspace -from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel +from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel, disableLogging, getLogger TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] -initLogging() -setOverrideLogLevel(10) - @fixture(scope='function', name='start_mets_server', params=TRANSPORTS) def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]: - tmpdir = str(tmpdir) - def _start_mets_server(*args, **kwargs): - mets_server = OcrdMetsServer(*args, **kwargs) - mets_server.startup() + initLogging() + #setOverrideLogLevel(10) + logger = getLogger('ocrd') + tmpdir = str(tmpdir) mets_server_url = request.param if mets_server_url == TRANSPORTS[0]: @@ -47,13 +44,26 @@ def _start_mets_server(*args, **kwargs): copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir) workspace = Workspace(Resolver(), tmpdir) - p = 
Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param}) + class MetsServerProcess(Process): + def __init__(self, *args, **kwargs): + self.server = OcrdMetsServer(*args, **kwargs) + super().__init__() + def run(self): + self.server.startup() + def terminate(self): + self.server.workspace.save_mets() + super().terminate() + p = MetsServerProcess(workspace=workspace, url=request.param) p.start() + logger.info("started METS Server") sleep(1) # sleep to start up server workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url) yield mets_server_url, workspace_server p.terminate() + p.join() + logger.info("terminated METS Server") rmtree(tmpdir, ignore_errors=True) + disableLogging() def add_file_server(x, force=False): mets_server_url, directory, i = x From f3e423ac52f5293596cf88ac2031384857be4145 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:36:17 +0000 Subject: [PATCH 219/228] initLogging: do not remove any previous handlers/levels --- src/ocrd_utils/logging.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index dfac74988b..404ac7ddbc 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -161,18 +161,6 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L global _initialized_flag if _initialized_flag and not force_reinit: return - # disableLogging() - - # https://docs.python.org/3/library/logging.html#logging.disable - # If logging.disable(logging.NOTSET) is called, it effectively removes this - # overriding level, so that logging output again depends on the effective - # levels of individual loggers. 
- logging.disable(logging.NOTSET) - - # remove all handlers for the ocrd root loggers - for logger_name in ROOT_OCRD_LOGGERS: - for handler in logging.getLogger(logger_name).handlers[:]: - logging.getLogger(logger_name).removeHandler(handler) config_file = None if not builtin_only: From 31435187dffb43c692f24f3108f24d0ed1093cfd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:38:44 +0000 Subject: [PATCH 220/228] initLogging: only add root handler instead of multiple redundant handlers with propagate=false --- src/ocrd_utils/logging.py | 7 ++----- src/ocrd_utils/ocrd_logging.conf | 28 +++++++++++++--------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 404ac7ddbc..7f59221c8e 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -179,11 +179,8 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L ocrd_handler = logging.StreamHandler(stream=sys.stderr) ocrd_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT)) ocrd_handler.setLevel(logging.DEBUG) - for logger_name in ROOT_OCRD_LOGGERS: - logger = logging.getLogger(logger_name) - logger.addHandler(ocrd_handler) - if logger_name: - logger.propagate = False # avoid duplication (from root handler) + root_logger = logging.getLogger('') + root_logger.addHandler(ocrd_handler) for logger_name, logger_level in LOGGING_DEFAULTS.items(): logging.getLogger(logger_name).setLevel(logger_level) _initialized_flag = True diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 5cf161398e..0af039b2ac 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -56,22 +56,22 @@ handlers=consoleHandler,fileHandler # ocrd loggers [logger_ocrd] level=INFO -handlers=consoleHandler,fileHandler +handlers= qualname=ocrd -propagate=0 [logger_ocrd_network] level=INFO 
-handlers=consoleHandler,processingServerHandler +#handlers=consoleHandler,processingServerHandler +handlers=processingServerHandler qualname=ocrd_network -propagate=0 +#propagate=0 # # logger tensorflow # [logger_ocrd_tensorflow] level=ERROR -handlers=consoleHandler +handlers= qualname=tensorflow # @@ -79,7 +79,7 @@ qualname=tensorflow # [logger_ocrd_shapely_geos] level=ERROR -handlers=consoleHandler +handlers= qualname=shapely.geos @@ -88,7 +88,7 @@ qualname=shapely.geos # [logger_ocrd_PIL] level=INFO -handlers=consoleHandler +handlers= qualname=PIL # @@ -96,34 +96,32 @@ qualname=PIL # [logger_paramiko] level=INFO -handlers=consoleHandler +handlers= qualname=paramiko -propagate=0 [logger_paramiko_transport] level=INFO -handlers=consoleHandler +handlers= qualname=paramiko.transport -propagate=0 # # uvicorn loggers # [logger_uvicorn] level=INFO -handlers=consoleHandler +handlers= qualname=uvicorn [logger_uvicorn_access] level=WARN -handlers=consoleHandler +handlers= qualname=uvicorn.access [logger_uvicorn_error] level=INFO -handlers=consoleHandler +handlers= qualname=uvicorn.error [logger_multipart] level=INFO -handlers=consoleHandler +handlers= qualname=multipart From 27323c665edc608958a484ce7ae4aebaa65f45f6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:41:20 +0000 Subject: [PATCH 221/228] disableLogging: remove all handlers, reset all levels --- src/ocrd_utils/logging.py | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 7f59221c8e..db7921c843 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -46,13 +46,6 @@ 'setOverrideLogLevel', ] -# These are the loggers we add handlers to -ROOT_OCRD_LOGGERS = [ - '', - 'ocrd', - 'ocrd_network' -] - LOGGING_DEFAULTS = { 'ocrd': logging.INFO, 'ocrd_network': logging.INFO, @@ -196,24 +189,16 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): if 
_initialized_flag and not silent: print("[LOGGING] Disabling logging", file=sys.stderr) _initialized_flag = False - # logging.basicConfig(level=logging.CRITICAL) - # logging.disable(logging.ERROR) - # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS: - for handler in logging.getLogger(logger_name).handlers[:]: - logging.getLogger(logger_name).removeHandler(handler) - for logger_name in LOGGING_DEFAULTS: - logging.getLogger(logger_name).setLevel(logging.NOTSET) + # remove all handlers we might have added (via initLogging on builtin or file config) + for logger_name in logging.root.manager.loggerDict: + if not silent: + print(f'[LOGGING] Resetting {logger_name} log level and handlers') + logger = logging.getLogger(logger_name) + logger.setLevel(logging.NOTSET) + for handler in logger.handlers[:]: + logger.removeHandler(handler) + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) # Python default log level is WARNING logging.root.setLevel(logging.WARNING) -# Initializing stream handlers at module level -# would cause message output in all runtime contexts, -# including those which are already run for std output -# (--dump-json, --version, ocrd-tool, bashlib etc). 
-# So this needs to be an opt-in from the CLIs/decorators: -#initLogging() -# Also, we even have to block log output for libraries -# (like matplotlib/tensorflow) which set up logging -# themselves already: -disableLogging() From eb3120d77fab33ce2da91515dc452ffe438833e9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:42:52 +0000 Subject: [PATCH 222/228] setOverrideLogLevel: override all currently active loggers' level --- src/ocrd_utils/logging.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index db7921c843..98c2f58b2c 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -107,18 +107,15 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG): lvl (string): Log level name. silent (boolean): Whether to log the override call """ - if not _initialized_flag: - initLogging(silent=silent) - ocrd_logger = logging.getLogger('ocrd') - - if lvl is None: - if not silent: - print('[LOGGING] Reset log level override', file=sys.stderr) - ocrd_logger.setLevel(logging.NOTSET) - else: - if not silent: - print(f'[LOGGING] Overriding ocrd log level to {lvl}', file=sys.stderr) - ocrd_logger.setLevel(lvl) + if lvl is not None: + lvl = getLevelName(lvl) + if not _initialized_flag: + initLogging(silent=silent) + # affect all configured loggers + for logger_name in logging.root.manager.loggerDict: + if not silent: + print(f'[LOGGING] Overriding {logger_name} log level to {lvl}', file=sys.stderr) + logging.getLogger(logger_name).setLevel(lvl) def get_logging_config_files(): """ From 0186c53795c0f32167a148172ea123906db79c41 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:43:40 +0000 Subject: [PATCH 223/228] logging: increase default root (not ocrd) level from INFO to WARNING --- src/ocrd_utils/logging.py | 1 + src/ocrd_utils/ocrd_logging.conf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 98c2f58b2c..ddb8b88b2a 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -47,6 +47,7 @@ ] LOGGING_DEFAULTS = { + '': logging.WARNING, 'ocrd': logging.INFO, 'ocrd_network': logging.INFO, # 'ocrd.resolver': logging.INFO, diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 0af039b2ac..41e6d5af7a 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -34,7 +34,7 @@ keys=defaultFormatter,detailedFormatter # default logger "root" using consoleHandler # [logger_root] -level=INFO +level=WARNING handlers=consoleHandler,fileHandler From 5ba27209d396c44eb4d5e53f784a9fd42167a9ee Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:44:06 +0000 Subject: [PATCH 224/228] Processor: update max_workers docstring --- src/ocrd/processor/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 87e6731dfa..f0d453f4ac 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -158,12 +158,12 @@ class Processor(): max_workers : int = -1 """ - maximum number of processor threads for page-parallel processing (ignored if negative), + maximum number of processor forks for page-parallel processing (ignored if negative), to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e. whatever is smaller). (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores - - at once, or if your class is not thread-safe.) + - at once, or if your class already creates threads prior to forking, e.g. during ``setup``.) 
""" max_page_seconds : int = -1 From f8f71d809207f3bf1fc94dbdb9525272c13cd286 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 Nov 2024 13:34:10 +0000 Subject: [PATCH 225/228] initLogging: call disableLogging if already initialized and force_reinit --- src/ocrd_utils/logging.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index ddb8b88b2a..52b01883f1 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -150,8 +150,11 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L - silent (bool): Whether to log logging behavior by printing to stderr """ global _initialized_flag - if _initialized_flag and not force_reinit: - return + if _initialized_flag: + if force_reinit: + disableLogging(silent=silent) + else: + return config_file = None if not builtin_only: From 5f2f602f5917d2f0970ff0fc15d64b148083b98b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 Nov 2024 16:02:44 +0000 Subject: [PATCH 226/228] Processor: replace weakref with __del__ to trigger shutdown --- src/ocrd/processor/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index f0d453f4ac..7ec77162ee 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -21,7 +21,6 @@ import inspect import tarfile import io -import weakref from collections import defaultdict from frozendict import frozendict # concurrent.futures is buggy in py38, @@ -366,12 +365,14 @@ def __init__( self._base_logger = getLogger('ocrd.processor.base') if parameter is not None: self.parameter = parameter - # ensure that shutdown gets called at destruction - self._finalizer = weakref.finalize(self, self.shutdown) # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with 
process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process'))) + def __del__(self): + self._base_logger.debug("shutting down") + self.shutdown() + def show_help(self, subcommand=None): """ Print a usage description including the standard CLI and all of this processor's ocrd-tool From 0446b82be55093536c5c0818de3b49d0aecc727a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 Nov 2024 23:23:15 +0000 Subject: [PATCH 227/228] Processor parallel pages: log via QueueHandler in subprocess, QueueListener in main --- repo/spec | 2 +- src/ocrd/processor/base.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/repo/spec b/repo/spec index df2a07e3fd..506b33936d 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 +Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 7ec77162ee..d6348b40e1 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -18,6 +18,8 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union, get_args import sys +import logging +import logging.handlers import inspect import tarfile import io @@ -515,22 +517,31 @@ def process_workspace(self, workspace: Workspace) -> None: if max_workers > 1: executor_cls = ProcessPoolExecutor + log_queue = mp.Queue() + # forward messages from log queue (in subprocesses) to all root handlers + log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True) else: executor_cls = DummyExecutor + log_queue = None + log_listener = None executor = executor_cls( max_workers=max_workers or 1, # only forking method avoids pickling context=mp.get_context('fork'), # share processor instance as global to avoid pickling initializer=_page_worker_set_ctxt, - initargs=(self,), + initargs=(self, log_queue), ) + if max_workers > 1: + 
log_listener.start() try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) tasks = self.process_workspace_submit_tasks(executor, max_seconds) stats = self.process_workspace_handle_tasks(tasks) finally: executor.shutdown(kill_workers=True, wait=False) + if max_workers > 1: + log_listener.stop() except NotImplementedError: # fall back to deprecated method @@ -1110,13 +1121,16 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): objects, and with the METS Server we do not mutate the local processor instance anyway. """ -def _page_worker_set_ctxt(processor): +def _page_worker_set_ctxt(processor, log_queue): """ Overwrites `ocrd.processor.base._page_worker_processor` instance for sharing with subprocesses in ProcessPoolExecutor initializer. """ global _page_worker_processor _page_worker_processor = processor + if log_queue: + # replace all log handlers with just one queue handler + logging.root.handlers = [logging.handlers.QueueHandler(log_queue)] def _page_worker(timeout, *input_files): """ From 53c4c18240684936d2cd4e87051b5bbcc57f9cb2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 Nov 2024 00:46:38 +0000 Subject: [PATCH 228/228] :package: v3.0.0b7 --- CHANGELOG.md | 13 +++++++++++++ VERSION | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da422654bc..04ea2d42a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,19 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [3.0.0b7] - 2024-11-12 + +Fixed: + - `initLogging`: only add root handler instead of multiple redundant handlers with `propagate=false` + - `setOverrideLogLevel`: override all currently active loggers' level + +Changed: + - :fire: logging: increase default root (not `ocrd`) level from `INFO` to `WARNING` + - :fire: `initLogging`: do not remove any previous handlers/levels, unless `force_reinit` + - :fire: `disableLogging`: remove all handlers, reset all levels - instead of being selective + - :fire: Processor: replace `weakref` with `__del__` to trigger `shutdown` + - :fire: `OCRD_MAX_PARALLEL_PAGES>1`: log via `QueueHandler` in subprocess, `QueueListener` in main + ## [3.0.0b6] - 2024-10-30 Fixed: diff --git a/VERSION b/VERSION index 43662e8c29..1129dfd443 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b6 +3.0.0b7