diff --git a/.circleci/config.yml b/.circleci/config.yml index a17560d..43fa5cc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -28,6 +28,7 @@ jobs: key: v01-pydeps-<< parameters.python-image >>-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }} paths: - "~/.cache/pip" + resource_class: large workflows: build: diff --git a/Makefile b/Makefile index f3164dc..e7fbc83 100644 --- a/Makefile +++ b/Makefile @@ -40,8 +40,6 @@ install: $(MODEL): ocrd resmgr download ocrd-calamari-recognize $@ - # Workaround, see #91 https://github.com/OCR-D/ocrd_calamari/issues/91 - fix-calamari1-model ~/.local/share/ocrd-resources/ocrd-calamari-recognize/$@ # Download example data (for the README) example: $(EXAMPLE) @@ -84,7 +82,7 @@ assets-clean: # Run unit tests test: test/assets $(MODEL) # declare -p HTTP_PROXY - $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) + $(PYTHON) -m pytest --continue-on-collection-errors --durations=0 test $(PYTEST_ARGS) # Run unit tests and determine test coverage coverage: test/assets $(MODEL) diff --git a/ocrd_calamari/config.py b/ocrd_calamari/config.py deleted file mode 100644 index 1729f8c..0000000 --- a/ocrd_calamari/config.py +++ /dev/null @@ -1,5 +0,0 @@ -import json - -from pkg_resources import resource_string - -OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8")) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 6bdb971..6bce4cb 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -11,12 +11,8 @@ "recognition/text-recognition" ], "description": "Recognize lines with Calamari", - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-OCR-CALAMARI" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "checkpoint_dir": { "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory", diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 1ab11f5..02d3702 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -1,25 +1,21 @@ from __future__ import absolute_import +from typing import Optional import itertools -import os from glob import glob +from concurrent.futures import ThreadPoolExecutor import numpy as np -from ocrd import Processor -from ocrd_modelfactory import page_from_file +from ocrd import Processor, OcrdPage, OcrdPageResult from ocrd_models.ocrd_page import ( CoordsType, GlyphType, TextEquivType, WordType, - to_xml, ) from ocrd_utils import ( - MIMETYPE_PAGE, - assert_file_grp_cardinality, + VERSION as OCRD_VERSION, coordinates_for_segment, - getLogger, - make_file_id, points_from_polygon, polygon_from_x0y0x1y1, tf_disable_interactive_logs, @@ -36,34 +32,40 @@ from calamari_ocr.ocr import MultiPredictor from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.proto import VoterParams +from tensorflow import config as tensorflow_config # ruff: isort: on -from ocrd_calamari.config import OCRD_TOOL - -TOOL = "ocrd-calamari-recognize" - +BATCH_SIZE = 64 +if not hasattr(itertools, 'batched'): + def batched(iterable, n): + # batched('ABCDEFG', 3) → ABC DEF G + if n < 1: + raise ValueError('n must be at least one') + iterator = iter(iterable) + while batch := tuple(itertools.islice(iterator, n)): + yield batch + itertools.batched = batched class CalamariRecognize(Processor): - def __init__(self, *args, **kwargs): - kwargs["ocrd_tool"] = OCRD_TOOL["tools"][TOOL] - kwargs["version"] = "%s (calamari %s, tensorflow %s)" % ( - OCRD_TOOL["version"], - calamari_version, - tensorflow_version, - ) - super(CalamariRecognize, self).__init__(*args, **kwargs) - if hasattr(self, "output_file_grp"): - # processing context - self.setup() + @property + def executable(self): + return 'ocrd-calamari-recognize' + + def show_version(self): + print(f"Version {self.version}, calamari {calamari_version}, tensorflow {tensorflow_version}, ocrd/core {OCRD_VERSION}") def setup(self): """ Set up the model prior to processing. """ + devices = tensorflow_config.list_physical_devices("GPU") + for device in devices: + self.logger.info("using GPU device %s", device) + tensorflow_config.experimental.set_memory_growth(device, True) resolved = self.resolve_resource(self.parameter["checkpoint_dir"]) checkpoints = glob("%s/*.ckpt.json" % resolved) - self.predictor = MultiPredictor(checkpoints=checkpoints) + self.predictor = MultiPredictor(checkpoints=checkpoints, batch_size=BATCH_SIZE) self.network_input_channels = self.predictor.predictors[ 0 @@ -85,285 +87,268 @@ def setup(self): voter_params.type = VoterParams.Type.Value(self.parameter["voter"].upper()) self.voter = voter_from_proto(voter_params) - def process(self): + # run in a background thread so GPU parts can be interleaved with CPU pre-/post-processing across pages + self.executor = ThreadPoolExecutor(max_workers=1) + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ - Perform text recognition with Calamari on the workspace. + Perform text recognition with Calamari. If ``texequiv_level`` is ``word`` or ``glyph``, then additionally create word / glyph level segments by splitting at white space characters / glyph boundaries. In the case of ``glyph``, add all alternative character hypotheses down to ``glyph_conf_cutoff`` confidence threshold. """ - log = getLogger("processor.CalamariRecognize") - - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - log.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) + pcgts = input_pcgts[0] + page = pcgts.get_Page() + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector=self.features + ) - page = pcgts.get_Page() - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector=self.features + lines = [] + for region in page.get_AllRegions(classes=["Text"]): + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector=self.features ) - for region in page.get_AllRegions(classes=["Text"]): - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector=self.features + textlines = region.get_TextLine() + self.logger.info( + "About to recognize %i lines of region '%s'", + len(textlines), + region.id, + ) + for line in textlines: + self.logger.debug( + "Recognizing line '%s' in region '%s'", line.id, region.id ) - textlines = region.get_TextLine() - log.info( - "About to recognize %i lines of region '%s'", - len(textlines), - region.id, + line_image, line_coords = self.workspace.image_from_segment( + line, + region_image, + region_coords, + feature_selector=self.features, ) - line_images_np = [] - line_coordss = [] - for line in textlines: - log.debug( - "Recognizing line '%s' in region '%s'", line.id, region.id + if ( + "binarized" not in line_coords["features"] + and "grayscale_normalized" not in line_coords["features"] + and self.network_input_channels == 1 + ): + # We cannot use a feature selector for this since we don't + # know whether the model expects (has been trained on) + # binarized or grayscale images; but raw images are likely + # always inadequate: + self.logger.warning( + "Using raw image for line '%s' in region '%s'", + line.id, + region.id, ) - line_image, line_coords = self.workspace.image_from_segment( - line, - region_image, - region_coords, - feature_selector=self.features, + if ( + not all(line_image.size) + or line_image.height <= 8 + or line_image.width <= 8 + or "binarized" in line_coords["features"] + and line_image.convert("1").getextrema()[0] == 255 + ): + # empty size or too tiny or no foreground at all: skip + self.logger.warning( + "Skipping empty line '%s' in region '%s'", + line.id, + region.id, ) - if ( - "binarized" not in line_coords["features"] - and "grayscale_normalized" not in line_coords["features"] - and self.network_input_channels == 1 - ): - # We cannot use a feature selector for this since we don't - # know whether the model expects (has been trained on) - # binarized or grayscale images; but raw images are likely - # always inadequate: - log.warning( - "Using raw image for line '%s' in region '%s'", - line.id, - region.id, - ) + continue + lines.append((line, line_coords, np.array(line_image, dtype=np.uint8))) + + if not len(lines): + self.logger.warning("No text lines on page '%s'", page_id) + return OcrdPageResult(pcgts) + + lines, coords, images = zip(*lines) + # not exposed in MultiPredictor yet, cf. calamari#361: + # results = self.executor.submit(self.predictor.predict_raw, images, progress_bar=False, batch_size=BATCH_SIZE).result() + # avoid too large a batch size (causing OOM on CPU or GPU) + fun = lambda x: self.executor.submit(self.predictor.predict_raw, x, progress_bar=False).result() + results = itertools.chain.from_iterable( + map(fun, itertools.batched(images, BATCH_SIZE))) + for line, line_coords, raw_results in zip(lines, coords, results): + for i, p in enumerate(raw_results): + p.prediction.id = "fold_{}".format(i) + + prediction = self.voter.vote_prediction_result(raw_results) + prediction.id = "voted" + + # Build line text on our own + # + # Calamari does whitespace post-processing on prediction.sentence, + # while it does not do the same on prediction.positions. Do it on + # our own to have consistency. + # + # XXX Check Calamari's built-in post-processing on + # prediction.sentence + + def _sort_chars(p): + """Filter and sort chars of prediction p""" + chars = p.chars + chars = [ + c for c in chars if c.char + ] # XXX Note that omission probabilities are not normalized?! + chars = [ + c + for c in chars + if c.probability >= self.parameter["glyph_conf_cutoff"] + ] + chars = sorted(chars, key=lambda k: k.probability, reverse=True) + return chars + + def _drop_leading_spaces(positions): + return list( + itertools.dropwhile( + lambda p: _sort_chars(p)[0].char == " ", positions + ) + ) - if ( - not all(line_image.size) - or line_image.height <= 8 - or line_image.width <= 8 - or "binarized" in line_coords["features"] - and line_image.convert("1").getextrema()[0] == 255 - ): - # empty size or too tiny or no foreground at all: skip - log.warning( - "Skipping empty line '%s' in region '%s'", - line.id, - region.id, - ) - line_image_np = np.array([[0]], dtype=np.uint8) - else: - line_image_np = np.array(line_image, dtype=np.uint8) - line_images_np.append(line_image_np) - line_coordss.append(line_coords) - raw_results_all = self.predictor.predict_raw( - line_images_np, progress_bar=False + def _drop_trailing_spaces(positions): + return list(reversed(_drop_leading_spaces(reversed(positions)))) + + def _drop_double_spaces(positions): + def _drop_double_spaces_generator(positions): + last_was_space = False + for p in positions: + if p.chars[0].char == " ": + if not last_was_space: + yield p + last_was_space = True + else: + yield p + last_was_space = False + + return list(_drop_double_spaces_generator(positions)) + + positions = prediction.positions + positions = _drop_leading_spaces(positions) + positions = _drop_trailing_spaces(positions) + positions = _drop_double_spaces(positions) + positions = list(positions) + + line_text = "".join(_sort_chars(p)[0].char for p in positions) + if line_text != prediction.sentence: + self.logger.warning( + f"Our own line text is not the same as Calamari's:" + f"'{line_text}' != '{prediction.sentence}'" ) - for line, line_coords, raw_results in zip( - textlines, line_coordss, raw_results_all - ): - for i, p in enumerate(raw_results): - p.prediction.id = "fold_{}".format(i) - - prediction = self.voter.vote_prediction_result(raw_results) - prediction.id = "voted" - - # Build line text on our own - # - # Calamari does whitespace post-processing on prediction.sentence, - # while it does not do the same on prediction.positions. Do it on - # our own to have consistency. - # - # XXX Check Calamari's built-in post-processing on - # prediction.sentence - - def _sort_chars(p): - """Filter and sort chars of prediction p""" - chars = p.chars - chars = [ - c for c in chars if c.char - ] # XXX Note that omission probabilities are not normalized?! - chars = [ - c - for c in chars - if c.probability >= self.parameter["glyph_conf_cutoff"] - ] - chars = sorted(chars, key=lambda k: k.probability, reverse=True) - return chars - - def _drop_leading_spaces(positions): - return list( - itertools.dropwhile( - lambda p: _sort_chars(p)[0].char == " ", positions - ) - ) + # Delete existing results + if line.get_TextEquiv(): + self.logger.warning("Line '%s' already contained text results", line.id) + line.set_TextEquiv([]) + if line.get_Word(): + self.logger.warning( + "Line '%s' already contained word segmentation", line.id + ) + line.set_Word([]) - def _drop_trailing_spaces(positions): - return list(reversed(_drop_leading_spaces(reversed(positions)))) + # Save line results + line_conf = prediction.avg_char_probability + line.set_TextEquiv( + [TextEquivType(Unicode=line_text, conf=line_conf)] + ) - def _drop_double_spaces(positions): - def _drop_double_spaces_generator(positions): - last_was_space = False - for p in positions: - if p.chars[0].char == " ": - if not last_was_space: - yield p - last_was_space = True - else: - yield p - last_was_space = False - - return list(_drop_double_spaces_generator(positions)) - - positions = prediction.positions - positions = _drop_leading_spaces(positions) - positions = _drop_trailing_spaces(positions) - positions = _drop_double_spaces(positions) - positions = list(positions) - - line_text = "".join(_sort_chars(p)[0].char for p in positions) - if line_text != prediction.sentence: - log.warning( - f"Our own line text is not the same as Calamari's:" - f"'{line_text}' != '{prediction.sentence}'" + # Save word results + # + # Calamari OCR does not provide word positions, so we infer word + # positions from a. text segmentation and b. the glyph positions. + # This is necessary because the PAGE XML format enforces a strict + # hierarchy of lines > words > glyphs. + + def _words(s): + """Split words based on spaces and include spaces as 'words'""" + spaces = None + word = "" + for c in s: + if c == " " and spaces is True: + word += c + elif c != " " and spaces is False: + word += c + else: + if word: + yield word + word = c + spaces = c == " " + yield word + + if self.parameter["textequiv_level"] in ["word", "glyph"]: + word_no = 0 + i = 0 + + for word_text in _words(line_text): + word_length = len(word_text) + if not all(c == " " for c in word_text): + word_positions = positions[i : i + word_length] + word_start = word_positions[0].global_start + word_end = word_positions[-1].global_end + + polygon = polygon_from_x0y0x1y1( + [word_start, 0, word_end, line_image.height] ) - - # Delete existing results - if line.get_TextEquiv(): - log.warning("Line '%s' already contained text results", line.id) - line.set_TextEquiv([]) - if line.get_Word(): - log.warning( - "Line '%s' already contained word segmentation", line.id + points = points_from_polygon( + coordinates_for_segment(polygon, None, line_coords) ) - line.set_Word([]) + # XXX Crop to line polygon? - # Save line results - line_conf = prediction.avg_char_probability - line.set_TextEquiv( - [TextEquivType(Unicode=line_text, conf=line_conf)] - ) + word = WordType( + id="%s_word%04d" % (line.id, word_no), + Coords=CoordsType(points), + ) + word.add_TextEquiv(TextEquivType(Unicode=word_text)) - # Save word results - # - # Calamari OCR does not provide word positions, so we infer word - # positions from a. text segmentation and b. the glyph positions. - # This is necessary because the PAGE XML format enforces a strict - # hierarchy of lines > words > glyphs. - - def _words(s): - """Split words based on spaces and include spaces as 'words'""" - spaces = None - word = "" - for c in s: - if c == " " and spaces is True: - word += c - elif c != " " and spaces is False: - word += c - else: - if word: - yield word - word = c - spaces = c == " " - yield word - - if self.parameter["textequiv_level"] in ["word", "glyph"]: - word_no = 0 - i = 0 - - for word_text in _words(line_text): - word_length = len(word_text) - if not all(c == " " for c in word_text): - word_positions = positions[i : i + word_length] - word_start = word_positions[0].global_start - word_end = word_positions[-1].global_end + if self.parameter["textequiv_level"] == "glyph": + for glyph_no, p in enumerate(word_positions): + glyph_start = p.global_start + glyph_end = p.global_end polygon = polygon_from_x0y0x1y1( - [word_start, 0, word_end, line_image.height] + [ + glyph_start, + 0, + glyph_end, + line_image.height, + ] ) points = points_from_polygon( - coordinates_for_segment(polygon, None, line_coords) + coordinates_for_segment( + polygon, None, line_coords + ) ) - # XXX Crop to line polygon? - word = WordType( - id="%s_word%04d" % (line.id, word_no), + glyph = GlyphType( + id="%s_glyph%04d" % (word.id, glyph_no), Coords=CoordsType(points), ) - word.add_TextEquiv(TextEquivType(Unicode=word_text)) - - if self.parameter["textequiv_level"] == "glyph": - for glyph_no, p in enumerate(word_positions): - glyph_start = p.global_start - glyph_end = p.global_end - - polygon = polygon_from_x0y0x1y1( - [ - glyph_start, - 0, - glyph_end, - line_image.height, - ] - ) - points = points_from_polygon( - coordinates_for_segment( - polygon, None, line_coords - ) - ) - glyph = GlyphType( - id="%s_glyph%04d" % (word.id, glyph_no), - Coords=CoordsType(points), + # Add predictions (= TextEquivs) + char_index_start = 1 + # Index must start with 1, see + # https://ocr-d.github.io/page#multiple-textequivs + for char_index, char in enumerate( + _sort_chars(p), start=char_index_start + ): + glyph.add_TextEquiv( + TextEquivType( + Unicode=char.char, + index=char_index, + conf=char.probability, ) + ) - # Add predictions (= TextEquivs) - char_index_start = 1 - # Index must start with 1, see - # https://ocr-d.github.io/page#multiple-textequivs - for char_index, char in enumerate( - _sort_chars(p), start=char_index_start - ): - glyph.add_TextEquiv( - TextEquivType( - Unicode=char.char, - index=char_index, - conf=char.probability, - ) - ) - - word.add_Glyph(glyph) - - line.add_Word(word) - word_no += 1 - - i += word_length - - _page_update_higher_textequiv_levels("line", pcgts) - - # Add metadata about this operation and its runtime parameters: - self.add_metadata(pcgts) - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - file_id=file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, file_id + ".xml"), - content=to_xml(pcgts), - ) + word.add_Glyph(glyph) + + line.add_Word(word) + word_no += 1 + + i += word_length + _page_update_higher_textequiv_levels("line", pcgts) + return OcrdPageResult(pcgts) # TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to a # ocrd lib diff --git a/requirements.txt b/requirements.txt index 5eebd46..b637015 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ tensorflow >= 2.5.0, < 2.16 numpy -calamari-ocr == 1.0.*, >= 1.0.6 +calamari-ocr == 1.0.*, >= 1.0.7 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? click -ocrd >= 2.54.0 +ocrd >= 3.0.0b5 diff --git a/test/base.py b/test/base.py deleted file mode 100644 index d2dc025..0000000 --- a/test/base.py +++ /dev/null @@ -1,7 +0,0 @@ -from test.assets import assets - -from ocrd_utils import initLogging - -initLogging() - -__all__ = ["assets"] diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000..2403cc7 --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,64 @@ +from multiprocessing import Process +from time import sleep +import pytest + +from ocrd import Resolver, Workspace, OcrdMetsServer +from ocrd_utils import pushd_popd, disableLogging, initLogging, setOverrideLogLevel, config + +from .assets import assets + +CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache'] + +@pytest.fixture(params=CONFIGS) +def workspace(tmpdir, pytestconfig, request): + def _make_workspace(workspace_path): + initLogging() + if pytestconfig.getoption('verbose') > 0: + setOverrideLogLevel('DEBUG') + with pushd_popd(tmpdir): + directory = str(tmpdir) + resolver = Resolver() + workspace = resolver.workspace_from_url(workspace_path, dst_dir=directory, download=True) + config.OCRD_MISSING_OUTPUT = "ABORT" + if 'metscache' in request.param: + config.OCRD_METS_CACHING = True + print("enabled METS caching") + if 'pageparallel' in request.param: + config.OCRD_MAX_PARALLEL_PAGES = 4 + print("enabled page-parallel processing") + def _start_mets_server(*args, **kwargs): + print("running with METS server") + server = OcrdMetsServer(*args, **kwargs) + server.startup() + process = Process(target=_start_mets_server, + kwargs={'workspace': workspace, 'url': 'mets.sock'}) + process.start() + sleep(1) + workspace = Workspace(resolver, directory, mets_server_url='mets.sock') + yield {'workspace': workspace, 'mets_server_url': 'mets.sock'} + process.terminate() + else: + yield {'workspace': workspace} + config.reset_defaults() + return _make_workspace + + +@pytest.fixture +def workspace_manifesto(workspace): + yield from workspace(assets.path_to('communist_manifesto/data/mets.xml')) + +@pytest.fixture +def workspace_aufklaerung(workspace): + yield from workspace(assets.path_to('kant_aufklaerung_1784/data/mets.xml')) + +@pytest.fixture +def workspace_aufklaerung_binarized(workspace): + yield from workspace(assets.path_to('kant_aufklaerung_1784-binarized/data/mets.xml')) + +@pytest.fixture +def workspace_aufklaerung_glyph(workspace): + yield from workspace(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')) + +@pytest.fixture +def workspace_sbb(workspace): + yield from workspace(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')) diff --git a/test/test_recognize.py b/test/test_recognize.py index f4e3587..3419214 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -2,197 +2,143 @@ import os import shutil import subprocess -import tempfile -import pytest from lxml import etree -from ocrd.resolver import Resolver +from ocrd import run_processor +from ocrd_utils import MIMETYPE_PAGE as PAGE +from ocrd_models.constants import NAMESPACES as NS +from ocrd_modelfactory import page_from_file from ocrd_calamari import CalamariRecognize -from .base import assets - -METS_KANT = assets.url_of( - "kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml" -) -WORKSPACE_DIR = tempfile.mkdtemp(prefix="test-ocrd-calamari-") CHECKPOINT_DIR = os.getenv("MODEL", "qurator-gt4histocr-1.0") DEBUG = os.getenv("DEBUG", False) -def page_namespace(tree): - """Return the PAGE content namespace used in the given ElementTree. - - This relies on the assumption that, in any given PAGE content file, the root element - has the local name "PcGts". We do not check if the files uses any valid PAGE - namespace. - """ - root_name = etree.QName(tree.getroot().tag) - if root_name.localname == "PcGts": - return root_name.namespace - else: - raise ValueError("Not a PAGE tree") - - -def assertFileContains(fn, text): +def assertFileContains(fn, text, msg=""): """Assert that the given file contains a given string.""" with open(fn, "r", encoding="utf-8") as f: - assert text in f.read() + assert text in f.read(), msg -def assertFileDoesNotContain(fn, text): +def assertFileDoesNotContain(fn, text, msg=""): """Assert that the given file does not contain given string.""" with open(fn, "r", encoding="utf-8") as f: - assert text not in f.read() - - -@pytest.fixture -def workspace(): - if os.path.exists(WORKSPACE_DIR): - shutil.rmtree(WORKSPACE_DIR) - os.makedirs(WORKSPACE_DIR) - - resolver = Resolver() - # due to core#809 this does not always work: - # workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR) - # workaround: - shutil.rmtree(WORKSPACE_DIR) - shutil.copytree(os.path.dirname(METS_KANT), WORKSPACE_DIR) - workspace = resolver.workspace_from_url(os.path.join(WORKSPACE_DIR, "mets.xml")) - - # The binarization options I have are: - # - # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my - # machine (protobuf) - # b. ocrd_olena which 1. I cannot fully install via pip and 2. whose dependency - # olena doesn't compile on my machine - # c. just fumble with the original files - # - # So I'm going for option c. - for imgf in workspace.mets.find_files(fileGrp="OCR-D-IMG"): - imgf = workspace.download_file(imgf) - path = os.path.join(workspace.directory, imgf.local_filename) - subprocess.call(["mogrify", "-threshold", "50%", path]) - - # Remove GT Words and TextEquivs, to not accidently check GT text instead of the - # OCR text - # XXX Review data again - for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-WORD-GLYPH"): - workspace.download_file(of) - path = os.path.join(workspace.directory, of.local_filename) - tree = etree.parse(path) - nsmap_gt = {"pc": page_namespace(tree)} - for to_remove in ["//pc:Word", "//pc:TextEquiv"]: - for e in tree.xpath(to_remove, namespaces=nsmap_gt): - e.getparent().remove(e) - tree.write(path, xml_declaration=True, encoding="utf-8") - assertFileDoesNotContain(path, "TextEquiv") - - yield workspace - - if not DEBUG: - shutil.rmtree(WORKSPACE_DIR) - - -def test_recognize(workspace): - CalamariRecognize( - workspace, - input_file_grp="OCR-D-GT-SEG-WORD-GLYPH", + assert text not in f.read(), msg + + + +def test_recognize(workspace_aufklaerung_binarized, caplog): + caplog.set_level(logging.WARNING) + ws = workspace_aufklaerung_binarized['workspace'] + page1 = ws.mets.physical_pages[0] + file1 = list(ws.find_files(file_grp="OCR-D-GT-WORD", page_id=page1, mimetype=PAGE))[0] + text1 = page_from_file(file1).etree.xpath( + '//page:TextLine/page:TextEquiv[1]/page:Unicode/text()', namespaces=NS) + assert len(text1) > 10 + assert "verſchuldeten" in "\n".join(text1) + run_processor( + CalamariRecognize, + input_file_grp="OCR-D-GT-WORD", output_file_grp="OCR-D-OCR-CALAMARI", parameter={ "checkpoint_dir": CHECKPOINT_DIR, }, - ).process() - workspace.save_mets() - - page1 = os.path.join( - workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_phys_0001.xml" + **workspace_aufklaerung_binarized, ) - assert os.path.exists(page1) - assertFileContains(page1, "verſchuldeten") + overwrite_text_log_messages = [t[2] for t in caplog.record_tuples + if "already contained text results" in t[2]] + assert len(overwrite_text_log_messages) > 10 # For every line! + overwrite_word_log_messages = [t[2] for t in caplog.record_tuples + if "already contained word segmentation" in t[2]] + assert len(overwrite_word_log_messages) > 10 # For every line! + ws.save_mets() + file1 = next(ws.find_files(file_grp="OCR-D-OCR-CALAMARI", page_id=page1, mimetype=PAGE), False) + assert file1, "result for first page not referenced in METS" + assert os.path.exists(file1.local_filename), "result for first page not found in filesystem" + text1_out = page_from_file(file1).etree.xpath( + '//page:TextLine/page:TextEquiv[1]/page:Unicode/text()', namespaces=NS) + assert len(text1_out) == len(text1), "not all lines have been recognized" + assert "verſchuldeten" in "\n".join(text1_out), "result for first page is inaccurate" + assert "\n".join(text1_out) != "\n".join(text1), "result is suspiciously identical to GT" def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model( - workspace, caplog + workspace_aufklaerung, caplog ): caplog.set_level(logging.WARNING) - CalamariRecognize( - workspace, - input_file_grp="OCR-D-GT-SEG-WORD-GLYPH", - output_file_grp="OCR-D-OCR-CALAMARI-BROKEN", + run_processor( + CalamariRecognize, + input_file_grp="OCR-D-GT-PAGE", + output_file_grp="OCR-D-OCR-CALAMARI", parameter={"checkpoint_dir": CHECKPOINT_DIR}, - ).process() - - interesting_log_messages = [ - t[2] for t in caplog.record_tuples if "Using raw image" in t[2] - ] + **workspace_aufklaerung, + ) + interesting_log_messages = [t[2] for t in caplog.record_tuples + if "Using raw image" in t[2]] assert len(interesting_log_messages) > 10 # For every line! -def test_word_segmentation(workspace): - CalamariRecognize( - workspace, - input_file_grp="OCR-D-GT-SEG-WORD-GLYPH", +def test_word_segmentation(workspace_aufklaerung_binarized): + run_processor( + CalamariRecognize, + input_file_grp="OCR-D-GT-WORD", output_file_grp="OCR-D-OCR-CALAMARI", parameter={ "checkpoint_dir": CHECKPOINT_DIR, - "textequiv_level": "word", # Note that we're going down to word level here + "textequiv_level": "word", }, - ).process() - workspace.save_mets() - - page1 = os.path.join( - workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_phys_0001.xml" + **workspace_aufklaerung_binarized ) - assert os.path.exists(page1) - tree = etree.parse(page1) - nsmap = {"pc": page_namespace(tree)} - - # The result should contain a TextLine that contains the text "December" - line = tree.xpath( - ".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", - namespaces=nsmap, - )[0] - assert line is not None - + ws = workspace_aufklaerung_binarized['workspace'] + ws.save_mets() + page1 = ws.mets.physical_pages[0] + file1 = next(ws.find_files(file_grp="OCR-D-OCR-CALAMARI", page_id=page1, mimetype=PAGE), False) + assert file1, "result for first page not referenced in METS" + assert os.path.exists(file1.local_filename), "result for first page not found in filesystem" + tree1 = page_from_file(file1).etree + # The result should contain a TextLine that contains the text "Berliniſche" + line = tree1.xpath( + "//page:TextLine[page:TextEquiv/page:Unicode[contains(text(),'Berliniſche')]]", + namespaces=NS, + ) + assert len(line) == 1, "result is inaccurate" + line = line[0] # The textline should # a. contain multiple words and # b. these should concatenate fine to produce the same line text - words = line.xpath(".//pc:Word", namespaces=nsmap) - assert len(words) >= 2 + words = line.xpath(".//page:Word", namespaces=NS) + assert len(words) >= 2, "result does not contain words" words_text = " ".join( - word.xpath("pc:TextEquiv/pc:Unicode", namespaces=nsmap)[0].text + word.xpath("page:TextEquiv[1]/page:Unicode/text()", namespaces=NS)[0] for word in words ) - line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=nsmap)[0].text - assert words_text == line_text - + line_text = line.xpath("page:TextEquiv[1]/page:Unicode/text()", namespaces=NS)[0] + assert words_text == line_text, "word-level text result does not concatenate to line-level text result" # For extra measure, check that we're not seeing any glyphs, as we asked for # textequiv_level == "word" - glyphs = tree.xpath("//pc:Glyph", namespaces=nsmap) - assert len(glyphs) == 0 + glyphs = tree1.xpath("//page:Glyph", namespaces=NS) + assert len(glyphs) == 0, "result must not contain glyph-level segments" -def test_glyphs(workspace): - CalamariRecognize( - workspace, - input_file_grp="OCR-D-GT-SEG-WORD-GLYPH", +def test_glyphs(workspace_aufklaerung_binarized): + run_processor( + CalamariRecognize, + input_file_grp="OCR-D-GT-WORD", output_file_grp="OCR-D-OCR-CALAMARI", parameter={ "checkpoint_dir": CHECKPOINT_DIR, - # Note that we're going down to glyph level here "textequiv_level": "glyph", }, - ).process() - workspace.save_mets() - - page1 = os.path.join( - workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_phys_0001.xml" + **workspace_aufklaerung_binarized, ) - assert os.path.exists(page1) - tree = etree.parse(page1) - nsmap = {"pc": page_namespace(tree)} - + ws = workspace_aufklaerung_binarized['workspace'] + ws.save_mets() + page1 = ws.mets.physical_pages[0] + file1 = next(ws.find_files(file_grp="OCR-D-OCR-CALAMARI", page_id=page1, mimetype=PAGE), False) + assert file1, "result for first page not referenced in METS" + assert os.path.exists(file1.local_filename), "result for first page not found in filesystem" + tree1 = page_from_file(file1).etree # The result should contain a lot of glyphs - glyphs = tree.xpath("//pc:Glyph", namespaces=nsmap) - assert len(glyphs) >= 100 + glyphs = tree1.xpath("//page:Glyph", namespaces=NS) + assert len(glyphs) >= 100, "result must contain lots of glyphs"