diff --git a/.circleci/config.yml b/.circleci/config.yml
index a97d20b..b90ef37 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7,15 +7,23 @@ jobs:
   build-python36:
     docker:
       - image: ubuntu:18.04
+    environment:
+      - PYTHONIOENCODING: utf-8
     steps:
-      - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick
+      - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick locales
+      - run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
       - checkout
+      - run: pip3 install --upgrade pip
      - run: make install PIP_INSTALL="pip3 install"
       - run: pip3 install -r requirements-test.txt
-      - run: make coverage
+      - run: make coverage LC_ALL=en_US.utf8
       - codecov/upload

 workflows:
   build:
     jobs:
-      - build-python36
+      - build-python36:
+          filters:
+            branches:
+              ignore:
+                - screenshots
diff --git a/.gitignore b/.gitignore
index 42c4957..0bea6c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -107,5 +107,7 @@ venv.bak/
 /calamari
 /calamari_models
 /gt4histocr-calamari
+/actevedef_718448162*
 /repo
 /test/assets
+gt4histocr-calamari*
diff --git a/Dockerfile b/Dockerfile
index 6bd7f73..6d63150 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM ocrd/core:edge
+FROM ocrd/core
 MAINTAINER OCR-D
 ENV DEBIAN_FRONTEND noninteractive
 ENV PYTHONIOENCODING utf8
@@ -10,10 +10,12 @@
 COPY Makefile .
 COPY setup.py .
 COPY ocrd-tool.json .
 COPY requirements.txt .
+COPY README.md .
 COPY ocrd_calamari ocrd_calamari

-RUN make calamari/build
-RUN pip3 install .
+RUN pip3 install --upgrade pip && \
+    pip3 install . && \
+    pip3 check

 ENTRYPOINT ["/usr/local/bin/ocrd-calamari-recognize"]
diff --git a/LICENSE b/LICENSE
index 261eeb9..bc7973a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2018-2020 Konstantin Baierer, Mike Gerber

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/Makefile b/Makefile
index 0508505..00a8f69 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,4 @@
+export # export variables to subshells
 PIP_INSTALL = pip3 install
 GIT_CLONE = git clone
 PYTHON = python3
@@ -10,10 +11,8 @@ help:
	@echo "  Targets"
	@echo ""
	@echo "    install              Install ocrd_calamari"
-	@echo "    calamari             Clone calamari repo"
-	@echo "    calamari_models      Clone calamari_models repo"
-	@echo "    gt4histocr-calamari  Get GT4HistOCR Calamari model (from SBB)"
-	@echo "    calamari/build       pip install calamari"
+	@echo "    gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
+	@echo "    actevedef_718448162  Download example data"
	@echo "    deps-test            Install testing python deps via pip"
	@echo "    repo/assets          Clone OCR-D/assets to ./repo/assets"
	@echo "    test/assets          Setup test assets"
@@ -33,29 +32,21 @@ help:
 install:
	$(PIP_INSTALL) .
-# Clone calamari repo
-calamari:
-	$(GIT_CLONE) https://github.com/chwick/calamari
-# Clone calamari_models repo
-calamari_models:
-	$(GIT_CLONE) -n https://github.com/chwick/calamari_models
-	# Checkout latest version that works with calamari-ocr==0.3.5:
-	cd calamari_models && git checkout f76b1d3ec
-
-gt4histocr-calamari:
-	mkdir gt4histocr-calamari
-	cd gt4histocr-calamari && \
-	wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz && \
+# Get GT4HistOCR Calamari model (from SBB)
+gt4histocr-calamari1:
+	mkdir -p gt4histocr-calamari1
+	cd gt4histocr-calamari1 && \
+	wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
	tar xfv model.tar.xz && \
	rm model.tar.xz

+# Download example data
+actevedef_718448162:
+	wget https://qurator-data.de/examples/actevedef_718448162.zip && \
+	unzip actevedef_718448162.zip

-# pip install calamari
-calamari/build: calamari calamari_models
-	cd calamari && $(PIP_INSTALL) .
-
 #
 # Assets and Tests
@@ -82,12 +73,12 @@ assets-clean:
	rm -rf test/assets

 # Run unit tests
-test: test/assets gt4histocr-calamari
+test: test/assets gt4histocr-calamari1
	# declare -p HTTP_PROXY
	$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)

 # Run unit tests and determine test coverage
-coverage: test/assets calamari_models
+coverage: test/assets gt4histocr-calamari1
	coverage erase
	make test PYTHON="coverage run"
	coverage report
diff --git a/README-DEV.md b/README-DEV.md
index da2025a..40a237a 100644
--- a/README-DEV.md
+++ b/README-DEV.md
@@ -4,17 +4,20 @@ In a Python 3 virtualenv:

 ~~~
 pip install -e .
+pip install -r requirements-test.txt
 make test
 ~~~

-Release
--------
-* Update ocrd-tool.json version
-* Update setup.py version
-* git commit -m 'v<version>'
-* git tag -m 'v<version>' 'v<version>'
-* git push --tags
+Releasing
+---------
+* Update `ocrd-tool.json` version
+* Update `setup.py` version
+* `git commit -m 'v<version>'`
+* `git tag -m 'v<version>' 'v<version>'`
+* `git push --tags`
+* Do a release on GitHub

-PyPI:
-* python sdist bdist_wheel
-* twine upload dist/ocrd_calamari-*
+### Uploading to PyPI
+* `rm -rf dist/` or backup if `dist/` exists already
+* In the virtualenv: `python setup.py sdist bdist_wheel`
+* `twine upload dist/ocrd_calamari-*`
diff --git a/README.md b/README.md
index 4d7dc96..d277479 100644
--- a/README.md
+++ b/README.md
@@ -8,11 +8,22 @@
 ## Introduction

-This offers a OCR-D compliant workspace processor for some of the functionality of Calamari OCR.
+**ocrd_calamari** offers an [OCR-D](https://ocr-d.de) compliant workspace processor for the functionality of Calamari OCR.

 It uses OCR-D workspaces (METS) with [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) documents as input and output.

 This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized image) as its input.

+In addition to the line text it may also output word and glyph segmentation,
+including per-glyph confidence values and per-glyph alternative predictions as
+provided by the Calamari OCR engine, using a `textequiv_level` of `word` or
+`glyph`. Note that Calamari itself does not provide word segmentation; this
+processor infers the word segmentation from the text segmentation and the
+glyph positions. The provided glyph and word segmentation can be used for text
+extraction and highlighting, but is probably not useful for further
+image-based processing.
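+
+Roughly, the inference works like the following sketch (illustrative only, not
+the actual code in `ocrd_calamari/recognize.py`; the sketch assumes one
+`(x_start, x_end)` pair per predicted character):
+
+```python
+def infer_words(line_text, glyph_positions, line_height):
+    """Split line_text on spaces and box each word by its outermost glyphs."""
+    words, start = [], None
+    for i, c in enumerate(line_text + ' '):
+        if c != ' ' and start is None:
+            start = i                       # a new word begins here
+        elif c == ' ' and start is not None:
+            x0 = glyph_positions[start][0]  # left edge of the word's first glyph
+            x1 = glyph_positions[i - 1][1]  # right edge of the word's last glyph
+            words.append((line_text[start:i], (x0, 0, x1, line_height)))
+            start = None
+    return words
+
+print(infer_words("ab cd", [(0, 5), (5, 10), (10, 12), (12, 17), (17, 22)], 30))
+# [('ab', (0, 0, 10, 30)), ('cd', (12, 0, 22, 30))]
+```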
+
+![Example output as viewed in PAGE Viewer](https://github.com/OCR-D/ocrd_calamari/raw/screenshots/output-in-page-viewer.jpg)
+
 ## Installation

 ### From PyPI
@@ -29,32 +40,44 @@ pip install .

 ## Install models

-Download standard models:
+Download models trained on GT4HistOCR data:

 ```
-wget https://github.com/Calamari-OCR/calamari_models/archive/master.zip
-unzip master.zip
+make gt4histocr-calamari1
+ls gt4histocr-calamari1
 ```

-Download models trained on GT4HistOCR data:
+Manual download: [model.tar.xz](https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz)

+## Example Usage
+Before using `ocrd-calamari-recognize`, get some example data and a model, and
+prepare the document for OCR:
 ```
-make gt4histocr-calamari
-ls gt4histocr-calamari
+# Download model and example data
+make gt4histocr-calamari1
+make actevedef_718448162
+
+# Create binarized images and line segmentation using other OCR-D projects
+cd actevedef_718448162
+ocrd-olena-binarize -P impl sauvola-ms-split -I OCR-D-IMG -O OCR-D-IMG-BIN
+ocrd-tesserocr-segment-region -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION
+ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
 ```

-## Example Usage
+Finally, recognize the text using ocrd_calamari and the downloaded model:
+```
+ocrd-calamari-recognize -P checkpoint "../gt4histocr-calamari1/*.ckpt.json" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
+```
+
+or
+
+```
+ocrd-calamari-recognize -P checkpoint_dir "../gt4histocr-calamari1" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
+```

-~~~
-ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
-~~~
-With `test-parameters.json`:
-~~~
-{
-  "checkpoint": "/path/to/some/trained/models/*.ckpt.json"
-}
-~~~
+You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
+for additional parameters and default values.

 ## Development & Testing
 For information regarding development and testing, please see
diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json
index 54d2206..d4f83fa 100644
--- a/ocrd_calamari/ocrd-tool.json
+++ b/ocrd_calamari/ocrd-tool.json
@@ -1,6 +1,6 @@
 {
-  "git_url": "https://github.com/kba/ocrd_calamari",
-  "version": "0.0.3",
+  "git_url": "https://github.com/OCR-D/ocrd_calamari",
+  "version": "1.0.1",
   "tools": {
     "ocrd-calamari-recognize": {
       "executable": "ocrd-calamari-recognize",
@@ -18,6 +18,10 @@
         "OCR-D-OCR-CALAMARI"
       ],
      "parameters": {
+        "checkpoint_dir": {
+          "description": "The directory containing calamari model files (*.ckpt.json).
Uses all checkpoints in that directory", + "type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0" + }, "checkpoint": { "description": "The calamari model files (*.ckpt.json)", "type": "string", "format": "file", "cacheable": true @@ -25,6 +29,18 @@ "voter": { "description": "The voting algorithm to use", "type": "string", "default": "confidence_voter_default_ctc" + }, + "textequiv_level": { + "type": "string", + "enum": ["line", "word", "glyph"], + "default": "line", + "description": "Deepest PAGE XML hierarchy level to include TextEquiv results for" + }, + "glyph_conf_cutoff": { + "type": "number", + "format": "float", + "default": 0.001, + "description": "Only include glyph alternatives with confidences above this threshold" } } } diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 31a37e1..5c6807e 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -1,33 +1,50 @@ from __future__ import absolute_import import os +import itertools from glob import glob import numpy as np +from calamari_ocr import __version__ as calamari_version from calamari_ocr.ocr import MultiPredictor from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.proto import VoterParams from ocrd import Processor from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import to_xml -from ocrd_models.ocrd_page_generateds import TextEquivType -from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE +from ocrd_models.ocrd_page import ( + LabelType, LabelsType, + MetadataItemType, + TextEquivType, + WordType, GlyphType, CoordsType, + to_xml +) +from ocrd_utils import ( + getLogger, concat_padded, + coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1, + make_file_id, assert_file_grp_cardinality, + MIMETYPE_PAGE +) from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL -log = getLogger('processor.CalamariRecognize') +os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL +from tensorflow import __version__ as tensorflow_version + +TOOL = 'ocrd-calamari-recognize' class CalamariRecognize(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize'] - kwargs['version'] = OCRD_TOOL['version'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] + kwargs['version'] = '%s (calamari %s, tensorflow %s)' % (OCRD_TOOL['version'], calamari_version, tensorflow_version) super(CalamariRecognize, self).__init__(*args, **kwargs) def _init_calamari(self): - os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL + if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None): + resolved = self.resolve_resource(self.parameter['checkpoint_dir']) + self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved checkpoints = glob(self.parameter['checkpoint']) self.predictor = MultiPredictor(checkpoints=checkpoints) @@ -43,16 +60,14 @@ class CalamariRecognize(Processor): voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper()) self.voter = voter_from_proto(voter_params) - def _make_file_id(self, input_file, n): - file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) - if file_id == input_file.ID: - file_id = concat_padded(self.output_file_grp, n) - return file_id - def process(self): """ Performs the recognition. 
""" + log = getLogger('processor.CalamariRecognize') + + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) self._init_calamari() @@ -71,44 +86,169 @@ class CalamariRecognize(Processor): textlines = region.get_TextLine() log.info("About to recognize %i lines of region '%s'", len(textlines), region.id) + line_images_np = [] for line in textlines: log.debug("Recognizing line '%s' in region '%s'", line.id, region.id) - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords, feature_selector=self.features) - if ('binarized' not in line_coords['features'] and - 'grayscale_normalized' not in line_coords['features'] and - self.input_channels == 1): + line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_coords, feature_selector=self.features) + if ('binarized' not in line_coords['features'] and 'grayscale_normalized' not in line_coords['features'] and self.input_channels == 1): # We cannot use a feature selector for this since we don't # know whether the model expects (has been trained on) # binarized or grayscale images; but raw images are likely # always inadequate: - log.warning("Using raw image for line '%s' in region '%s'", - line.id, region.id) - + log.warning("Using raw image for line '%s' in region '%s'", line.id, region.id) + + line_image = line_image if all(line_image.size) else [[0]] line_image_np = np.array(line_image, dtype=np.uint8) + line_images_np.append(line_image_np) + raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False) + + for line, raw_results in zip(textlines, raw_results_all): - raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0] for i, p in enumerate(raw_results): p.prediction.id = "fold_{}".format(i) prediction = self.voter.vote_prediction_result(raw_results) prediction.id = "voted" - line_text = prediction.sentence - line_conf = prediction.avg_char_probability + # Build line text on our own + # + # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same + # on prediction.positions. Do it on our own to have consistency. + # + # XXX Check Calamari's built-in post-processing on prediction.sentence + + + def _sort_chars(p): + """Filter and sort chars of prediction p""" + chars = p.chars + chars = [c for c in chars if c.char] # XXX Note that omission probabilities are not normalized?! 
+ chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']] + chars = sorted(chars, key=lambda k: k.probability, reverse=True) + return chars + def _drop_leading_spaces(positions): + return list(itertools.dropwhile(lambda p: _sort_chars(p)[0].char == " ", positions)) + def _drop_trailing_spaces(positions): + return list(reversed(_drop_leading_spaces(reversed(positions)))) + def _drop_double_spaces(positions): + def _drop_double_spaces_generator(positions): + last_was_space = False + for p in positions: + if p.chars[0].char == " ": + if not last_was_space: + yield p + last_was_space = True + else: + yield p + last_was_space = False + return list(_drop_double_spaces_generator(positions)) + positions = prediction.positions + positions = _drop_leading_spaces(positions) + positions = _drop_trailing_spaces(positions) + positions = _drop_double_spaces(positions) + positions = list(positions) + + line_text = ''.join(_sort_chars(p)[0].char for p in positions) + if line_text != prediction.sentence: + log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'", + line_text, prediction.sentence) + # Delete existing results if line.get_TextEquiv(): log.warning("Line '%s' already contained text results", line.id) - line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) - + line.set_TextEquiv([]) if line.get_Word(): log.warning("Line '%s' already contained word segmentation", line.id) line.set_Word([]) + # Save line results + line_conf = prediction.avg_char_probability + line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) + + + # Save word results + # + # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation + # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict + # hierarchy of lines > words > glyphs. + + def _words(s): + """Split words based on spaces and include spaces as 'words'""" + spaces = None + word = '' + for c in s: + if c == ' ' and spaces is True: + word += c + elif c != ' ' and spaces is False: + word += c + else: + if word: + yield word + word = c + spaces = (c == ' ') + yield word + + if self.parameter['textequiv_level'] in ['word', 'glyph']: + word_no = 0 + i = 0 + + + + for word_text in _words(line_text): + word_length = len(word_text) + if not all(c == ' ' for c in word_text): + word_positions = positions[i:i+word_length] + word_start = word_positions[0].global_start + word_end = word_positions[-1].global_end + + polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height]) + points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) + # XXX Crop to line polygon? 
+ + word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points)) + word.add_TextEquiv(TextEquivType(Unicode=word_text)) + + if self.parameter['textequiv_level'] == 'glyph': + for glyph_no, p in enumerate(word_positions): + glyph_start = p.global_start + glyph_end = p.global_end + + polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height]) + points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) + + glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points)) + + # Add predictions (= TextEquivs) + char_index_start = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs + for char_index, char in enumerate(_sort_chars(p), start=char_index_start): + glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability)) + + word.add_Glyph(glyph) + + line.add_Word(word) + word_no += 1 + + i += word_length + + _page_update_higher_textequiv_levels('line', pcgts) - file_id = self._make_file_id(input_file, n) + + # Add metadata about this operation and its runtime parameters: + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, value=self.parameter[name]) + for name in self.parameter.keys()])])) + + + file_id = make_file_id(input_file, self.output_file_grp) + pcgts.set_pcGtsId(file_id) self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp, @@ -151,3 +291,5 @@ def _page_update_higher_textequiv_levels(level, pcgts): else u'' for line in lines) region.set_TextEquiv( [TextEquivType(Unicode=region_unicode)]) # remove old + +# vim:tw=120: diff --git a/requirements.txt b/requirements.txt index 17de3dc..cbfb800 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -numpy -tensorflow-gpu == 1.14.0 -calamari-ocr == 0.3.5 +h5py < 3 # XXX tensorflow 2.4.0rc3 requires h5py~=2.10.0, but you'll have h5py 3.1.0 which is incompatible. +tensorflow >= 2.3.0rc2 +calamari-ocr == 1.0.* setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? 
click -ocrd >= 1.0.0b11 +ocrd >= 2.22.0 diff --git a/setup.py b/setup.py index 323d68a..2a98d62 100644 --- a/setup.py +++ b/setup.py @@ -5,15 +5,15 @@ from setuptools import setup, find_packages setup( name='ocrd_calamari', - version='0.0.3', + version='1.0.1', description='Calamari bindings', long_description=Path('README.md').read_text(), long_description_content_type='text/markdown', author='Konstantin Baierer, Mike Gerber', author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de', - url='https://github.com/kba/ocrd_calamari', + url='https://github.com/OCR-D/ocrd_calamari', license='Apache License 2.0', - packages=find_packages(exclude=('tests', 'docs')), + packages=find_packages(exclude=('test', 'docs')), install_requires=Path('requirements.txt').read_text().split('\n'), package_data={ '': ['*.json', '*.yml', '*.yaml'], diff --git a/test/test_recognize.py b/test/test_recognize.py index f97ef91..b3e8540 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -2,6 +2,8 @@ import os import shutil import subprocess import urllib.request +from lxml import etree +from glob import glob import pytest import logging @@ -10,9 +12,14 @@ from ocrd.resolver import Resolver from ocrd_calamari import CalamariRecognize from .base import assets -METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml') -CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json') +METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml') WORKSPACE_DIR = '/tmp/test-ocrd-calamari' +CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1') +CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json') + +# Because XML namespace versions are so much fun, we not only use one, we use TWO! +NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } +NSMAP_GT = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" } @pytest.fixture @@ -32,10 +39,6 @@ def workspace(): "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f, os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)) - return workspace - - -def test_recognize(workspace): # The binarization options I have are: # # a. 
ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf) @@ -48,17 +51,49 @@ def test_recognize(workspace): ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f) subprocess.call(['convert', ff, '-threshold', '50%', ff]) - # XXX Should remove GT text to really test this + # Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text + # XXX Review data again + # XXX Make this more robust against namespace version changes + for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"): + workspace.download_file(of) + for to_remove in ["//pc:Word", "//pc:TextEquiv"]: + for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")): + tree = etree.parse(ff) + for e in tree.xpath(to_remove, namespaces=NSMAP_GT): + e.getparent().remove(e) + tree.write(ff, xml_declaration=True, encoding="utf-8") + + return workspace + +def test_recognize(workspace): CalamariRecognize( workspace, input_file_grp="OCR-D-GT-SEG-LINE", output_file_grp="OCR-D-OCR-CALAMARI", - parameter={'checkpoint': CHECKPOINT} + parameter={ + "checkpoint": CHECKPOINT, + } ).process() workspace.save_mets() - page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml') + page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml") + assert os.path.exists(page1) + with open(page1, "r", encoding="utf-8") as f: + assert "verſchuldeten" in f.read() + +def test_recognize_with_checkpoint_dir(workspace): + CalamariRecognize( + workspace, + input_file_grp="OCR-D-GT-SEG-LINE", + output_file_grp="OCR-D-OCR-CALAMARI", + parameter={ + "checkpoint_dir": CHECKPOINT_DIR, + } + ).process() + workspace.save_mets() + + page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml") assert os.path.exists(page1) with open(page1, 'r', encoding='utf-8') as f: assert 'verſchuldeten' in f.read() @@ -75,3 +110,61 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]] assert len(interesting_log_messages) > 10 # For every line! + with open(page1, "r", encoding="utf-8") as f: + assert "verſchuldeten" in f.read() + + +def test_word_segmentation(workspace): + CalamariRecognize( + workspace, + input_file_grp="OCR-D-GT-SEG-LINE", + output_file_grp="OCR-D-OCR-CALAMARI", + parameter={ + "checkpoint": CHECKPOINT, + "textequiv_level": "word", # Note that we're going down to word level here + } + ).process() + workspace.save_mets() + + page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml") + assert os.path.exists(page1) + tree = etree.parse(page1) + + # The result should contain a TextLine that contains the text "December" + line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0] + assert line + + # The textline should a. contain multiple words and b. 
these should concatenate fine to produce the same line text + words = line.xpath(".//pc:Word", namespaces=NSMAP) + assert len(words) >= 2 + words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words) + line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text + assert words_text == line_text + + # For extra measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word" + glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP) + assert len(glyphs) == 0 + + +def test_glyphs(workspace): + CalamariRecognize( + workspace, + input_file_grp="OCR-D-GT-SEG-LINE", + output_file_grp="OCR-D-OCR-CALAMARI", + parameter={ + "checkpoint": CHECKPOINT, + "textequiv_level": "glyph", # Note that we're going down to glyph level here + } + ).process() + workspace.save_mets() + + page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml") + assert os.path.exists(page1) + tree = etree.parse(page1) + + # The result should contain a lot of glyphs + glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP) + assert len(glyphs) >= 100 + + +# vim:tw=120: