Merge branch 'master' into image-features

3 years ago · c0902cdef5
parent 377466a71a 1bb72cbaf1
commit c0902cdef5
12 changed files with 382 additions and 102 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -7,15 +7,23 @@ jobs:
  build-python36:
    docker:
      - image: ubuntu:18.04
+    environment:
+      - PYTHONIOENCODING: utf-8
    steps:
-      - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick
+      - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick locales
+      - run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
      - checkout
+      - run: pip3 install --upgrade pip
      - run: make install PIP_INSTALL="pip3 install"
      - run: pip3 install -r requirements-test.txt
-      - run: make coverage
+      - run: make coverage LC_ALL=en_US.utf8
      - codecov/upload

 workflows:
  build:
    jobs:
-      - build-python36
+      - build-python36:
+          filters:
+            branches:
+              ignore:
+                - screenshots
--- a/.gitignore
+++ b/.gitignore
@ -107,5 +107,7 @@ venv.bak/
 /calamari
 /calamari_models
 /gt4histocr-calamari
+/actevedef_718448162*
 /repo
 /test/assets
+gt4histocr-calamari*
--- a/8
+++ b/8
@ -1,4 +1,4 @@
-FROM ocrd/core:edge
+FROM ocrd/core
 MAINTAINER OCR-D
 ENV DEBIAN_FRONTEND noninteractive
 ENV PYTHONIOENCODING utf8
@ -10,10 +10,12 @@ COPY Makefile .
 COPY setup.py .
 COPY ocrd-tool.json .
 COPY requirements.txt .
+COPY README.md .
 COPY ocrd_calamari ocrd_calamari

-RUN make calamari/build
-RUN pip3 install .
+RUN pip3 install --upgrade pip && \
+    pip3 install . && \
+    pip3 check

 ENTRYPOINT ["/usr/local/bin/ocrd-calamari-recognize"]

--- a/2
+++ b/2
@ -186,7 +186,7 @@
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2018-2020 Konstantin Baierer, Mike Gerber

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
--- a/37
+++ b/37
@ -1,3 +1,4 @@
+export  # export variables to subshells
 PIP_INSTALL = pip3 install
 GIT_CLONE = git clone
 PYTHON = python3
@ -10,10 +11,8 @@ help:
 	@echo "  Targets"
 	@echo ""
 	@echo "    install          Install ocrd_calamari"
-	@echo "    calamari         Clone calamari repo"
-	@echo "    calamari_models  Clone calamari_models repo"
-	@echo "    gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)"
-	@echo "    calamari/build   pip install calamari"
+	@echo "    gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
+	@echo "    actevedef_718448162 Download example data"
 	@echo "    deps-test        Install testing python deps via pip"
 	@echo "    repo/assets      Clone OCR-D/assets to ./repo/assets"
 	@echo "    test/assets      Setup test assets"
@ -33,29 +32,21 @@ help:
 install:
 	$(PIP_INSTALL) .

-# Clone calamari repo
-calamari:
-	$(GIT_CLONE) https://github.com/chwick/calamari

-# Clone calamari_models repo
-calamari_models:
-	$(GIT_CLONE) -n https://github.com/chwick/calamari_models
-	# Checkout latest version that works with calamari-ocr==0.3.5:
-	cd calamari_models && git checkout f76b1d3ec
-
-gt4histocr-calamari:
-	mkdir gt4histocr-calamari
-	cd gt4histocr-calamari && \
-	wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz && \
+# Get GT4HistOCR Calamari model (from SBB)
+gt4histocr-calamari1:
+	mkdir -p gt4histocr-calamari1
+	cd gt4histocr-calamari1 && \
+	wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
 	tar xfv model.tar.xz && \
 	rm model.tar.xz

+# Download example data
+actevedef_718448162:
+	wget https://qurator-data.de/examples/actevedef_718448162.zip && \
+	unzip actevedef_718448162.zip


-# pip install calamari
-calamari/build: calamari calamari_models
-	cd calamari && $(PIP_INSTALL) .
-

 #
 # Assets and Tests
@ -82,12 +73,12 @@ assets-clean:
 	rm -rf test/assets

 # Run unit tests
-test: test/assets gt4histocr-calamari
+test: test/assets gt4histocr-calamari1
 	# declare -p HTTP_PROXY
 	$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)

 # Run unit tests and determine test coverage
-coverage: test/assets calamari_models
+coverage: test/assets gt4histocr-calamari1
 	coverage erase
 	make test PYTHON="coverage run"
 	coverage report
--- a/README-DEV.md
+++ b/README-DEV.md
@ -4,17 +4,20 @@ In a Python 3 virtualenv:

 ~~~
 pip install -e .
+pip install -r requirements-test.txt
 make test
 ~~~

-Release
-------
-* Update ocrd-tool.json version
-* Update setup.py version
-* git commit -m 'v<version>'
-* git tag -m 'v<version>' 'v<version>'
-* git push --tags
+Releasing
+---------
+* Update `ocrd-tool.json` version
+* Update `setup.py` version
+* `git commit -m 'v<version>'`
+* `git tag -m 'v<version>' 'v<version>'`
+* `git push --tags`
+* Do a release on GitHub

-PyPI:
-* python sdist bdist_wheel
-* twine upload dist/ocrd_calamari-<version>*
+### Uploading to PyPI
+* `rm -rf dist/`, or back it up if `dist/` already exists
+* In the virtualenv: `python setup.py sdist bdist_wheel`
+* `twine upload dist/ocrd_calamari-<version>*`
--- a/README.md
+++ b/README.md
@ -8,11 +8,22 @@

 ## Introduction

-This offers a OCR-D compliant workspace processor for some of the functionality of Calamari OCR.
+**ocrd_calamari** offers an [OCR-D](https://ocr-d.de)-compliant workspace processor for the functionality of Calamari OCR. It uses OCR-D workspaces (METS) with [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) documents as input and output.

 This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized 
 image) as its input.

+In addition to the line text it may also output word and glyph segmentation
+including per-glyph confidence values and per-glyph alternative predictions as
+provided by the Calamari OCR engine, using a `textequiv_level` of `word` or
+`glyph`. Note that while Calamari does not provide word segmentation, this
+processor produces word segmentation inferred from text
+segmentation and the glyph positions. The provided glyph and word segmentation
+can be used for text extraction and highlighting, but is probably not useful for
+further image-based processing.
+
+![Example output as viewed in PAGE Viewer](https://github.com/OCR-D/ocrd_calamari/raw/screenshots/output-in-page-viewer.jpg)
+
 ## Installation

 ### From PyPI
@ -29,32 +40,44 @@ pip install .

 ## Install models

-Download standard models:
+Download models trained on GT4HistOCR data:

 ```
-wget https://github.com/Calamari-OCR/calamari_models/archive/master.zip
-unzip master.zip
+make gt4histocr-calamari1
+ls gt4histocr-calamari1
 ```

-Download models trained on GT4HistOCR data:
+Manual download: [model.tar.xz](https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz)

+## Example Usage
+Before using `ocrd-calamari-recognize`, get some example data and a model, and
+prepare the document for OCR:
 ```
-make gt4histocr-calamari
-ls gt4histocr-calamari
+# Download model and example data
+make gt4histocr-calamari1
+make actevedef_718448162
+
+# Create binarized images and line segmentation using other OCR-D projects
+cd actevedef_718448162
+ocrd-olena-binarize -P impl sauvola-ms-split -I OCR-D-IMG -O OCR-D-IMG-BIN
+ocrd-tesserocr-segment-region -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION
+ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
 ```

-## Example Usage
+Finally recognize the text using ocrd_calamari and the downloaded model:
+```
+ocrd-calamari-recognize -P checkpoint "../gt4histocr-calamari1/*.ckpt.json" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
+```
+
+or
+
+```
+ocrd-calamari-recognize -P checkpoint_dir "../gt4histocr-calamari1" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
+```

-~~~
-ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
-~~~

-With `test-parameters.json`:
-~~~
-{
-    "checkpoint": "/path/to/some/trained/models/*.ckpt.json"
-}
-~~~
+You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
+for additional parameters and default values.

 ## Development & Testing
 For information regarding development and testing, please see
--- a/ocrd_calamari/ocrd-tool.json
+++ b/ocrd_calamari/ocrd-tool.json
@ -1,6 +1,6 @@
 {
-  "git_url": "https://github.com/kba/ocrd_calamari",
-  "version": "0.0.3",
+  "git_url": "https://github.com/OCR-D/ocrd_calamari",
+  "version": "1.0.1",
  "tools": {
    "ocrd-calamari-recognize": {
      "executable": "ocrd-calamari-recognize",
@ -18,6 +18,10 @@
        "OCR-D-OCR-CALAMARI"
      ],
      "parameters": {
+        "checkpoint_dir": {
+          "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
+          "type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
+        },
        "checkpoint": {
          "description": "The calamari model files (*.ckpt.json)",
          "type": "string", "format": "file", "cacheable": true
@ -25,6 +29,18 @@
        "voter": {
          "description": "The voting algorithm to use",
          "type": "string", "default": "confidence_voter_default_ctc"
+        },
+        "textequiv_level": {
+          "type": "string",
+          "enum": ["line", "word", "glyph"],
+          "default": "line",
+          "description": "Deepest PAGE XML hierarchy level to include TextEquiv results for"
+        },
+        "glyph_conf_cutoff": {
+          "type": "number",
+          "format": "float",
+          "default": 0.001,
+          "description": "Only include glyph alternatives with confidences above this threshold"
        }
      }
    }
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@ -1,33 +1,50 @@
 from __future__ import absolute_import

 import os
+import itertools
 from glob import glob

 import numpy as np
+from calamari_ocr import __version__ as calamari_version
 from calamari_ocr.ocr import MultiPredictor
 from calamari_ocr.ocr.voting import voter_from_proto
 from calamari_ocr.proto import VoterParams
 from ocrd import Processor
 from ocrd_modelfactory import page_from_file
-from ocrd_models.ocrd_page import to_xml
-from ocrd_models.ocrd_page_generateds import TextEquivType
-from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE
+from ocrd_models.ocrd_page import (
+        LabelType, LabelsType,
+        MetadataItemType,
+        TextEquivType,
+        WordType, GlyphType, CoordsType,
+        to_xml
+)
+from ocrd_utils import (
+        getLogger, concat_padded,
+        coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1,
+        make_file_id, assert_file_grp_cardinality,
+        MIMETYPE_PAGE
+)

 from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL

-log = getLogger('processor.CalamariRecognize')
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
+from tensorflow import __version__ as tensorflow_version
+
+TOOL = 'ocrd-calamari-recognize'


 class CalamariRecognize(Processor):

    def __init__(self, *args, **kwargs):
-        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize']
-        kwargs['version'] = OCRD_TOOL['version']
+        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
+        kwargs['version'] = '%s (calamari %s, tensorflow %s)' % (OCRD_TOOL['version'], calamari_version, tensorflow_version)
        super(CalamariRecognize, self).__init__(*args, **kwargs)

    def _init_calamari(self):
-        os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL

+        if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
+            resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
+            self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
        checkpoints = glob(self.parameter['checkpoint'])
        self.predictor = MultiPredictor(checkpoints=checkpoints)

@ -43,16 +60,14 @@ class CalamariRecognize(Processor):
        voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
        self.voter = voter_from_proto(voter_params)

-    def _make_file_id(self, input_file, n):
-        file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-        if file_id == input_file.ID:
-            file_id = concat_padded(self.output_file_grp, n)
-        return file_id
-
    def process(self):
        """
        Performs the recognition.
        """
+        log = getLogger('processor.CalamariRecognize')
+
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)

        self._init_calamari()

@ -71,44 +86,169 @@ class CalamariRecognize(Processor):

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
+                line_images_np = []
                for line in textlines:
                    log.debug("Recognizing line '%s' in region '%s'", line.id, region.id)

-                    line_image, line_coords = self.workspace.image_from_segment(
-                        line, region_image, region_coords, feature_selector=self.features)
-                    if ('binarized' not in line_coords['features'] and
-                        'grayscale_normalized' not in line_coords['features'] and
-                        self.input_channels == 1):
+                    line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_coords, feature_selector=self.features)
+                    if ('binarized' not in line_coords['features'] and 'grayscale_normalized' not in line_coords['features'] and self.input_channels == 1):
                        # We cannot use a feature selector for this since we don't
                        # know whether the model expects (has been trained on)
                        # binarized or grayscale images; but raw images are likely
                        # always inadequate:
-                        log.warning("Using raw image for line '%s' in region '%s'",
-                                    line.id, region.id)
-                    
+                        log.warning("Using raw image for line '%s' in region '%s'", line.id, region.id)
+
+                    line_image = line_image if all(line_image.size) else [[0]]
                    line_image_np = np.array(line_image, dtype=np.uint8)
+                    line_images_np.append(line_image_np)
+                raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False)
+
+                for line, raw_results in zip(textlines, raw_results_all):

-                    raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

-                    line_text = prediction.sentence
-                    line_conf = prediction.avg_char_probability
+                    # Build line text on our own
+                    #
+                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
+                    # on prediction.positions. Do it on our own to have consistency.
+                    #
+                    # XXX Check Calamari's built-in post-processing on prediction.sentence
+
+
+                    def _sort_chars(p):
+                        """Filter and sort chars of prediction p"""
+                        chars = p.chars
+                        chars = [c for c in chars if c.char]  # XXX Note that omission probabilities are not normalized?!
+                        chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']]
+                        chars = sorted(chars, key=lambda k: k.probability, reverse=True)
+                        return chars
+                    def _drop_leading_spaces(positions):
+                        return list(itertools.dropwhile(lambda p: _sort_chars(p)[0].char == " ", positions))
+                    def _drop_trailing_spaces(positions):
+                        return list(reversed(_drop_leading_spaces(reversed(positions))))
+                    def _drop_double_spaces(positions):
+                        def _drop_double_spaces_generator(positions):
+                            last_was_space = False
+                            for p in positions:
+                                if p.chars[0].char == " ":
+                                    if not last_was_space:
+                                        yield p
+                                    last_was_space = True
+                                else:
+                                    yield p
+                                    last_was_space = False
+                        return list(_drop_double_spaces_generator(positions))
+                    positions = prediction.positions
+                    positions = _drop_leading_spaces(positions)
+                    positions = _drop_trailing_spaces(positions)
+                    positions = _drop_double_spaces(positions)
+                    positions = list(positions)
+
+                    line_text = ''.join(_sort_chars(p)[0].char for p in positions)
+                    if line_text != prediction.sentence:
+                        log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
+                                    line_text, prediction.sentence)

+                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results", line.id)
-                    line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
-                    
+                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning("Line '%s' already contained word segmentation", line.id)
                    line.set_Word([])

+                    # Save line results
+                    line_conf = prediction.avg_char_probability
+                    line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
+
+
+                    # Save word results
+                    #
+                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
+                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
+                    # hierarchy of lines > words > glyphs.
+
+                    def _words(s):
+                        """Split words based on spaces and include spaces as 'words'"""
+                        spaces = None
+                        word = ''
+                        for c in s:
+                            if c == ' ' and spaces is True:
+                                word += c
+                            elif c != ' ' and spaces is False:
+                                word += c
+                            else:
+                                if word:
+                                    yield word
+                                word = c
+                                spaces = (c == ' ')
+                        yield word
+
+                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
+                        word_no = 0
+                        i = 0
+
+
+
+                        for word_text in _words(line_text):
+                            word_length = len(word_text)
+                            if not all(c == ' ' for c in word_text):
+                                word_positions = positions[i:i+word_length]
+                                word_start = word_positions[0].global_start
+                                word_end = word_positions[-1].global_end
+
+                                polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height])
+                                points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
+                                # XXX Crop to line polygon?
+
+                                word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points))
+                                word.add_TextEquiv(TextEquivType(Unicode=word_text))
+
+                                if self.parameter['textequiv_level'] == 'glyph':
+                                    for glyph_no, p in enumerate(word_positions):
+                                        glyph_start = p.global_start
+                                        glyph_end = p.global_end
+
+                                        polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height])
+                                        points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
+
+                                        glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points))
+
+                                        # Add predictions (= TextEquivs)
+                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
+                                        for char_index, char in enumerate(_sort_chars(p), start=char_index_start):
+                                            glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))
+
+                                        word.add_Glyph(glyph)
+
+                                line.add_Word(word)
+                                word_no += 1
+
+                            i += word_length
+
+
            _page_update_higher_textequiv_levels('line', pcgts)

-            file_id = self._make_file_id(input_file, n)
+
+            # Add metadata about this operation and its runtime parameters:
+            metadata = pcgts.get_Metadata()  # ensured by from_file()
+            metadata.add_MetadataItem(
+                MetadataItemType(type_="processingStep",
+                                 name=self.ocrd_tool['steps'][0],
+                                 value=TOOL,
+                                 Labels=[LabelsType(
+                                     externalModel="ocrd-tool",
+                                     externalId="parameters",
+                                     Label=[LabelType(type_=name, value=self.parameter[name])
+                                            for name in self.parameter.keys()])]))
+
+
+            file_id = make_file_id(input_file, self.output_file_grp)
+            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
@ -151,3 +291,5 @@ def _page_update_higher_textequiv_levels(level, pcgts):
                                        else u'' for line in lines)
            region.set_TextEquiv(
                [TextEquivType(Unicode=region_unicode)])  # remove old
+
+# vim:tw=120:
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +1,6 @@
-numpy
-tensorflow-gpu == 1.14.0
-calamari-ocr == 0.3.5
+h5py < 3  # XXX tensorflow 2.4.0rc3 requires h5py~=2.10.0, but you'll have h5py 3.1.0 which is incompatible.
+tensorflow >= 2.3.0rc2
+calamari-ocr == 1.0.*
 setuptools >= 41.0.0  # tensorboard depends on this, but why do we get an error at runtime?
 click
-ocrd >= 1.0.0b11
+ocrd >= 2.22.0
--- a/setup.py
+++ b/setup.py
@ -5,15 +5,15 @@ from setuptools import setup, find_packages

 setup(
    name='ocrd_calamari',
-    version='0.0.3',
+    version='1.0.1',
    description='Calamari bindings',
    long_description=Path('README.md').read_text(),
    long_description_content_type='text/markdown',
    author='Konstantin Baierer, Mike Gerber',
    author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de',
-    url='https://github.com/kba/ocrd_calamari',
+    url='https://github.com/OCR-D/ocrd_calamari',
    license='Apache License 2.0',
-    packages=find_packages(exclude=('tests', 'docs')),
+    packages=find_packages(exclude=('test', 'docs')),
    install_requires=Path('requirements.txt').read_text().split('\n'),
    package_data={
        '': ['*.json', '*.yml', '*.yaml'],
--- a/test/test_recognize.py
+++ b/test/test_recognize.py
@ -2,6 +2,8 @@ import os
 import shutil
 import subprocess
 import urllib.request
+from lxml import etree
+from glob import glob

 import pytest
 import logging
@ -10,9 +12,14 @@ from ocrd.resolver import Resolver
 from ocrd_calamari import CalamariRecognize
 from .base import assets

-METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
-CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
+METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
 WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
+CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
+CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
+
+# Because XML namespace versions are so much fun, we not only use one, we use TWO!
+NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
+NSMAP_GT = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" }


@pytest.fixture
@ -32,10 +39,6 @@ def workspace():
            "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
            os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))

-    return workspace
-
-
-def test_recognize(workspace):
    # The binarization options I have are:
    #
    # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
@ -48,17 +51,49 @@ def test_recognize(workspace):
        ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
        subprocess.call(['convert', ff, '-threshold', '50%', ff])

-    # XXX Should remove GT text to really test this
+    # Remove GT Words and TextEquivs, to not accidentally check GT text instead of the OCR text
+    # XXX Review data again
+    # XXX Make this more robust against namespace version changes
+    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
+        workspace.download_file(of)
+    for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
+        for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
+            tree = etree.parse(ff)
+            for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
+                e.getparent().remove(e)
+            tree.write(ff, xml_declaration=True, encoding="utf-8")
+
+    return workspace
+

+def test_recognize(workspace):
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
-        parameter={'checkpoint': CHECKPOINT}
+        parameter={
+            "checkpoint": CHECKPOINT,
+        }
    ).process()
    workspace.save_mets()

-    page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
+    assert os.path.exists(page1)
+    with open(page1, "r", encoding="utf-8") as f:
+        assert "verſchuldeten" in f.read()
+
+def test_recognize_with_checkpoint_dir(workspace):
+    CalamariRecognize(
+        workspace,
+        input_file_grp="OCR-D-GT-SEG-LINE",
+        output_file_grp="OCR-D-OCR-CALAMARI",
+        parameter={
+            "checkpoint_dir": CHECKPOINT_DIR,
+        }
+    ).process()
+    workspace.save_mets()
+
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
    with open(page1, 'r', encoding='utf-8') as f:
        assert 'verſchuldeten' in f.read()
@ -75,3 +110,61 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works

    interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
    assert len(interesting_log_messages) > 10  # For every line!
+    with open(page1, "r", encoding="utf-8") as f:
+        assert "verſchuldeten" in f.read()
+
+
+def test_word_segmentation(workspace):
+    CalamariRecognize(
+        workspace,
+        input_file_grp="OCR-D-GT-SEG-LINE",
+        output_file_grp="OCR-D-OCR-CALAMARI",
+        parameter={
+            "checkpoint": CHECKPOINT,
+            "textequiv_level": "word",   # Note that we're going down to word level here
+        }
+    ).process()
+    workspace.save_mets()
+
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
+    assert os.path.exists(page1)
+    tree = etree.parse(page1)
+
+    # The result should contain a TextLine that contains the text "December"
+    line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0]
+    assert line
+
+    # The textline should a. contain multiple words and b. these should concatenate fine to produce the same line text
+    words = line.xpath(".//pc:Word", namespaces=NSMAP)
+    assert len(words) >= 2
+    words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words)
+    line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
+    assert words_text == line_text
+
+    # For extra measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word"
+    glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
+    assert len(glyphs) == 0
+
+
+def test_glyphs(workspace):
+    CalamariRecognize(
+        workspace,
+        input_file_grp="OCR-D-GT-SEG-LINE",
+        output_file_grp="OCR-D-OCR-CALAMARI",
+        parameter={
+            "checkpoint": CHECKPOINT,
+            "textequiv_level": "glyph",   # Note that we're going down to glyph level here
+        }
+    ).process()
+    workspace.save_mets()
+
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
+    assert os.path.exists(page1)
+    tree = etree.parse(page1)
+
+    # The result should contain a lot of glyphs
+    glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
+    assert len(glyphs) >= 100
+
+
+# vim:tw=120: