Merge branch 'master' into image-features

2025-10-14 08:49:53 +02:00 · 2021-02-09 18:17:23 +01:00 · 2021-02-09 18:17:23 +01:00 · c0902cdef5
commit c0902cdef5
parent 377466a71a 1bb72cbaf1
12 changed files with 385 additions and 105 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -7,15 +7,23 @@ jobs:
  build-python36:
    docker:
      - image: ubuntu:18.04
    environment:
      - PYTHONIOENCODING: utf-8
    steps:
-      - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick
+      - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick locales
      - run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
      - checkout
      - run: pip3 install --upgrade pip
      - run: make install PIP_INSTALL="pip3 install"
      - run: pip3 install -r requirements-test.txt
-      - run: make coverage
+      - run: make coverage LC_ALL=en_US.utf8
      - codecov/upload
 workflows:
  build:
    jobs:
-      - build-python36
+      - build-python36:
          filters:
            branches:
              ignore:
                - screenshots
--- a/.gitignore
+++ b/.gitignore
@ -107,5 +107,7 @@ venv.bak/
 /calamari
 /calamari_models
 /gt4histocr-calamari
 /actevedef_718448162*
 /repo
 /test/assets
 gt4histocr-calamari*
--- a/8
+++ b/8
@ -1,4 +1,4 @@
-FROM ocrd/core:edge
+FROM ocrd/core
 MAINTAINER OCR-D
 ENV DEBIAN_FRONTEND noninteractive
 ENV PYTHONIOENCODING utf8
@ -10,10 +10,12 @@ COPY Makefile .
 COPY setup.py .
 COPY ocrd-tool.json .
 COPY requirements.txt .
 COPY README.md .
 COPY ocrd_calamari ocrd_calamari
-RUN make calamari/build
+RUN pip3 install --upgrade pip && \
-RUN pip3 install .
+    pip3 install . && \
    pip3 check
 ENTRYPOINT ["/usr/local/bin/ocrd-calamari-recognize"]
--- a/2
+++ b/2
@ -186,7 +186,7 @@
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2018-2020 Konstantin Baierer, Mike Gerber
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
--- a/37
+++ b/37
@ -1,3 +1,4 @@
 export  # export variables to subshells
 PIP_INSTALL = pip3 install
 GIT_CLONE = git clone
 PYTHON = python3
@ -10,10 +11,8 @@ help:
 	@echo "  Targets"
 	@echo ""
 	@echo "    install          Install ocrd_calamari"
-	@echo "    calamari         Clone calamari repo"
+	@echo "    gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
-	@echo "    calamari_models  Clone calamari_models repo"
+	@echo "    actevedef_718448162 Download example data"
 	@echo "    gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)"
 	@echo "    calamari/build   pip install calamari"
 	@echo "    deps-test        Install testing python deps via pip"
 	@echo "    repo/assets      Clone OCR-D/assets to ./repo/assets"
 	@echo "    test/assets      Setup test assets"
@ -33,29 +32,21 @@ help:
 install:
 	$(PIP_INSTALL) .
 # Clone calamari repo
 calamari:
 	$(GIT_CLONE) https://github.com/chwick/calamari
-# Clone calamari_models repo
+# Get GT4HistOCR Calamari model (from SBB)
-calamari_models:
+gt4histocr-calamari1:
-	$(GIT_CLONE) -n https://github.com/chwick/calamari_models
+	mkdir -p gt4histocr-calamari1
-	# Checkout latest version that works with calamari-ocr==0.3.5:
+	cd gt4histocr-calamari1 && \
-	cd calamari_models && git checkout f76b1d3ec
+	wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
 gt4histocr-calamari:
 	mkdir gt4histocr-calamari
 	cd gt4histocr-calamari && \
 	wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz && \
 	tar xfv model.tar.xz && \
 	rm model.tar.xz
 # Download example data
 actevedef_718448162:
 	wget https://qurator-data.de/examples/actevedef_718448162.zip && \
 	unzip actevedef_718448162.zip
 # pip install calamari
 calamari/build: calamari calamari_models
 	cd calamari && $(PIP_INSTALL) .
 #
 # Assets and Tests
@ -82,12 +73,12 @@ assets-clean:
 	rm -rf test/assets
 # Run unit tests
-test: test/assets gt4histocr-calamari
+test: test/assets gt4histocr-calamari1
 	# declare -p HTTP_PROXY
 	$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)
 # Run unit tests and determine test coverage
-coverage: test/assets calamari_models
+coverage: test/assets gt4histocr-calamari1
 	coverage erase
 	make test PYTHON="coverage run"
 	coverage report
--- a/README-DEV.md
+++ b/README-DEV.md
@ -4,17 +4,20 @@ In a Python 3 virtualenv:
 ~~~
 pip install -e .
 pip install -r requirements-test.txt
 make test
 ~~~
-Release
+Releasing
-------
+---------
-* Update ocrd-tool.json version
+* Update `ocrd-tool.json` version
-* Update setup.py version
+* Update `setup.py` version
-* git commit -m 'v<version>'
+* `git commit -m 'v<version>'`
-* git tag -m 'v<version>' 'v<version>'
+* `git tag -m 'v<version>' 'v<version>'`
-* git push --tags
+* `git push --tags`
 * Do a release on GitHub
-PyPI:
+### Uploading to PyPI
-* python sdist bdist_wheel
+* `rm -rf dist/` or backup if `dist/` exists already
-* twine upload dist/ocrd_calamari-<version>*
+* In the virtualenv: `python setup.py sdist bdist_wheel`
 * `twine upload dist/ocrd_calamari-<version>*`
--- a/README.md
+++ b/README.md
@ -8,11 +8,22 @@
 ## Introduction
-This offers a OCR-D compliant workspace processor for some of the functionality of Calamari OCR.
+**ocrd_calamari** offers an [OCR-D](https://ocr-d.de) compliant workspace processor for the functionality of Calamari OCR. It uses OCR-D workspaces (METS) with [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) documents as input and output.
 This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized 
 image) as its input.
 In addition to the line text it may also output word and glyph segmentation
 including per-glyph confidence values and per-glyph alternative predictions as
 provided by the Calamari OCR engine, using a `textequiv_level` of `word` or
 `glyph`. Note that while Calamari does not provide word segmentation, this
 processor produces word segmentation inferred from text
 segmentation and the glyph positions. The provided glyph and word segmentation
 can be used for text extraction and highlighting, but is probably not useful for
 further image-based processing.
 ![Example output as viewed in PAGE Viewer](https://github.com/OCR-D/ocrd_calamari/raw/screenshots/output-in-page-viewer.jpg)
 ## Installation
 ### From PyPI
@ -29,32 +40,44 @@ pip install .
 ## Install models
 Download standard models:
 ```
 wget https://github.com/Calamari-OCR/calamari_models/archive/master.zip
 unzip master.zip
 ```
 Download models trained on GT4HistOCR data:
 ```
-make gt4histocr-calamari
+make gt4histocr-calamari1
-ls gt4histocr-calamari
+ls gt4histocr-calamari1
 ```
 Manual download: [model.tar.xz](https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz)
 ## Example Usage
 Before using `ocrd-calamari-recognize` get some example data and model, and
 prepare the document for OCR:
 ```
 # Download model and example data
 make gt4histocr-calamari1
 make actevedef_718448162
-~~~
+# Create binarized images and line segmentation using other OCR-D projects
-ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
+cd actevedef_718448162
-~~~
+ocrd-olena-binarize -P impl sauvola-ms-split -I OCR-D-IMG -O OCR-D-IMG-BIN
 ocrd-tesserocr-segment-region -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION
 ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
 ```
-With `test-parameters.json`:
+Finally recognize the text using ocrd_calamari and the downloaded model:
-~~~
+```
-{
+ocrd-calamari-recognize -P checkpoint "../gt4histocr-calamari1/*.ckpt.json" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
-    "checkpoint": "/path/to/some/trained/models/*.ckpt.json"
+```
-}
+
-~~~
+or
 ```
 ocrd-calamari-recognize -P checkpoint_dir "../gt4histocr-calamari1" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
 ```
 You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
 for additional parameters and default values.
 ## Development & Testing
 For information regarding development and testing, please see
--- a/ocrd_calamari/ocrd-tool.json
+++ b/ocrd_calamari/ocrd-tool.json
@ -1,6 +1,6 @@
 {
-  "git_url": "https://github.com/kba/ocrd_calamari",
+  "git_url": "https://github.com/OCR-D/ocrd_calamari",
-  "version": "0.0.3",
+  "version": "1.0.1",
  "tools": {
    "ocrd-calamari-recognize": {
      "executable": "ocrd-calamari-recognize",
@ -18,6 +18,10 @@
        "OCR-D-OCR-CALAMARI"
      ],
      "parameters": {
        "checkpoint_dir": {
          "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
          "type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
        },
        "checkpoint": {
          "description": "The calamari model files (*.ckpt.json)",
          "type": "string", "format": "file", "cacheable": true
@ -25,6 +29,18 @@
        "voter": {
          "description": "The voting algorithm to use",
          "type": "string", "default": "confidence_voter_default_ctc"
        },
        "textequiv_level": {
          "type": "string",
          "enum": ["line", "word", "glyph"],
          "default": "line",
          "description": "Deepest PAGE XML hierarchy level to include TextEquiv results for"
        },
        "glyph_conf_cutoff": {
          "type": "number",
          "format": "float",
          "default": 0.001,
          "description": "Only include glyph alternatives with confidences above this threshold"
        }
      }
    }
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@ -1,33 +1,50 @@
 from __future__ import absolute_import
 import os
 import itertools
 from glob import glob
 import numpy as np
 from calamari_ocr import __version__ as calamari_version
 from calamari_ocr.ocr import MultiPredictor
 from calamari_ocr.ocr.voting import voter_from_proto
 from calamari_ocr.proto import VoterParams
 from ocrd import Processor
 from ocrd_modelfactory import page_from_file
-from ocrd_models.ocrd_page import to_xml
+from ocrd_models.ocrd_page import (
-from ocrd_models.ocrd_page_generateds import TextEquivType
+        LabelType, LabelsType,
-from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE
+        MetadataItemType,
        TextEquivType,
        WordType, GlyphType, CoordsType,
        to_xml
 )
 from ocrd_utils import (
        getLogger, concat_padded,
        coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1,
        make_file_id, assert_file_grp_cardinality,
        MIMETYPE_PAGE
 )
 from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL
-log = getLogger('processor.CalamariRecognize')
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
 from tensorflow import __version__ as tensorflow_version
 TOOL = 'ocrd-calamari-recognize'
 class CalamariRecognize(Processor):
    def __init__(self, *args, **kwargs):
-        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize']
+        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
-        kwargs['version'] = OCRD_TOOL['version']
+        kwargs['version'] = '%s (calamari %s, tensorflow %s)' % (OCRD_TOOL['version'], calamari_version, tensorflow_version)
        super(CalamariRecognize, self).__init__(*args, **kwargs)
    def _init_calamari(self):
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
        if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
            resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
            self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
        checkpoints = glob(self.parameter['checkpoint'])
        self.predictor = MultiPredictor(checkpoints=checkpoints)
@ -43,16 +60,14 @@ class CalamariRecognize(Processor):
        voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
        self.voter = voter_from_proto(voter_params)
    def _make_file_id(self, input_file, n):
        file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        return file_id
    def process(self):
        """
        Performs the recognition.
        """
        log = getLogger('processor.CalamariRecognize')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        self._init_calamari()
@ -71,44 +86,169 @@ class CalamariRecognize(Processor):
                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
                line_images_np = []
                for line in textlines:
                    log.debug("Recognizing line '%s' in region '%s'", line.id, region.id)
-                    line_image, line_coords = self.workspace.image_from_segment(
+                    line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_coords, feature_selector=self.features)
-                        line, region_image, region_coords, feature_selector=self.features)
+                    if ('binarized' not in line_coords['features'] and 'grayscale_normalized' not in line_coords['features'] and self.input_channels == 1):
                    if ('binarized' not in line_coords['features'] and
                        'grayscale_normalized' not in line_coords['features'] and
                        self.input_channels == 1):
                        # We cannot use a feature selector for this since we don't
                        # know whether the model expects (has been trained on)
                        # binarized or grayscale images; but raw images are likely
                        # always inadequate:
-                        log.warning("Using raw image for line '%s' in region '%s'",
+                        log.warning("Using raw image for line '%s' in region '%s'", line.id, region.id)
-                                    line.id, region.id)
+
-                    
+                    line_image = line_image if all(line_image.size) else [[0]]
-                    line_image_np = np.array(line_image, dtype=np.uint8)
+                    line_image_np = np.array(line_image, dtype=np.uint8)
                    line_images_np.append(line_image_np)
                raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False)
                for line, raw_results in zip(textlines, raw_results_all):
                    raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)
                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"
-                    line_text = prediction.sentence
+                    # Build line text on our own
-                    line_conf = prediction.avg_char_probability
+                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence
                    def _sort_chars(p):
                        """Filter and sort chars of prediction p"""
                        chars = p.chars
                        chars = [c for c in chars if c.char]  # XXX Note that omission probabilities are not normalized?!
                        chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']]
                        chars = sorted(chars, key=lambda k: k.probability, reverse=True)
                        return chars
                    def _drop_leading_spaces(positions):
                        return list(itertools.dropwhile(lambda p: _sort_chars(p)[0].char == " ", positions))
                    def _drop_trailing_spaces(positions):
                        return list(reversed(_drop_leading_spaces(reversed(positions))))
                    def _drop_double_spaces(positions):
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False
                        return list(_drop_double_spaces_generator(positions))
                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)
                    line_text = ''.join(_sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
                                    line_text, prediction.sentence)
                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results", line.id)
-                    line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
+                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning("Line '%s' already contained word segmentation", line.id)
                    line.set_Word([])
                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.
                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word
                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        i = 0
                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i+word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end
                                polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height])
                                points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
                                # XXX Crop to line polygon?
                                word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points))
                                word.add_TextEquiv(TextEquivType(Unicode=word_text))
                                if self.parameter['textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end
                                        polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height])
                                        points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
                                        glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points))
                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(_sort_chars(p), start=char_index_start):
                                            glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))
                                        word.add_Glyph(glyph)
                                line.add_Word(word)
                                word_no += 1
                            i += word_length
            _page_update_higher_textequiv_levels('line', pcgts)
-            file_id = self._make_file_id(input_file, n)
+
            # Add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name, value=self.parameter[name])
                                            for name in self.parameter.keys()])]))
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
@ -151,3 +291,5 @@ def _page_update_higher_textequiv_levels(level, pcgts):
                                        else u'' for line in lines)
            region.set_TextEquiv(
                [TextEquivType(Unicode=region_unicode)])  # remove old
 # vim:tw=120:
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +1,6 @@
-numpy
+h5py < 3  # XXX tensorflow 2.4.0rc3 requires h5py~=2.10.0, but you'll have h5py 3.1.0 which is incompatible.
-tensorflow-gpu == 1.14.0
+tensorflow >= 2.3.0rc2
-calamari-ocr == 0.3.5
+calamari-ocr == 1.0.*
 setuptools >= 41.0.0  # tensorboard depends on this, but why do we get an error at runtime?
 click
-ocrd >= 1.0.0b11
+ocrd >= 2.22.0
--- a/setup.py
+++ b/setup.py
@ -5,15 +5,15 @@ from setuptools import setup, find_packages
 setup(
    name='ocrd_calamari',
-    version='0.0.3',
+    version='1.0.1',
    description='Calamari bindings',
    long_description=Path('README.md').read_text(),
    long_description_content_type='text/markdown',
    author='Konstantin Baierer, Mike Gerber',
    author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de',
-    url='https://github.com/kba/ocrd_calamari',
+    url='https://github.com/OCR-D/ocrd_calamari',
    license='Apache License 2.0',
-    packages=find_packages(exclude=('tests', 'docs')),
+    packages=find_packages(exclude=('test', 'docs')),
    install_requires=Path('requirements.txt').read_text().split('\n'),
    package_data={
        '': ['*.json', '*.yml', '*.yaml'],
--- a/test/test_recognize.py
+++ b/test/test_recognize.py
@ -2,6 +2,8 @@ import os
 import shutil
 import subprocess
 import urllib.request
 from lxml import etree
 from glob import glob
 import pytest
 import logging
@ -10,9 +12,14 @@ from ocrd.resolver import Resolver
 from ocrd_calamari import CalamariRecognize
 from .base import assets
-METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
+METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
 CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
 WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
 CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
 CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
 # Because XML namespace versions are so much fun, we not only use one, we use TWO!
 NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
 NSMAP_GT = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" }
@pytest.fixture
@ -32,10 +39,6 @@ def workspace():
            "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
            os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
    return workspace
 def test_recognize(workspace):
    # The binarization options I have are:
    #
    # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
@ -48,17 +51,49 @@ def test_recognize(workspace):
        ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
        subprocess.call(['convert', ff, '-threshold', '50%', ff])
-    # XXX Should remove GT text to really test this
+    # Remove GT Words and TextEquivs, to not accidentally check GT text instead of the OCR text
    # XXX Review data again
    # XXX Make this more robust against namespace version changes
    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
        workspace.download_file(of)
    for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
        for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
            tree = etree.parse(ff)
            for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
                e.getparent().remove(e)
            tree.write(ff, xml_declaration=True, encoding="utf-8")
    return workspace
 def test_recognize(workspace):
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
-        parameter={'checkpoint': CHECKPOINT}
+        parameter={
            "checkpoint": CHECKPOINT,
        }
    ).process()
    workspace.save_mets()
-    page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
    with open(page1, "r", encoding="utf-8") as f:
        assert "verſchuldeten" in f.read()
 def test_recognize_with_checkpoint_dir(workspace):
    """Run the recognizer with the model given via ``checkpoint_dir`` rather than ``checkpoint``.

    Processes the ``OCR-D-GT-SEG-LINE`` file group into ``OCR-D-OCR-CALAMARI``, saves the METS,
    and then checks that the first output page file exists and contains the word "verſchuldeten".
    """
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={
            "checkpoint_dir": CHECKPOINT_DIR,
        }
    ).process()
    workspace.save_mets()
    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
    # The recognized text of the first page is expected to contain this word
    with open(page1, 'r', encoding='utf-8') as f:
        assert 'verſchuldeten' in f.read()
@ -75,3 +110,61 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
    interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
    assert len(interesting_log_messages) > 10  # For every line!
    with open(page1, "r", encoding="utf-8") as f:
        assert "verſchuldeten" in f.read()
 def test_word_segmentation(workspace):
    """Check the word-level output produced with ``textequiv_level == "word"``.

    Runs the recognizer on ``OCR-D-GT-SEG-LINE``, parses the first output page and asserts that
    the line containing "December" has at least two ``pc:Word`` elements whose texts, joined by
    single spaces, equal the line text, and that the page contains no ``pc:Glyph`` elements.
    """
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={
            "checkpoint": CHECKPOINT,
            "textequiv_level": "word",   # Note that we're going down to word level here
        }
    ).process()
    workspace.save_mets()
    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
    tree = etree.parse(page1)
    # The result should contain a TextLine that contains the text "December"
    # (indexing with [0] raises IndexError if no such line exists)
    line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0]
    assert line
    # The text line should (a) contain multiple words and (b) these words, joined by single spaces,
    # should reproduce the line text exactly
    words = line.xpath(".//pc:Word", namespaces=NSMAP)
    assert len(words) >= 2
    words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words)
    line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
    assert words_text == line_text
    # For good measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word"
    glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
    assert len(glyphs) == 0
 def test_glyphs(workspace):
    """Check the glyph-level output produced with ``textequiv_level == "glyph"``.

    Runs the recognizer on ``OCR-D-GT-SEG-LINE``, parses the first output page and asserts that
    it contains at least 100 ``pc:Glyph`` elements.
    """
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={
            "checkpoint": CHECKPOINT,
            "textequiv_level": "glyph",   # Note that we're going down to glyph level here
        }
    ).process()
    workspace.save_mets()
    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
    tree = etree.parse(page1)
    # The result should contain a lot of glyphs
    glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
    assert len(glyphs) >= 100
 # vim:tw=120: