Merge branch 'master' into image-features

fix/readme-no-checkpoint
Gerber, Mike 4 years ago
commit c0902cdef5

@ -7,15 +7,23 @@ jobs:
build-python36:
docker:
- image: ubuntu:18.04
environment:
- PYTHONIOENCODING: utf-8
steps:
- run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick
- run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick locales
- run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
- checkout
- run: pip3 install --upgrade pip
- run: make install PIP_INSTALL="pip3 install"
- run: pip3 install -r requirements-test.txt
- run: make coverage
- run: make coverage LC_ALL=en_US.utf8
- codecov/upload
workflows:
build:
jobs:
- build-python36
- build-python36:
filters:
branches:
ignore:
- screenshots

2
.gitignore vendored

@ -107,5 +107,7 @@ venv.bak/
/calamari
/calamari_models
/gt4histocr-calamari
/actevedef_718448162*
/repo
/test/assets
gt4histocr-calamari*

@ -1,4 +1,4 @@
FROM ocrd/core:edge
FROM ocrd/core
MAINTAINER OCR-D
ENV DEBIAN_FRONTEND noninteractive
ENV PYTHONIOENCODING utf8
@ -10,10 +10,12 @@ COPY Makefile .
COPY setup.py .
COPY ocrd-tool.json .
COPY requirements.txt .
COPY README.md .
COPY ocrd_calamari ocrd_calamari
RUN make calamari/build
RUN pip3 install .
RUN pip3 install --upgrade pip && \
pip3 install . && \
pip3 check
ENTRYPOINT ["/usr/local/bin/ocrd-calamari-recognize"]

@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Copyright 2018-2020 Konstantin Baierer, Mike Gerber
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

@ -1,3 +1,4 @@
export # export variables to subshells
PIP_INSTALL = pip3 install
GIT_CLONE = git clone
PYTHON = python3
@ -10,10 +11,8 @@ help:
@echo " Targets"
@echo ""
@echo " install Install ocrd_calamari"
@echo " calamari Clone calamari repo"
@echo " calamari_models Clone calamari_models repo"
@echo " gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)"
@echo " calamari/build pip install calamari"
@echo " gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
@echo " actevedef_718448162 Download example data"
@echo " deps-test Install testing python deps via pip"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@echo " test/assets Setup test assets"
@ -33,29 +32,21 @@ help:
install:
$(PIP_INSTALL) .
# Clone calamari repo
calamari:
$(GIT_CLONE) https://github.com/chwick/calamari
# Clone calamari_models repo
calamari_models:
$(GIT_CLONE) -n https://github.com/chwick/calamari_models
# Checkout latest version that works with calamari-ocr==0.3.5:
cd calamari_models && git checkout f76b1d3ec
gt4histocr-calamari:
mkdir gt4histocr-calamari
cd gt4histocr-calamari && \
wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz && \
# Get GT4HistOCR Calamari model (from SBB)
gt4histocr-calamari1:
mkdir -p gt4histocr-calamari1
cd gt4histocr-calamari1 && \
wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
tar xfv model.tar.xz && \
rm model.tar.xz
# Download example data
actevedef_718448162:
wget https://qurator-data.de/examples/actevedef_718448162.zip && \
unzip actevedef_718448162.zip
# pip install calamari
calamari/build: calamari calamari_models
cd calamari && $(PIP_INSTALL) .
#
# Assets and Tests
@ -82,12 +73,12 @@ assets-clean:
rm -rf test/assets
# Run unit tests
test: test/assets gt4histocr-calamari
test: test/assets gt4histocr-calamari1
# declare -p HTTP_PROXY
$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)
# Run unit tests and determine test coverage
coverage: test/assets calamari_models
coverage: test/assets gt4histocr-calamari1
coverage erase
make test PYTHON="coverage run"
coverage report

@ -4,17 +4,20 @@ In a Python 3 virtualenv:
~~~
pip install -e .
pip install -r requirements-test.txt
make test
~~~
Release
-------
* Update ocrd-tool.json version
* Update setup.py version
* git commit -m 'v<version>'
* git tag -m 'v<version>' 'v<version>'
* git push --tags
Releasing
---------
* Update `ocrd-tool.json` version
* Update `setup.py` version
* `git commit -m 'v<version>'`
* `git tag -m 'v<version>' 'v<version>'`
* `git push --tags`
* Do a release on GitHub
PyPI:
* python sdist bdist_wheel
* twine upload dist/ocrd_calamari-<version>*
### Uploading to PyPI
* `rm -rf dist/` or backup if `dist/` exists already
* In the virtualenv: `python setup.py sdist bdist_wheel`
* `twine upload dist/ocrd_calamari-<version>*`

@ -8,11 +8,22 @@
## Introduction
This offers a OCR-D compliant workspace processor for some of the functionality of Calamari OCR.
**ocrd_calamari** offers an [OCR-D](https://ocr-d.de) compliant workspace processor for the functionality of Calamari OCR. It uses OCR-D workspaces (METS) with [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) documents as input and output.
This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized
image) as its input.
In addition to the line text it may also output word and glyph segmentation
including per-glyph confidence values and per-glyph alternative predictions as
provided by the Calamari OCR engine, using a `textequiv_level` of `word` or
`glyph`. Note that while Calamari does not provide word segmentation, this
processor produces word segmentation inferred from text
segmentation and the glyph positions. The provided glyph and word segmentation
can be used for text extraction and highlighting, but is probably not useful for
further image-based processing.
![Example output as viewed in PAGE Viewer](https://github.com/OCR-D/ocrd_calamari/raw/screenshots/output-in-page-viewer.jpg)
## Installation
### From PyPI
@ -29,32 +40,44 @@ pip install .
## Install models
Download standard models:
Download models trained on GT4HistOCR data:
```
wget https://github.com/Calamari-OCR/calamari_models/archive/master.zip
unzip master.zip
make gt4histocr-calamari1
ls gt4histocr-calamari1
```
Download models trained on GT4HistOCR data:
Manual download: [model.tar.xz](https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz)
## Example Usage
Before using `ocrd-calamari-recognize` get some example data and model, and
prepare the document for OCR:
```
make gt4histocr-calamari
ls gt4histocr-calamari
# Download model and example data
make gt4histocr-calamari1
make actevedef_718448162
# Create binarized images and line segmentation using other OCR-D projects
cd actevedef_718448162
ocrd-olena-binarize -P impl sauvola-ms-split -I OCR-D-IMG -O OCR-D-IMG-BIN
ocrd-tesserocr-segment-region -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION
ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
```
## Example Usage
Finally recognize the text using ocrd_calamari and the downloaded model:
```
ocrd-calamari-recognize -P checkpoint "../gt4histocr-calamari1/*.ckpt.json" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
```
or
```
ocrd-calamari-recognize -P checkpoint_dir "../gt4histocr-calamari1" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
```
~~~
ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
~~~
With `test-parameters.json`:
~~~
{
"checkpoint": "/path/to/some/trained/models/*.ckpt.json"
}
~~~
You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
for additional parameters and default values.
## Development & Testing
For information regarding development and testing, please see

@ -1,6 +1,6 @@
{
"git_url": "https://github.com/kba/ocrd_calamari",
"version": "0.0.3",
"git_url": "https://github.com/OCR-D/ocrd_calamari",
"version": "1.0.1",
"tools": {
"ocrd-calamari-recognize": {
"executable": "ocrd-calamari-recognize",
@ -18,6 +18,10 @@
"OCR-D-OCR-CALAMARI"
],
"parameters": {
"checkpoint_dir": {
"description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
"type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
},
"checkpoint": {
"description": "The calamari model files (*.ckpt.json)",
"type": "string", "format": "file", "cacheable": true
@ -25,6 +29,18 @@
"voter": {
"description": "The voting algorithm to use",
"type": "string", "default": "confidence_voter_default_ctc"
},
"textequiv_level": {
"type": "string",
"enum": ["line", "word", "glyph"],
"default": "line",
"description": "Deepest PAGE XML hierarchy level to include TextEquiv results for"
},
"glyph_conf_cutoff": {
"type": "number",
"format": "float",
"default": 0.001,
"description": "Only include glyph alternatives with confidences above this threshold"
}
}
}

@ -1,33 +1,50 @@
from __future__ import absolute_import
import os
import itertools
from glob import glob
import numpy as np
from calamari_ocr import __version__ as calamari_version
from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import to_xml
from ocrd_models.ocrd_page_generateds import TextEquivType
from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE
from ocrd_models.ocrd_page import (
LabelType, LabelsType,
MetadataItemType,
TextEquivType,
WordType, GlyphType, CoordsType,
to_xml
)
from ocrd_utils import (
getLogger, concat_padded,
coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1,
make_file_id, assert_file_grp_cardinality,
MIMETYPE_PAGE
)
from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL
log = getLogger('processor.CalamariRecognize')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
from tensorflow import __version__ as tensorflow_version
TOOL = 'ocrd-calamari-recognize'
class CalamariRecognize(Processor):
def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize']
kwargs['version'] = OCRD_TOOL['version']
kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
kwargs['version'] = '%s (calamari %s, tensorflow %s)' % (OCRD_TOOL['version'], calamari_version, tensorflow_version)
super(CalamariRecognize, self).__init__(*args, **kwargs)
def _init_calamari(self):
os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
checkpoints = glob(self.parameter['checkpoint'])
self.predictor = MultiPredictor(checkpoints=checkpoints)
@ -43,16 +60,14 @@ class CalamariRecognize(Processor):
voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
self.voter = voter_from_proto(voter_params)
def _make_file_id(self, input_file, n):
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
if file_id == input_file.ID:
file_id = concat_padded(self.output_file_grp, n)
return file_id
def process(self):
"""
Performs the recognition.
"""
log = getLogger('processor.CalamariRecognize')
assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)
self._init_calamari()
@ -71,44 +86,169 @@ class CalamariRecognize(Processor):
textlines = region.get_TextLine()
log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
line_images_np = []
for line in textlines:
log.debug("Recognizing line '%s' in region '%s'", line.id, region.id)
line_image, line_coords = self.workspace.image_from_segment(
line, region_image, region_coords, feature_selector=self.features)
if ('binarized' not in line_coords['features'] and
'grayscale_normalized' not in line_coords['features'] and
self.input_channels == 1):
line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_coords, feature_selector=self.features)
if ('binarized' not in line_coords['features'] and 'grayscale_normalized' not in line_coords['features'] and self.input_channels == 1):
# We cannot use a feature selector for this since we don't
# know whether the model expects (has been trained on)
# binarized or grayscale images; but raw images are likely
# always inadequate:
log.warning("Using raw image for line '%s' in region '%s'",
line.id, region.id)
log.warning("Using raw image for line '%s' in region '%s'", line.id, region.id)
line_image = line_image if all(line_image.size) else [[0]]
line_image_np = np.array(line_image, dtype=np.uint8)
line_images_np.append(line_image_np)
raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False)
for line, raw_results in zip(textlines, raw_results_all):
raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]
for i, p in enumerate(raw_results):
p.prediction.id = "fold_{}".format(i)
prediction = self.voter.vote_prediction_result(raw_results)
prediction.id = "voted"
line_text = prediction.sentence
line_conf = prediction.avg_char_probability
# Build line text on our own
#
# Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
# on prediction.positions. Do it on our own to have consistency.
#
# XXX Check Calamari's built-in post-processing on prediction.sentence
def _sort_chars(p):
"""Filter and sort chars of prediction p"""
chars = p.chars
chars = [c for c in chars if c.char] # XXX Note that omission probabilities are not normalized?!
chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']]
chars = sorted(chars, key=lambda k: k.probability, reverse=True)
return chars
def _drop_leading_spaces(positions):
return list(itertools.dropwhile(lambda p: _sort_chars(p)[0].char == " ", positions))
def _drop_trailing_spaces(positions):
return list(reversed(_drop_leading_spaces(reversed(positions))))
def _drop_double_spaces(positions):
def _drop_double_spaces_generator(positions):
last_was_space = False
for p in positions:
if p.chars[0].char == " ":
if not last_was_space:
yield p
last_was_space = True
else:
yield p
last_was_space = False
return list(_drop_double_spaces_generator(positions))
positions = prediction.positions
positions = _drop_leading_spaces(positions)
positions = _drop_trailing_spaces(positions)
positions = _drop_double_spaces(positions)
positions = list(positions)
line_text = ''.join(_sort_chars(p)[0].char for p in positions)
if line_text != prediction.sentence:
log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
line_text, prediction.sentence)
# Delete existing results
if line.get_TextEquiv():
log.warning("Line '%s' already contained text results", line.id)
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
line.set_TextEquiv([])
if line.get_Word():
log.warning("Line '%s' already contained word segmentation", line.id)
line.set_Word([])
# Save line results
line_conf = prediction.avg_char_probability
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
# Save word results
#
# Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
# and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
# hierarchy of lines > words > glyphs.
def _words(s):
"""Split words based on spaces and include spaces as 'words'"""
spaces = None
word = ''
for c in s:
if c == ' ' and spaces is True:
word += c
elif c != ' ' and spaces is False:
word += c
else:
if word:
yield word
word = c
spaces = (c == ' ')
yield word
if self.parameter['textequiv_level'] in ['word', 'glyph']:
word_no = 0
i = 0
for word_text in _words(line_text):
word_length = len(word_text)
if not all(c == ' ' for c in word_text):
word_positions = positions[i:i+word_length]
word_start = word_positions[0].global_start
word_end = word_positions[-1].global_end
polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height])
points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
# XXX Crop to line polygon?
word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points))
word.add_TextEquiv(TextEquivType(Unicode=word_text))
if self.parameter['textequiv_level'] == 'glyph':
for glyph_no, p in enumerate(word_positions):
glyph_start = p.global_start
glyph_end = p.global_end
polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height])
points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points))
# Add predictions (= TextEquivs)
char_index_start = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
for char_index, char in enumerate(_sort_chars(p), start=char_index_start):
glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))
word.add_Glyph(glyph)
line.add_Word(word)
word_no += 1
i += word_length
_page_update_higher_textequiv_levels('line', pcgts)
file_id = self._make_file_id(input_file, n)
# Add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name, value=self.parameter[name])
for name in self.parameter.keys()])]))
file_id = make_file_id(input_file, self.output_file_grp)
pcgts.set_pcGtsId(file_id)
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
@ -151,3 +291,5 @@ def _page_update_higher_textequiv_levels(level, pcgts):
else u'' for line in lines)
region.set_TextEquiv(
[TextEquivType(Unicode=region_unicode)]) # remove old
# vim:tw=120:

@ -1,6 +1,6 @@
numpy
tensorflow-gpu == 1.14.0
calamari-ocr == 0.3.5
h5py < 3 # XXX tensorflow 2.4.0rc3 requires h5py~=2.10.0, but you'll have h5py 3.1.0 which is incompatible.
tensorflow >= 2.3.0rc2
calamari-ocr == 1.0.*
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
click
ocrd >= 1.0.0b11
ocrd >= 2.22.0

@ -5,15 +5,15 @@ from setuptools import setup, find_packages
setup(
name='ocrd_calamari',
version='0.0.3',
version='1.0.1',
description='Calamari bindings',
long_description=Path('README.md').read_text(),
long_description_content_type='text/markdown',
author='Konstantin Baierer, Mike Gerber',
author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de',
url='https://github.com/kba/ocrd_calamari',
url='https://github.com/OCR-D/ocrd_calamari',
license='Apache License 2.0',
packages=find_packages(exclude=('tests', 'docs')),
packages=find_packages(exclude=('test', 'docs')),
install_requires=Path('requirements.txt').read_text().split('\n'),
package_data={
'': ['*.json', '*.yml', '*.yaml'],

@ -2,6 +2,8 @@ import os
import shutil
import subprocess
import urllib.request
from lxml import etree
from glob import glob
import pytest
import logging
@ -10,9 +12,14 @@ from ocrd.resolver import Resolver
from ocrd_calamari import CalamariRecognize
from .base import assets
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
# Because XML namespace versions are so much fun, we not only use one, we use TWO!
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
NSMAP_GT = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" }
@pytest.fixture
@ -32,10 +39,6 @@ def workspace():
"https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
return workspace
def test_recognize(workspace):
# The binarization options I have are:
#
# a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
@ -48,17 +51,49 @@ def test_recognize(workspace):
ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
subprocess.call(['convert', ff, '-threshold', '50%', ff])
# XXX Should remove GT text to really test this
# Remove GT Words and TextEquivs, to not accidentally check GT text instead of the OCR text
# XXX Review data again
# XXX Make this more robust against namespace version changes
for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
workspace.download_file(of)
for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
tree = etree.parse(ff)
for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
e.getparent().remove(e)
tree.write(ff, xml_declaration=True, encoding="utf-8")
return workspace
def test_recognize(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={'checkpoint': CHECKPOINT}
parameter={
"checkpoint": CHECKPOINT,
}
).process()
workspace.save_mets()
page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
with open(page1, "r", encoding="utf-8") as f:
assert "verſchuldeten" in f.read()
def test_recognize_with_checkpoint_dir(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint_dir": CHECKPOINT_DIR,
}
).process()
workspace.save_mets()
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
with open(page1, 'r', encoding='utf-8') as f:
assert 'verſchuldeten' in f.read()
@ -75,3 +110,61 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
assert len(interesting_log_messages) > 10 # For every line!
with open(page1, "r", encoding="utf-8") as f:
assert "verſchuldeten" in f.read()
def test_word_segmentation(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"textequiv_level": "word", # Note that we're going down to word level here
}
).process()
workspace.save_mets()
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
tree = etree.parse(page1)
# The result should contain a TextLine that contains the text "December"
line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0]
assert line
# The textline should a. contain multiple words and b. these should concatenate fine to produce the same line text
words = line.xpath(".//pc:Word", namespaces=NSMAP)
assert len(words) >= 2
words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words)
line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
assert words_text == line_text
# For extra measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word"
glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
assert len(glyphs) == 0
def test_glyphs(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"textequiv_level": "glyph", # Note that we're going down to glyph level here
}
).process()
workspace.save_mets()
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
tree = etree.parse(page1)
# The result should contain a lot of glyphs
glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
assert len(glyphs) >= 100
# vim:tw=120:

Loading…
Cancel
Save