Merge branch 'master' into image-features

fix/readme-no-checkpoint
Gerber, Mike 3 years ago
commit c0902cdef5

@ -7,15 +7,23 @@ jobs:
build-python36: build-python36:
docker: docker:
- image: ubuntu:18.04 - image: ubuntu:18.04
environment:
- PYTHONIOENCODING: utf-8
steps: steps:
- run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick locales
- run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
- checkout - checkout
- run: pip3 install --upgrade pip
- run: make install PIP_INSTALL="pip3 install" - run: make install PIP_INSTALL="pip3 install"
- run: pip3 install -r requirements-test.txt - run: pip3 install -r requirements-test.txt
- run: make coverage - run: make coverage LC_ALL=en_US.utf8
- codecov/upload - codecov/upload
workflows: workflows:
build: build:
jobs: jobs:
- build-python36 - build-python36:
filters:
branches:
ignore:
- screenshots

2
.gitignore vendored

@ -107,5 +107,7 @@ venv.bak/
/calamari /calamari
/calamari_models /calamari_models
/gt4histocr-calamari /gt4histocr-calamari
/actevedef_718448162*
/repo /repo
/test/assets /test/assets
gt4histocr-calamari*

@ -1,4 +1,4 @@
FROM ocrd/core:edge FROM ocrd/core
MAINTAINER OCR-D MAINTAINER OCR-D
ENV DEBIAN_FRONTEND noninteractive ENV DEBIAN_FRONTEND noninteractive
ENV PYTHONIOENCODING utf8 ENV PYTHONIOENCODING utf8
@ -10,10 +10,12 @@ COPY Makefile .
COPY setup.py . COPY setup.py .
COPY ocrd-tool.json . COPY ocrd-tool.json .
COPY requirements.txt . COPY requirements.txt .
COPY README.md .
COPY ocrd_calamari ocrd_calamari COPY ocrd_calamari ocrd_calamari
RUN make calamari/build RUN pip3 install --upgrade pip && \
RUN pip3 install . pip3 install . && \
pip3 check
ENTRYPOINT ["/usr/local/bin/ocrd-calamari-recognize"] ENTRYPOINT ["/usr/local/bin/ocrd-calamari-recognize"]

@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier same "printed page" as the copyright notice for easier
identification within third-party archives. identification within third-party archives.
Copyright [yyyy] [name of copyright owner] Copyright 2018-2020 Konstantin Baierer, Mike Gerber
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.

@ -1,3 +1,4 @@
export # export variables to subshells
PIP_INSTALL = pip3 install PIP_INSTALL = pip3 install
GIT_CLONE = git clone GIT_CLONE = git clone
PYTHON = python3 PYTHON = python3
@ -10,10 +11,8 @@ help:
@echo " Targets" @echo " Targets"
@echo "" @echo ""
@echo " install Install ocrd_calamari" @echo " install Install ocrd_calamari"
@echo " calamari Clone calamari repo" @echo " gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
@echo " calamari_models Clone calamari_models repo" @echo " actevedef_718448162 Download example data"
@echo " gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)"
@echo " calamari/build pip install calamari"
@echo " deps-test Install testing python deps via pip" @echo " deps-test Install testing python deps via pip"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets" @echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@echo " test/assets Setup test assets" @echo " test/assets Setup test assets"
@ -33,29 +32,21 @@ help:
install: install:
$(PIP_INSTALL) . $(PIP_INSTALL) .
# Clone calamari repo
calamari:
$(GIT_CLONE) https://github.com/chwick/calamari
# Clone calamari_models repo # Get GT4HistOCR Calamari model (from SBB)
calamari_models: gt4histocr-calamari1:
$(GIT_CLONE) -n https://github.com/chwick/calamari_models mkdir -p gt4histocr-calamari1
# Checkout latest version that works with calamari-ocr==0.3.5: cd gt4histocr-calamari1 && \
cd calamari_models && git checkout f76b1d3ec wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
gt4histocr-calamari:
mkdir gt4histocr-calamari
cd gt4histocr-calamari && \
wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz && \
tar xfv model.tar.xz && \ tar xfv model.tar.xz && \
rm model.tar.xz rm model.tar.xz
# Download example data
actevedef_718448162:
wget https://qurator-data.de/examples/actevedef_718448162.zip && \
unzip actevedef_718448162.zip
# pip install calamari
calamari/build: calamari calamari_models
cd calamari && $(PIP_INSTALL) .
# #
# Assets and Tests # Assets and Tests
@ -82,12 +73,12 @@ assets-clean:
rm -rf test/assets rm -rf test/assets
# Run unit tests # Run unit tests
test: test/assets gt4histocr-calamari test: test/assets gt4histocr-calamari1
# declare -p HTTP_PROXY # declare -p HTTP_PROXY
$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)
# Run unit tests and determine test coverage # Run unit tests and determine test coverage
coverage: test/assets calamari_models coverage: test/assets gt4histocr-calamari1
coverage erase coverage erase
make test PYTHON="coverage run" make test PYTHON="coverage run"
coverage report coverage report

@ -4,17 +4,20 @@ In a Python 3 virtualenv:
~~~ ~~~
pip install -e . pip install -e .
pip install -r requirements-test.txt
make test make test
~~~ ~~~
Release Releasing
------- ---------
* Update ocrd-tool.json version * Update `ocrd-tool.json` version
* Update setup.py version * Update `setup.py` version
* git commit -m 'v<version>' * `git commit -m 'v<version>'`
* git tag -m 'v<version>' 'v<version>' * `git tag -m 'v<version>' 'v<version>'`
* git push --tags * `git push --tags`
* Do a release on GitHub
PyPI: ### Uploading to PyPI
* python sdist bdist_wheel * `rm -rf dist/` or back it up if `dist/` exists already
* twine upload dist/ocrd_calamari-<version>* * In the virtualenv: `python setup.py sdist bdist_wheel`
* `twine upload dist/ocrd_calamari-<version>*`

@ -8,11 +8,22 @@
## Introduction ## Introduction
This offers an OCR-D compliant workspace processor for some of the functionality of Calamari OCR. **ocrd_calamari** offers an [OCR-D](https://ocr-d.de) compliant workspace processor for the functionality of Calamari OCR. It uses OCR-D workspaces (METS) with [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) documents as input and output.
This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized
image) as its input. image) as its input.
In addition to the line text it may also output word and glyph segmentation
including per-glyph confidence values and per-glyph alternative predictions as
provided by the Calamari OCR engine, using a `textequiv_level` of `word` or
`glyph`. Note that while Calamari does not provide word segmentation, this
processor produces word segmentation inferred from text
segmentation and the glyph positions. The provided glyph and word segmentation
can be used for text extraction and highlighting, but is probably not useful for
further image-based processing.
![Example output as viewed in PAGE Viewer](https://github.com/OCR-D/ocrd_calamari/raw/screenshots/output-in-page-viewer.jpg)
## Installation ## Installation
### From PyPI ### From PyPI
@ -29,32 +40,44 @@ pip install .
## Install models ## Install models
Download standard models: Download models trained on GT4HistOCR data:
``` ```
wget https://github.com/Calamari-OCR/calamari_models/archive/master.zip make gt4histocr-calamari1
unzip master.zip ls gt4histocr-calamari1
``` ```
Download models trained on GT4HistOCR data: Manual download: [model.tar.xz](https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz)
## Example Usage
Before using `ocrd-calamari-recognize`, get some example data and a model, and
prepare the document for OCR:
``` ```
make gt4histocr-calamari # Download model and example data
ls gt4histocr-calamari make gt4histocr-calamari1
make actevedef_718448162
# Create binarized images and line segmentation using other OCR-D projects
cd actevedef_718448162
ocrd-olena-binarize -P impl sauvola-ms-split -I OCR-D-IMG -O OCR-D-IMG-BIN
ocrd-tesserocr-segment-region -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION
ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
``` ```
## Example Usage Finally, recognize the text using ocrd_calamari and the downloaded model:
```
ocrd-calamari-recognize -P checkpoint "../gt4histocr-calamari1/*.ckpt.json" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
```
or
```
ocrd-calamari-recognize -P checkpoint_dir "../gt4histocr-calamari1" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
```
~~~
ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
~~~
With `test-parameters.json`: You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
~~~ for additional parameters and default values.
{
"checkpoint": "/path/to/some/trained/models/*.ckpt.json"
}
~~~
## Development & Testing ## Development & Testing
For information regarding development and testing, please see For information regarding development and testing, please see

@ -1,6 +1,6 @@
{ {
"git_url": "https://github.com/kba/ocrd_calamari", "git_url": "https://github.com/OCR-D/ocrd_calamari",
"version": "0.0.3", "version": "1.0.1",
"tools": { "tools": {
"ocrd-calamari-recognize": { "ocrd-calamari-recognize": {
"executable": "ocrd-calamari-recognize", "executable": "ocrd-calamari-recognize",
@ -18,6 +18,10 @@
"OCR-D-OCR-CALAMARI" "OCR-D-OCR-CALAMARI"
], ],
"parameters": { "parameters": {
"checkpoint_dir": {
"description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
"type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
},
"checkpoint": { "checkpoint": {
"description": "The calamari model files (*.ckpt.json)", "description": "The calamari model files (*.ckpt.json)",
"type": "string", "format": "file", "cacheable": true "type": "string", "format": "file", "cacheable": true
@ -25,6 +29,18 @@
"voter": { "voter": {
"description": "The voting algorithm to use", "description": "The voting algorithm to use",
"type": "string", "default": "confidence_voter_default_ctc" "type": "string", "default": "confidence_voter_default_ctc"
},
"textequiv_level": {
"type": "string",
"enum": ["line", "word", "glyph"],
"default": "line",
"description": "Deepest PAGE XML hierarchy level to include TextEquiv results for"
},
"glyph_conf_cutoff": {
"type": "number",
"format": "float",
"default": 0.001,
"description": "Only include glyph alternatives with confidences above this threshold"
} }
} }
} }

@ -1,33 +1,50 @@
from __future__ import absolute_import from __future__ import absolute_import
import os import os
import itertools
from glob import glob from glob import glob
import numpy as np import numpy as np
from calamari_ocr import __version__ as calamari_version
from calamari_ocr.ocr import MultiPredictor from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams from calamari_ocr.proto import VoterParams
from ocrd import Processor from ocrd import Processor
from ocrd_modelfactory import page_from_file from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import to_xml from ocrd_models.ocrd_page import (
from ocrd_models.ocrd_page_generateds import TextEquivType LabelType, LabelsType,
from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE MetadataItemType,
TextEquivType,
WordType, GlyphType, CoordsType,
to_xml
)
from ocrd_utils import (
getLogger, concat_padded,
coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1,
make_file_id, assert_file_grp_cardinality,
MIMETYPE_PAGE
)
from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL
log = getLogger('processor.CalamariRecognize') os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
from tensorflow import __version__ as tensorflow_version
TOOL = 'ocrd-calamari-recognize'
class CalamariRecognize(Processor): class CalamariRecognize(Processor):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize'] kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
kwargs['version'] = OCRD_TOOL['version'] kwargs['version'] = '%s (calamari %s, tensorflow %s)' % (OCRD_TOOL['version'], calamari_version, tensorflow_version)
super(CalamariRecognize, self).__init__(*args, **kwargs) super(CalamariRecognize, self).__init__(*args, **kwargs)
def _init_calamari(self): def _init_calamari(self):
os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
checkpoints = glob(self.parameter['checkpoint']) checkpoints = glob(self.parameter['checkpoint'])
self.predictor = MultiPredictor(checkpoints=checkpoints) self.predictor = MultiPredictor(checkpoints=checkpoints)
@ -43,16 +60,14 @@ class CalamariRecognize(Processor):
voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper()) voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
self.voter = voter_from_proto(voter_params) self.voter = voter_from_proto(voter_params)
def _make_file_id(self, input_file, n):
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
if file_id == input_file.ID:
file_id = concat_padded(self.output_file_grp, n)
return file_id
def process(self): def process(self):
""" """
Performs the recognition. Performs the recognition.
""" """
log = getLogger('processor.CalamariRecognize')
assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)
self._init_calamari() self._init_calamari()
@ -71,44 +86,169 @@ class CalamariRecognize(Processor):
textlines = region.get_TextLine() textlines = region.get_TextLine()
log.info("About to recognize %i lines of region '%s'", len(textlines), region.id) log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
line_images_np = []
for line in textlines: for line in textlines:
log.debug("Recognizing line '%s' in region '%s'", line.id, region.id) log.debug("Recognizing line '%s' in region '%s'", line.id, region.id)
line_image, line_coords = self.workspace.image_from_segment( line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_coords, feature_selector=self.features)
line, region_image, region_coords, feature_selector=self.features) if ('binarized' not in line_coords['features'] and 'grayscale_normalized' not in line_coords['features'] and self.input_channels == 1):
if ('binarized' not in line_coords['features'] and
'grayscale_normalized' not in line_coords['features'] and
self.input_channels == 1):
# We cannot use a feature selector for this since we don't # We cannot use a feature selector for this since we don't
# know whether the model expects (has been trained on) # know whether the model expects (has been trained on)
# binarized or grayscale images; but raw images are likely # binarized or grayscale images; but raw images are likely
# always inadequate: # always inadequate:
log.warning("Using raw image for line '%s' in region '%s'", log.warning("Using raw image for line '%s' in region '%s'", line.id, region.id)
line.id, region.id)
line_image = line_image if all(line_image.size) else [[0]]
line_image_np = np.array(line_image, dtype=np.uint8) line_image_np = np.array(line_image, dtype=np.uint8)
line_images_np.append(line_image_np)
raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False)
for line, raw_results in zip(textlines, raw_results_all):
raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]
for i, p in enumerate(raw_results): for i, p in enumerate(raw_results):
p.prediction.id = "fold_{}".format(i) p.prediction.id = "fold_{}".format(i)
prediction = self.voter.vote_prediction_result(raw_results) prediction = self.voter.vote_prediction_result(raw_results)
prediction.id = "voted" prediction.id = "voted"
line_text = prediction.sentence # Build line text on our own
line_conf = prediction.avg_char_probability #
# Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
# on prediction.positions. Do it on our own to have consistency.
#
# XXX Check Calamari's built-in post-processing on prediction.sentence
def _sort_chars(p):
"""Filter and sort chars of prediction p"""
chars = p.chars
chars = [c for c in chars if c.char] # XXX Note that omission probabilities are not normalized?!
chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']]
chars = sorted(chars, key=lambda k: k.probability, reverse=True)
return chars
def _drop_leading_spaces(positions):
return list(itertools.dropwhile(lambda p: _sort_chars(p)[0].char == " ", positions))
def _drop_trailing_spaces(positions):
return list(reversed(_drop_leading_spaces(reversed(positions))))
def _drop_double_spaces(positions):
def _drop_double_spaces_generator(positions):
last_was_space = False
for p in positions:
if p.chars[0].char == " ":
if not last_was_space:
yield p
last_was_space = True
else:
yield p
last_was_space = False
return list(_drop_double_spaces_generator(positions))
positions = prediction.positions
positions = _drop_leading_spaces(positions)
positions = _drop_trailing_spaces(positions)
positions = _drop_double_spaces(positions)
positions = list(positions)
line_text = ''.join(_sort_chars(p)[0].char for p in positions)
if line_text != prediction.sentence:
log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
line_text, prediction.sentence)
# Delete existing results
if line.get_TextEquiv(): if line.get_TextEquiv():
log.warning("Line '%s' already contained text results", line.id) log.warning("Line '%s' already contained text results", line.id)
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) line.set_TextEquiv([])
if line.get_Word(): if line.get_Word():
log.warning("Line '%s' already contained word segmentation", line.id) log.warning("Line '%s' already contained word segmentation", line.id)
line.set_Word([]) line.set_Word([])
# Save line results
line_conf = prediction.avg_char_probability
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
# Save word results
#
# Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
# and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
# hierarchy of lines > words > glyphs.
def _words(s):
"""Split words based on spaces and include spaces as 'words'"""
spaces = None
word = ''
for c in s:
if c == ' ' and spaces is True:
word += c
elif c != ' ' and spaces is False:
word += c
else:
if word:
yield word
word = c
spaces = (c == ' ')
yield word
if self.parameter['textequiv_level'] in ['word', 'glyph']:
word_no = 0
i = 0
for word_text in _words(line_text):
word_length = len(word_text)
if not all(c == ' ' for c in word_text):
word_positions = positions[i:i+word_length]
word_start = word_positions[0].global_start
word_end = word_positions[-1].global_end
polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height])
points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
# XXX Crop to line polygon?
word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points))
word.add_TextEquiv(TextEquivType(Unicode=word_text))
if self.parameter['textequiv_level'] == 'glyph':
for glyph_no, p in enumerate(word_positions):
glyph_start = p.global_start
glyph_end = p.global_end
polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height])
points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points))
# Add predictions (= TextEquivs)
char_index_start = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
for char_index, char in enumerate(_sort_chars(p), start=char_index_start):
glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))
word.add_Glyph(glyph)
line.add_Word(word)
word_no += 1
i += word_length
_page_update_higher_textequiv_levels('line', pcgts) _page_update_higher_textequiv_levels('line', pcgts)
file_id = self._make_file_id(input_file, n)
# Add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name, value=self.parameter[name])
for name in self.parameter.keys()])]))
file_id = make_file_id(input_file, self.output_file_grp)
pcgts.set_pcGtsId(file_id)
self.workspace.add_file( self.workspace.add_file(
ID=file_id, ID=file_id,
file_grp=self.output_file_grp, file_grp=self.output_file_grp,
@ -151,3 +291,5 @@ def _page_update_higher_textequiv_levels(level, pcgts):
else u'' for line in lines) else u'' for line in lines)
region.set_TextEquiv( region.set_TextEquiv(
[TextEquivType(Unicode=region_unicode)]) # remove old [TextEquivType(Unicode=region_unicode)]) # remove old
# vim:tw=120:

@ -1,6 +1,6 @@
numpy h5py < 3 # XXX tensorflow 2.4.0rc3 requires h5py~=2.10.0, but you'll have h5py 3.1.0 which is incompatible.
tensorflow-gpu == 1.14.0 tensorflow >= 2.3.0rc2
calamari-ocr == 0.3.5 calamari-ocr == 1.0.*
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
click click
ocrd >= 1.0.0b11 ocrd >= 2.22.0

@ -5,15 +5,15 @@ from setuptools import setup, find_packages
setup( setup(
name='ocrd_calamari', name='ocrd_calamari',
version='0.0.3', version='1.0.1',
description='Calamari bindings', description='Calamari bindings',
long_description=Path('README.md').read_text(), long_description=Path('README.md').read_text(),
long_description_content_type='text/markdown', long_description_content_type='text/markdown',
author='Konstantin Baierer, Mike Gerber', author='Konstantin Baierer, Mike Gerber',
author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de', author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de',
url='https://github.com/kba/ocrd_calamari', url='https://github.com/OCR-D/ocrd_calamari',
license='Apache License 2.0', license='Apache License 2.0',
packages=find_packages(exclude=('tests', 'docs')), packages=find_packages(exclude=('test', 'docs')),
install_requires=Path('requirements.txt').read_text().split('\n'), install_requires=Path('requirements.txt').read_text().split('\n'),
package_data={ package_data={
'': ['*.json', '*.yml', '*.yaml'], '': ['*.json', '*.yml', '*.yaml'],

@ -2,6 +2,8 @@ import os
import shutil import shutil
import subprocess import subprocess
import urllib.request import urllib.request
from lxml import etree
from glob import glob
import pytest import pytest
import logging import logging
@ -10,9 +12,14 @@ from ocrd.resolver import Resolver
from ocrd_calamari import CalamariRecognize from ocrd_calamari import CalamariRecognize
from .base import assets from .base import assets
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml') METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
WORKSPACE_DIR = '/tmp/test-ocrd-calamari' WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
# Because XML namespace versions are so much fun, we not only use one, we use TWO!
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
NSMAP_GT = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" }
@pytest.fixture @pytest.fixture
@ -32,10 +39,6 @@ def workspace():
"https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f, "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)) os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
return workspace
def test_recognize(workspace):
# The binarization options I have are: # The binarization options I have are:
# #
# a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf) # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
@ -48,17 +51,49 @@ def test_recognize(workspace):
ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f) ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
subprocess.call(['convert', ff, '-threshold', '50%', ff]) subprocess.call(['convert', ff, '-threshold', '50%', ff])
# XXX Should remove GT text to really test this # Remove GT Words and TextEquivs, to not accidentally check GT text instead of the OCR text
# XXX Review data again
# XXX Make this more robust against namespace version changes
for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
workspace.download_file(of)
for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
tree = etree.parse(ff)
for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
e.getparent().remove(e)
tree.write(ff, xml_declaration=True, encoding="utf-8")
return workspace
def test_recognize(workspace):
CalamariRecognize( CalamariRecognize(
workspace, workspace,
input_file_grp="OCR-D-GT-SEG-LINE", input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI", output_file_grp="OCR-D-OCR-CALAMARI",
parameter={'checkpoint': CHECKPOINT} parameter={
"checkpoint": CHECKPOINT,
}
).process() ).process()
workspace.save_mets() workspace.save_mets()
page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml') page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
with open(page1, "r", encoding="utf-8") as f:
assert "verſchuldeten" in f.read()
def test_recognize_with_checkpoint_dir(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint_dir": CHECKPOINT_DIR,
}
).process()
workspace.save_mets()
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1) assert os.path.exists(page1)
with open(page1, 'r', encoding='utf-8') as f: with open(page1, 'r', encoding='utf-8') as f:
assert 'verſchuldeten' in f.read() assert 'verſchuldeten' in f.read()
@ -75,3 +110,61 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]] interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
assert len(interesting_log_messages) > 10 # For every line! assert len(interesting_log_messages) > 10 # For every line!
with open(page1, "r", encoding="utf-8") as f:
assert "verſchuldeten" in f.read()
def test_word_segmentation(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"textequiv_level": "word", # Note that we're going down to word level here
}
).process()
workspace.save_mets()
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
tree = etree.parse(page1)
# The result should contain a TextLine that contains the text "December"
line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0]
assert line
# The textline should a. contain multiple words and b. these should concatenate fine to produce the same line text
words = line.xpath(".//pc:Word", namespaces=NSMAP)
assert len(words) >= 2
words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words)
line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
assert words_text == line_text
# For extra measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word"
glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
assert len(glyphs) == 0
def test_glyphs(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"textequiv_level": "glyph", # Note that we're going down to glyph level here
}
).process()
workspace.save_mets()
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
tree = etree.parse(page1)
# The result should contain a lot of glyphs
glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
assert len(glyphs) >= 100
# vim:tw=120:

Loading…
Cancel
Save