Mirror of https://github.com/mikegerber/ocrd_calamari.git

Commit c0902cdef5: Merge branch 'master' into image-features

12 changed files with 385 additions and 105 deletions
.circleci/config.yml

@@ -7,15 +7,23 @@ jobs:
build-python36:
docker:
- image: ubuntu:18.04
environment:
- PYTHONIOENCODING: utf-8
steps:
- run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick
- run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick locales
- run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
- checkout
- run: pip3 install --upgrade pip
- run: make install PIP_INSTALL="pip3 install"
- run: pip3 install -r requirements-test.txt
- run: make coverage
- run: make coverage LC_ALL=en_US.utf8
- codecov/upload

workflows:
build:
jobs:
- build-python36
- build-python36:
filters:
branches:
ignore:
- screenshots
.gitignore (vendored, 2 changes)

@@ -107,5 +107,7 @@ venv.bak/
/calamari
/calamari_models
/gt4histocr-calamari
/actevedef_718448162*
/repo
/test/assets
gt4histocr-calamari*
Dockerfile

@@ -1,4 +1,4 @@
FROM ocrd/core:edge
FROM ocrd/core
MAINTAINER OCR-D
ENV DEBIAN_FRONTEND noninteractive
ENV PYTHONIOENCODING utf8

@@ -10,10 +10,12 @@ COPY Makefile .
COPY setup.py .
COPY ocrd-tool.json .
COPY requirements.txt .
COPY README.md .
COPY ocrd_calamari ocrd_calamari

RUN make calamari/build
RUN pip3 install .
RUN pip3 install --upgrade pip && \
pip3 install . && \
pip3 check

ENTRYPOINT ["/usr/local/bin/ocrd-calamari-recognize"]
LICENSE (2 changes)

@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright 2018-2020 Konstantin Baierer, Mike Gerber

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Makefile (37 changes)

@@ -1,3 +1,4 @@
export # export variables to subshells
PIP_INSTALL = pip3 install
GIT_CLONE = git clone
PYTHON = python3

@@ -10,10 +11,8 @@ help:
@echo "  Targets"
@echo ""
@echo "  install  Install ocrd_calamari"
@echo "  calamari  Clone calamari repo"
@echo "  calamari_models  Clone calamari_models repo"
@echo "  gt4histocr-calamari  Get GT4HistOCR Calamari model (from SBB)"
@echo "  calamari/build  pip install calamari"
@echo "  gt4histocr-calamari1  Get GT4HistOCR Calamari model (from SBB)"
@echo "  actevedef_718448162  Download example data"
@echo "  deps-test  Install testing python deps via pip"
@echo "  repo/assets  Clone OCR-D/assets to ./repo/assets"
@echo "  test/assets  Setup test assets"

@@ -33,29 +32,21 @@ help:
install:
$(PIP_INSTALL) .

# Clone calamari repo
calamari:
$(GIT_CLONE) https://github.com/chwick/calamari

# Clone calamari_models repo
calamari_models:
$(GIT_CLONE) -n https://github.com/chwick/calamari_models
# Checkout latest version that works with calamari-ocr==0.3.5:
cd calamari_models && git checkout f76b1d3ec

gt4histocr-calamari:
mkdir gt4histocr-calamari
cd gt4histocr-calamari && \
wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz && \
# Get GT4HistOCR Calamari model (from SBB)
gt4histocr-calamari1:
mkdir -p gt4histocr-calamari1
cd gt4histocr-calamari1 && \
wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
tar xfv model.tar.xz && \
rm model.tar.xz

# Download example data
actevedef_718448162:
wget https://qurator-data.de/examples/actevedef_718448162.zip && \
unzip actevedef_718448162.zip

# pip install calamari
calamari/build: calamari calamari_models
cd calamari && $(PIP_INSTALL) .

#
# Assets and Tests

@@ -82,12 +73,12 @@ assets-clean:
rm -rf test/assets

# Run unit tests
test: test/assets gt4histocr-calamari
test: test/assets gt4histocr-calamari1
# declare -p HTTP_PROXY
$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)

# Run unit tests and determine test coverage
coverage: test/assets calamari_models
coverage: test/assets gt4histocr-calamari1
coverage erase
make test PYTHON="coverage run"
coverage report
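For readers who prefer Python over make, the new gt4histocr-calamari1 target boils down to a download-and-unpack step. A rough equivalent, as a sketch that reuses only the URL from the Makefile above:

```python
# Rough Python equivalent of the gt4histocr-calamari1 target above (illustrative sketch).
import tarfile
import urllib.request
from pathlib import Path

MODEL_URL = "https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz"

target = Path("gt4histocr-calamari1")
target.mkdir(exist_ok=True)                     # mkdir -p gt4histocr-calamari1
archive = target / "model.tar.xz"
urllib.request.urlretrieve(MODEL_URL, archive)  # wget .../model.tar.xz
with tarfile.open(archive, "r:xz") as tar:      # tar xfv model.tar.xz
    tar.extractall(target)
archive.unlink()                                # rm model.tar.xz
```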
README-DEV.md

@@ -4,17 +4,20 @@ In a Python 3 virtualenv:

~~~
pip install -e .
pip install -r requirements-test.txt
make test
~~~

Release
-------
* Update ocrd-tool.json version
* Update setup.py version
* git commit -m 'v<version>'
* git tag -m 'v<version>' 'v<version>'
* git push --tags
Releasing
---------
* Update `ocrd-tool.json` version
* Update `setup.py` version
* `git commit -m 'v<version>'`
* `git tag -m 'v<version>' 'v<version>'`
* `git push --tags`
* Do a release on GitHub

PyPI:
* python sdist bdist_wheel
* twine upload dist/ocrd_calamari-<version>*
### Uploading to PyPI
* `rm -rf dist/` or backup if `dist/` exists already
* In the virtualenv: `python setup.py sdist bdist_wheel`
* `twine upload dist/ocrd_calamari-<version>*`
README.md (61 changes)

@@ -8,11 +8,22 @@

## Introduction

This offers a OCR-D compliant workspace processor for some of the functionality of Calamari OCR.
**ocrd_calamari** offers a [OCR-D](https://ocr-d.de) compliant workspace processor for the functionality of Calamari OCR. It uses OCR-D workspaces (METS) with [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) documents as input and output.

This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized
image) as its input.

In addition to the line text it may also output word and glyph segmentation
including per-glyph confidence values and per-glyph alternative predictions as
provided by the Calamari OCR engine, using a `textequiv_level` of `word` or
`glyph`. Note that while Calamari does not provide word segmentation, this
processor produces word segmentation inferred from text
segmentation and the glyph positions. The provided glyph and word segmentation
can be used for text extraction and highlighting, but is probably not useful for
further image-based processing.



## Installation

### From PyPI

@@ -29,32 +40,44 @@ pip install .

## Install models

Download standard models:

```
wget https://github.com/Calamari-OCR/calamari_models/archive/master.zip
unzip master.zip
```

Download models trained on GT4HistOCR data:

```
make gt4histocr-calamari
ls gt4histocr-calamari
make gt4histocr-calamari1
ls gt4histocr-calamari1
```

Manual download: [model.tar.xz](https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz)

## Example Usage
Before using `ocrd-calamari-recognize` get some example data and model, and
prepare the document for OCR:
```
# Download model and example data
make gt4histocr-calamari1
make actevedef_718448162

~~~
ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
~~~
# Create binarized images and line segmentation using other OCR-D projects
cd actevedef_718448162
ocrd-olena-binarize -P impl sauvola-ms-split -I OCR-D-IMG -O OCR-D-IMG-BIN
ocrd-tesserocr-segment-region -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION
ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
```

With `test-parameters.json`:
~~~
{
"checkpoint": "/path/to/some/trained/models/*.ckpt.json"
}
~~~
Finally recognize the text using ocrd_calamari and the downloaded model:
```
ocrd-calamari-recognize -P checkpoint "../gt4histocr-calamari1/*.ckpt.json" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
```

or

```
ocrd-calamari-recognize -P checkpoint_dir "../gt4histocr-calamari1" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
```

You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
for additional parameters and default values.

## Development & Testing
For information regarding development and testing, please see
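The CLI calls in the README above also have a programmatic counterpart; the project's tests further down in this commit drive the processor directly from Python. A minimal sketch along those lines, assuming a workspace prepared as in the README (the mets.xml path is illustrative):

```python
# Programmatic counterpart of the CLI example above (sketch; paths are illustrative).
from ocrd.resolver import Resolver
from ocrd_calamari import CalamariRecognize

workspace = Resolver().workspace_from_url("actevedef_718448162/mets.xml")
CalamariRecognize(
    workspace,
    input_file_grp="OCR-D-SEG-LINE",
    output_file_grp="OCR-D-OCR-CALAMARI",
    parameter={"checkpoint_dir": "../gt4histocr-calamari1"},
).process()
workspace.save_mets()
```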
ocrd-tool.json

@@ -1,6 +1,6 @@
{
"git_url": "https://github.com/kba/ocrd_calamari",
"version": "0.0.3",
"git_url": "https://github.com/OCR-D/ocrd_calamari",
"version": "1.0.1",
"tools": {
"ocrd-calamari-recognize": {
"executable": "ocrd-calamari-recognize",

@@ -18,6 +18,10 @@
"OCR-D-OCR-CALAMARI"
],
"parameters": {
"checkpoint_dir": {
"description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
"type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
},
"checkpoint": {
"description": "The calamari model files (*.ckpt.json)",
"type": "string", "format": "file", "cacheable": true

@@ -25,6 +29,18 @@
"voter": {
"description": "The voting algorithm to use",
"type": "string", "default": "confidence_voter_default_ctc"
},
"textequiv_level": {
"type": "string",
"enum": ["line", "word", "glyph"],
"default": "line",
"description": "Deepest PAGE XML hierarchy level to include TextEquiv results for"
},
"glyph_conf_cutoff": {
"type": "number",
"format": "float",
"default": 0.001,
"description": "Only include glyph alternatives with confidences above this threshold"
}
}
}
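`checkpoint_dir` and `checkpoint` are alternatives: if only `checkpoint_dir` is given, the processor expands it to a `*.ckpt.json` glob (see `_init_calamari` in the next file). A condensed sketch of that resolution, with an illustrative directory value:

```python
# Condensed sketch of the checkpoint resolution done in _init_calamari (next file).
from glob import glob

parameter = {"checkpoint_dir": "gt4histocr-calamari1"}  # illustrative value
if not parameter.get("checkpoint") and parameter.get("checkpoint_dir"):
    parameter["checkpoint"] = "%s/*.ckpt.json" % parameter["checkpoint_dir"]

checkpoints = glob(parameter["checkpoint"])  # every matching model feeds one MultiPredictor
```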
ocrd_calamari/recognize.py

@@ -1,33 +1,50 @@
from __future__ import absolute_import

import os
import itertools
from glob import glob

import numpy as np
from calamari_ocr import __version__ as calamari_version
from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import to_xml
from ocrd_models.ocrd_page_generateds import TextEquivType
from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE
from ocrd_models.ocrd_page import (
LabelType, LabelsType,
MetadataItemType,
TextEquivType,
WordType, GlyphType, CoordsType,
to_xml
)
from ocrd_utils import (
getLogger, concat_padded,
coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1,
make_file_id, assert_file_grp_cardinality,
MIMETYPE_PAGE
)

from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL

log = getLogger('processor.CalamariRecognize')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
from tensorflow import __version__ as tensorflow_version

TOOL = 'ocrd-calamari-recognize'

class CalamariRecognize(Processor):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize']
kwargs['version'] = OCRD_TOOL['version']
kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
kwargs['version'] = '%s (calamari %s, tensorflow %s)' % (OCRD_TOOL['version'], calamari_version, tensorflow_version)
super(CalamariRecognize, self).__init__(*args, **kwargs)

def _init_calamari(self):
os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL

if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
checkpoints = glob(self.parameter['checkpoint'])
self.predictor = MultiPredictor(checkpoints=checkpoints)

@@ -43,16 +60,14 @@ class CalamariRecognize(Processor):
voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
self.voter = voter_from_proto(voter_params)

def _make_file_id(self, input_file, n):
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
if file_id == input_file.ID:
file_id = concat_padded(self.output_file_grp, n)
return file_id

def process(self):
"""
Performs the recognition.
"""
log = getLogger('processor.CalamariRecognize')

assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)

self._init_calamari()

@@ -71,44 +86,169 @@ class CalamariRecognize(Processor):
textlines = region.get_TextLine()
log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
line_images_np = []
for line in textlines:
log.debug("Recognizing line '%s' in region '%s'", line.id, region.id)

line_image, line_coords = self.workspace.image_from_segment(
line, region_image, region_coords, feature_selector=self.features)
if ('binarized' not in line_coords['features'] and
'grayscale_normalized' not in line_coords['features'] and
self.input_channels == 1):
line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_coords, feature_selector=self.features)
if ('binarized' not in line_coords['features'] and 'grayscale_normalized' not in line_coords['features'] and self.input_channels == 1):
# We cannot use a feature selector for this since we don't
# know whether the model expects (has been trained on)
# binarized or grayscale images; but raw images are likely
# always inadequate:
log.warning("Using raw image for line '%s' in region '%s'",
line.id, region.id)

line_image_np = np.array(line_image, dtype=np.uint8)
log.warning("Using raw image for line '%s' in region '%s'", line.id, region.id)

line_image = line_image if all(line_image.size) else [[0]]
line_image_np = np.array(line_image, dtype=np.uint8)
line_images_np.append(line_image_np)
raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False)

for line, raw_results in zip(textlines, raw_results_all):

raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]
for i, p in enumerate(raw_results):
p.prediction.id = "fold_{}".format(i)

prediction = self.voter.vote_prediction_result(raw_results)
prediction.id = "voted"

line_text = prediction.sentence
line_conf = prediction.avg_char_probability
# Build line text on our own
#
# Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
# on prediction.positions. Do it on our own to have consistency.
#
# XXX Check Calamari's built-in post-processing on prediction.sentence

def _sort_chars(p):
"""Filter and sort chars of prediction p"""
chars = p.chars
chars = [c for c in chars if c.char]  # XXX Note that omission probabilities are not normalized?!
chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']]
chars = sorted(chars, key=lambda k: k.probability, reverse=True)
return chars
def _drop_leading_spaces(positions):
return list(itertools.dropwhile(lambda p: _sort_chars(p)[0].char == " ", positions))
def _drop_trailing_spaces(positions):
return list(reversed(_drop_leading_spaces(reversed(positions))))
def _drop_double_spaces(positions):
def _drop_double_spaces_generator(positions):
last_was_space = False
for p in positions:
if p.chars[0].char == " ":
if not last_was_space:
yield p
last_was_space = True
else:
yield p
last_was_space = False
return list(_drop_double_spaces_generator(positions))
positions = prediction.positions
positions = _drop_leading_spaces(positions)
positions = _drop_trailing_spaces(positions)
positions = _drop_double_spaces(positions)
positions = list(positions)

line_text = ''.join(_sort_chars(p)[0].char for p in positions)
if line_text != prediction.sentence:
log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
line_text, prediction.sentence)

# Delete existing results
if line.get_TextEquiv():
log.warning("Line '%s' already contained text results", line.id)
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

line.set_TextEquiv([])
if line.get_Word():
log.warning("Line '%s' already contained word segmentation", line.id)
line.set_Word([])

# Save line results
line_conf = prediction.avg_char_probability
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

# Save word results
#
# Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
# and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
# hierarchy of lines > words > glyphs.

def _words(s):
"""Split words based on spaces and include spaces as 'words'"""
spaces = None
word = ''
for c in s:
if c == ' ' and spaces is True:
word += c
elif c != ' ' and spaces is False:
word += c
else:
if word:
yield word
word = c
spaces = (c == ' ')
yield word

if self.parameter['textequiv_level'] in ['word', 'glyph']:
word_no = 0
i = 0

for word_text in _words(line_text):
word_length = len(word_text)
if not all(c == ' ' for c in word_text):
word_positions = positions[i:i+word_length]
word_start = word_positions[0].global_start
word_end = word_positions[-1].global_end

polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height])
points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
# XXX Crop to line polygon?

word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points))
word.add_TextEquiv(TextEquivType(Unicode=word_text))

if self.parameter['textequiv_level'] == 'glyph':
for glyph_no, p in enumerate(word_positions):
glyph_start = p.global_start
glyph_end = p.global_end

polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height])
points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))

glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points))

# Add predictions (= TextEquivs)
char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
for char_index, char in enumerate(_sort_chars(p), start=char_index_start):
glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))

word.add_Glyph(glyph)

line.add_Word(word)
word_no += 1

i += word_length

_page_update_higher_textequiv_levels('line', pcgts)

file_id = self._make_file_id(input_file, n)

# Add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata()  # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name, value=self.parameter[name])
for name in self.parameter.keys()])]))

file_id = make_file_id(input_file, self.output_file_grp)
pcgts.set_pcGtsId(file_id)
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,

@@ -151,3 +291,5 @@ def _page_update_higher_textequiv_levels(level, pcgts):
else u'' for line in lines)
region.set_TextEquiv(
[TextEquivType(Unicode=region_unicode)])  # remove old

# vim:tw=120:
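Word segmentation in the hunk above hinges on the `_words` helper: it splits the voted line text on spaces but yields the space runs as separate tokens, so token lengths can be used to slice the per-glyph `positions` list. A standalone illustration of that behaviour (the sample string is made up):

```python
# Standalone illustration of the _words helper from the hunk above.
def _words(s):
    """Split words based on spaces and include spaces as 'words'"""
    spaces = None
    word = ''
    for c in s:
        if c == ' ' and spaces is True:
            word += c
        elif c != ' ' and spaces is False:
            word += c
        else:
            if word:
                yield word
            word = c
            spaces = (c == ' ')
    yield word

# Space runs come out as their own tokens, so len(token) can index into `positions`:
assert list(_words("Im Jahr  1784")) == ["Im", " ", "Jahr", "  ", "1784"]
```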
requirements.txt

@@ -1,6 +1,6 @@
numpy
tensorflow-gpu == 1.14.0
calamari-ocr == 0.3.5
h5py < 3  # XXX tensorflow 2.4.0rc3 requires h5py~=2.10.0, but you'll have h5py 3.1.0 which is incompatible.
tensorflow >= 2.3.0rc2
calamari-ocr == 1.0.*
setuptools >= 41.0.0  # tensorboard depends on this, but why do we get an error at runtime?
click
ocrd >= 1.0.0b11
ocrd >= 2.22.0
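The `h5py < 3` pin works around the TensorFlow incompatibility noted in the comment above. A quick sanity check of the resolved stack, as a sketch that only uses version attributes already imported elsewhere in this commit:

```python
# Quick sanity check that the pinned stack above resolved consistently (sketch).
import h5py
from calamari_ocr import __version__ as calamari_version
from tensorflow import __version__ as tensorflow_version

assert h5py.__version__ < "3", "h5py 3.x is incompatible with this tensorflow release"
print("tensorflow", tensorflow_version, "| calamari", calamari_version, "| h5py", h5py.__version__)
```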
setup.py (6 changes)

@@ -5,15 +5,15 @@ from setuptools import setup, find_packages

setup(
name='ocrd_calamari',
version='0.0.3',
version='1.0.1',
description='Calamari bindings',
long_description=Path('README.md').read_text(),
long_description_content_type='text/markdown',
author='Konstantin Baierer, Mike Gerber',
author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de',
url='https://github.com/kba/ocrd_calamari',
url='https://github.com/OCR-D/ocrd_calamari',
license='Apache License 2.0',
packages=find_packages(exclude=('tests', 'docs')),
packages=find_packages(exclude=('test', 'docs')),
install_requires=Path('requirements.txt').read_text().split('\n'),
package_data={
'': ['*.json', '*.yml', '*.yaml'],
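The `package_data` entry is what ships `ocrd-tool.json` inside the installed package so that `ocrd_calamari.config` can expose it as `OCRD_TOOL` at runtime. A sketch of such a lookup; the exact mechanism in `config.py` is not shown in this diff, so the details here are assumptions:

```python
# Sketch of loading the packaged ocrd-tool.json at runtime
# (ocrd_calamari.config does something equivalent; details here are assumptions).
import json
from pathlib import Path

import ocrd_calamari

tool_json = Path(ocrd_calamari.__file__).parent / "ocrd-tool.json"
OCRD_TOOL = json.loads(tool_json.read_text())
print(OCRD_TOOL["version"], list(OCRD_TOOL["tools"]))  # e.g. 1.0.1 ['ocrd-calamari-recognize']
```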
test/test_recognize.py

@@ -2,6 +2,8 @@ import os
import shutil
import subprocess
import urllib.request
from lxml import etree
from glob import glob

import pytest
import logging

@@ -10,9 +12,14 @@ from ocrd.resolver import Resolver
from ocrd_calamari import CalamariRecognize
from .base import assets

METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')

# Because XML namespace versions are so much fun, we not only use one, we use TWO!
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
NSMAP_GT = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" }

@pytest.fixture

@@ -32,10 +39,6 @@ def workspace():
"https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))

return workspace

def test_recognize(workspace):
# The binarization options I have are:
#
# a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)

@@ -48,17 +51,49 @@ def test_recognize(workspace):
ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
subprocess.call(['convert', ff, '-threshold', '50%', ff])

# XXX Should remove GT text to really test this
# Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text
# XXX Review data again
# XXX Make this more robust against namespace version changes
for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
workspace.download_file(of)
for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
tree = etree.parse(ff)
for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
e.getparent().remove(e)
tree.write(ff, xml_declaration=True, encoding="utf-8")

return workspace

def test_recognize(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={'checkpoint': CHECKPOINT}
parameter={
"checkpoint": CHECKPOINT,
}
).process()
workspace.save_mets()

page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
with open(page1, "r", encoding="utf-8") as f:
assert "verſchuldeten" in f.read()

def test_recognize_with_checkpoint_dir(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint_dir": CHECKPOINT_DIR,
}
).process()
workspace.save_mets()

page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
with open(page1, 'r', encoding='utf-8') as f:
assert 'verſchuldeten' in f.read()

@@ -75,3 +110,61 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works

interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
assert len(interesting_log_messages) > 10  # For every line!
with open(page1, "r", encoding="utf-8") as f:
assert "verſchuldeten" in f.read()

def test_word_segmentation(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"textequiv_level": "word",  # Note that we're going down to word level here
}
).process()
workspace.save_mets()

page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
tree = etree.parse(page1)

# The result should contain a TextLine that contains the text "December"
line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0]
assert line

# The textline should a. contain multiple words and b. these should concatenate fine to produce the same line text
words = line.xpath(".//pc:Word", namespaces=NSMAP)
assert len(words) >= 2
words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words)
line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
assert words_text == line_text

# For extra measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word"
glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
assert len(glyphs) == 0

def test_glyphs(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"textequiv_level": "glyph",  # Note that we're going down to glyph level here
}
).process()
workspace.save_mets()

page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
tree = etree.parse(page1)

# The result should contain a lot of glyphs
glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
assert len(glyphs) >= 100

# vim:tw=120:
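The two namespace maps at the top of the test module matter because the ground truth uses the 2013 PAGE namespace while the processor's output is queried with the 2019 one; an XPath query against the wrong map silently matches nothing. A small illustration (the file path is only an example):

```python
# Small illustration of the two PAGE namespaces used by the tests above
# (the file path is only an example).
from lxml import etree

NSMAP = {"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"}     # OCR output
NSMAP_GT = {"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}  # ground truth

tree = etree.parse("OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
# Output written by ocrd-calamari-recognize lives in the 2019 namespace ...
assert tree.xpath("//pc:TextLine", namespaces=NSMAP)
# ... so the same query with the 2013 (GT) namespace finds nothing.
assert not tree.xpath("//pc:TextLine", namespaces=NSMAP_GT)
```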