Mirror of https://github.com/mikegerber/ocrd_calamari.git (synced 2025-06-08 19:29:53 +02:00)

Commit c0902cdef5: Merge branch 'master' into image-features

12 changed files with 385 additions and 105 deletions
@@ -7,15 +7,23 @@ jobs:
   build-python36:
     docker:
       - image: ubuntu:18.04
+    environment:
+      - PYTHONIOENCODING: utf-8
     steps:
-      - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick
+      - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick locales
+      - run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
       - checkout
+      - run: pip3 install --upgrade pip
       - run: make install PIP_INSTALL="pip3 install"
       - run: pip3 install -r requirements-test.txt
-      - run: make coverage
+      - run: make coverage LC_ALL=en_US.utf8
       - codecov/upload

 workflows:
   build:
     jobs:
-      - build-python36
+      - build-python36:
+          filters:
+            branches:
+              ignore:
+                - screenshots
.gitignore (vendored): 2 changes

@@ -107,5 +107,7 @@ venv.bak/
 /calamari
 /calamari_models
 /gt4histocr-calamari
+/actevedef_718448162*
 /repo
 /test/assets
+gt4histocr-calamari*
@@ -1,4 +1,4 @@
-FROM ocrd/core:edge
+FROM ocrd/core
 MAINTAINER OCR-D
 ENV DEBIAN_FRONTEND noninteractive
 ENV PYTHONIOENCODING utf8
@@ -10,10 +10,12 @@ COPY Makefile .
 COPY setup.py .
 COPY ocrd-tool.json .
 COPY requirements.txt .
+COPY README.md .
 COPY ocrd_calamari ocrd_calamari

-RUN make calamari/build
-RUN pip3 install .
+RUN pip3 install --upgrade pip && \
+    pip3 install . && \
+    pip3 check

 ENTRYPOINT ["/usr/local/bin/ocrd-calamari-recognize"]
LICENSE: 2 changes

@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.

-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2018-2020 Konstantin Baierer, Mike Gerber

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
Makefile: 37 changes

@@ -1,3 +1,4 @@
+export  # export variables to subshells
 PIP_INSTALL = pip3 install
 GIT_CLONE = git clone
 PYTHON = python3
@@ -10,10 +11,8 @@ help:
 	@echo "  Targets"
 	@echo ""
 	@echo "  install              Install ocrd_calamari"
-	@echo "  calamari             Clone calamari repo"
-	@echo "  calamari_models      Clone calamari_models repo"
-	@echo "  gt4histocr-calamari  Get GT4HistOCR Calamari model (from SBB)"
-	@echo "  calamari/build       pip install calamari"
+	@echo "  gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
+	@echo "  actevedef_718448162  Download example data"
 	@echo "  deps-test            Install testing python deps via pip"
 	@echo "  repo/assets          Clone OCR-D/assets to ./repo/assets"
 	@echo "  test/assets          Setup test assets"
@@ -33,29 +32,21 @@ help:
 install:
 	$(PIP_INSTALL) .

-# Clone calamari repo
-calamari:
-	$(GIT_CLONE) https://github.com/chwick/calamari
-
-# Clone calamari_models repo
-calamari_models:
-	$(GIT_CLONE) -n https://github.com/chwick/calamari_models
-	# Checkout latest version that works with calamari-ocr==0.3.5:
-	cd calamari_models && git checkout f76b1d3ec
-
-gt4histocr-calamari:
-	mkdir gt4histocr-calamari
-	cd gt4histocr-calamari && \
-	wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz && \
+# Get GT4HistOCR Calamari model (from SBB)
+gt4histocr-calamari1:
+	mkdir -p gt4histocr-calamari1
+	cd gt4histocr-calamari1 && \
+	wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
 	tar xfv model.tar.xz && \
 	rm model.tar.xz

-# pip install calamari
-calamari/build: calamari calamari_models
-	cd calamari && $(PIP_INSTALL) .
+# Download example data
+actevedef_718448162:
+	wget https://qurator-data.de/examples/actevedef_718448162.zip && \
+	unzip actevedef_718448162.zip


 #
 # Assets and Tests
@@ -82,12 +73,12 @@ assets-clean:
 	rm -rf test/assets

 # Run unit tests
-test: test/assets gt4histocr-calamari
+test: test/assets gt4histocr-calamari1
 	# declare -p HTTP_PROXY
 	$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)

 # Run unit tests and determine test coverage
-coverage: test/assets calamari_models
+coverage: test/assets gt4histocr-calamari1
 	coverage erase
 	make test PYTHON="coverage run"
 	coverage report
@@ -4,17 +4,20 @@ In a Python 3 virtualenv:

 ~~~
 pip install -e .
+pip install -r requirements-test.txt
 make test
 ~~~

-Release
--------
-* Update ocrd-tool.json version
-* Update setup.py version
-* git commit -m 'v<version>'
-* git tag -m 'v<version>' 'v<version>'
-* git push --tags
+Releasing
+---------
+* Update `ocrd-tool.json` version
+* Update `setup.py` version
+* `git commit -m 'v<version>'`
+* `git tag -m 'v<version>' 'v<version>'`
+* `git push --tags`
+* Do a release on GitHub

-PyPI:
-* python sdist bdist_wheel
-* twine upload dist/ocrd_calamari-<version>*
+### Uploading to PyPI
+* `rm -rf dist/` or backup if `dist/` exists already
+* In the virtualenv: `python setup.py sdist bdist_wheel`
+* `twine upload dist/ocrd_calamari-<version>*`
README.md: 61 changes

@@ -8,11 +8,22 @@

 ## Introduction

-This offers a OCR-D compliant workspace processor for some of the functionality of Calamari OCR.
+**ocrd_calamari** offers a [OCR-D](https://ocr-d.de) compliant workspace processor for the functionality of Calamari OCR. It uses OCR-D workspaces (METS) with [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) documents as input and output.

 This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized
 image) as its input.

+In addition to the line text it may also output word and glyph segmentation
+including per-glyph confidence values and per-glyph alternative predictions as
+provided by the Calamari OCR engine, using a `textequiv_level` of `word` or
+`glyph`. Note that while Calamari does not provide word segmentation, this
+processor produces word segmentation inferred from text
+segmentation and the glyph positions. The provided glyph and word segmentation
+can be used for text extraction and highlighting, but is probably not useful for
+further image-based processing.
+
+
+
 ## Installation

 ### From PyPI
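To make the word and glyph output described above concrete: with `textequiv_level` set to `glyph`, the processor nests ranked glyph alternatives under inferred words. A minimal sketch of that PAGE element hierarchy, using the ocrd_models classes imported in the code below; identifiers, coordinates and confidences are made up for illustration:

```
from ocrd_models.ocrd_page import WordType, GlyphType, CoordsType, TextEquivType

# A word inferred from the line text and the glyph positions
word = WordType(id='line0001_word0000', Coords=CoordsType('0,0 40,0 40,20 0,20'))
word.add_TextEquiv(TextEquivType(Unicode='Dezember'))

# One glyph of that word, with ranked alternatives; index must start at 1
glyph = GlyphType(id='line0001_word0000_glyph0000', Coords=CoordsType('0,0 5,0 5,20 0,20'))
glyph.add_TextEquiv(TextEquivType(Unicode='D', index=1, conf=0.98))
glyph.add_TextEquiv(TextEquivType(Unicode='O', index=2, conf=0.01))
word.add_Glyph(glyph)
```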
@@ -29,32 +40,44 @@ pip install .

 ## Install models

-Download standard models:
-
-```
-wget https://github.com/Calamari-OCR/calamari_models/archive/master.zip
-unzip master.zip
-```
-
 Download models trained on GT4HistOCR data:

 ```
-make gt4histocr-calamari
-ls gt4histocr-calamari
+make gt4histocr-calamari1
+ls gt4histocr-calamari1
 ```

+Manual download: [model.tar.xz](https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz)
+
 ## Example Usage

-~~~
-ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
-~~~
-
-With `test-parameters.json`:
-~~~
-{
-    "checkpoint": "/path/to/some/trained/models/*.ckpt.json"
-}
-~~~
+Before using `ocrd-calamari-recognize` get some example data and model, and
+prepare the document for OCR:
+```
+# Download model and example data
+make gt4histocr-calamari1
+make actevedef_718448162
+
+# Create binarized images and line segmentation using other OCR-D projects
+cd actevedef_718448162
+ocrd-olena-binarize -P impl sauvola-ms-split -I OCR-D-IMG -O OCR-D-IMG-BIN
+ocrd-tesserocr-segment-region -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION
+ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
+```
+
+Finally recognize the text using ocrd_calamari and the downloaded model:
+```
+ocrd-calamari-recognize -P checkpoint "../gt4histocr-calamari1/*.ckpt.json" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
+```
+
+or
+
+```
+ocrd-calamari-recognize -P checkpoint_dir "../gt4histocr-calamari1" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
+```
+
+You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
+for additional parameters and default values.

 ## Development & Testing
 For information regarding development and testing, please see
@@ -1,6 +1,6 @@
 {
-  "git_url": "https://github.com/kba/ocrd_calamari",
-  "version": "0.0.3",
+  "git_url": "https://github.com/OCR-D/ocrd_calamari",
+  "version": "1.0.1",
   "tools": {
     "ocrd-calamari-recognize": {
       "executable": "ocrd-calamari-recognize",
@@ -18,6 +18,10 @@
         "OCR-D-OCR-CALAMARI"
       ],
       "parameters": {
+        "checkpoint_dir": {
+          "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
+          "type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
+        },
         "checkpoint": {
           "description": "The calamari model files (*.ckpt.json)",
           "type": "string", "format": "file", "cacheable": true
@@ -25,6 +29,18 @@
         "voter": {
           "description": "The voting algorithm to use",
           "type": "string", "default": "confidence_voter_default_ctc"
+        },
+        "textequiv_level": {
+          "type": "string",
+          "enum": ["line", "word", "glyph"],
+          "default": "line",
+          "description": "Deepest PAGE XML hierarchy level to include TextEquiv results for"
+        },
+        "glyph_conf_cutoff": {
+          "type": "number",
+          "format": "float",
+          "default": 0.001,
+          "description": "Only include glyph alternatives with confidences above this threshold"
         }
       }
     }
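The new `checkpoint_dir`, `textequiv_level` and `glyph_conf_cutoff` parameters can also be passed when driving the processor from Python, in the same style as the test suite further down; a minimal sketch, where the workspace path and model location are illustrative rather than taken from this commit:

```
from ocrd.resolver import Resolver
from ocrd_calamari import CalamariRecognize

# Illustrative workspace; any OCR-D workspace with line segmentation works.
workspace = Resolver().workspace_from_url('actevedef_718448162/mets.xml')

CalamariRecognize(
    workspace,
    input_file_grp='OCR-D-SEG-LINE',
    output_file_grp='OCR-D-OCR-CALAMARI',
    parameter={
        'checkpoint': 'gt4histocr-calamari1/*.ckpt.json',  # glob of model files
        'textequiv_level': 'glyph',      # also emit Word and Glyph elements
        'glyph_conf_cutoff': 0.001,      # drop improbable glyph alternatives
    },
).process()
workspace.save_mets()
```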
@@ -1,33 +1,50 @@
 from __future__ import absolute_import

 import os
+import itertools
 from glob import glob

 import numpy as np
+from calamari_ocr import __version__ as calamari_version
 from calamari_ocr.ocr import MultiPredictor
 from calamari_ocr.ocr.voting import voter_from_proto
 from calamari_ocr.proto import VoterParams
 from ocrd import Processor
 from ocrd_modelfactory import page_from_file
-from ocrd_models.ocrd_page import to_xml
-from ocrd_models.ocrd_page_generateds import TextEquivType
-from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE
+from ocrd_models.ocrd_page import (
+    LabelType, LabelsType,
+    MetadataItemType,
+    TextEquivType,
+    WordType, GlyphType, CoordsType,
+    to_xml
+)
+from ocrd_utils import (
+    getLogger, concat_padded,
+    coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1,
+    make_file_id, assert_file_grp_cardinality,
+    MIMETYPE_PAGE
+)

 from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL

-log = getLogger('processor.CalamariRecognize')
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
+from tensorflow import __version__ as tensorflow_version
+
+TOOL = 'ocrd-calamari-recognize'


 class CalamariRecognize(Processor):

     def __init__(self, *args, **kwargs):
-        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize']
-        kwargs['version'] = OCRD_TOOL['version']
+        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
+        kwargs['version'] = '%s (calamari %s, tensorflow %s)' % (OCRD_TOOL['version'], calamari_version, tensorflow_version)
         super(CalamariRecognize, self).__init__(*args, **kwargs)

     def _init_calamari(self):
-        os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
-
+        if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
+            resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
+            self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
         checkpoints = glob(self.parameter['checkpoint'])
         self.predictor = MultiPredictor(checkpoints=checkpoints)
@@ -43,16 +60,14 @@ class CalamariRecognize(Processor):
         voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
         self.voter = voter_from_proto(voter_params)

-    def _make_file_id(self, input_file, n):
-        file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-        if file_id == input_file.ID:
-            file_id = concat_padded(self.output_file_grp, n)
-        return file_id
-
     def process(self):
         """
         Performs the recognition.
         """
+        log = getLogger('processor.CalamariRecognize')
+
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)

         self._init_calamari()

@@ -71,44 +86,169 @@ class CalamariRecognize(Processor):

                 textlines = region.get_TextLine()
                 log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
+                line_images_np = []
                 for line in textlines:
                     log.debug("Recognizing line '%s' in region '%s'", line.id, region.id)

-                    line_image, line_coords = self.workspace.image_from_segment(
-                        line, region_image, region_coords, feature_selector=self.features)
-                    if ('binarized' not in line_coords['features'] and
-                        'grayscale_normalized' not in line_coords['features'] and
-                        self.input_channels == 1):
+                    line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_coords, feature_selector=self.features)
+                    if ('binarized' not in line_coords['features'] and 'grayscale_normalized' not in line_coords['features'] and self.input_channels == 1):
                         # We cannot use a feature selector for this since we don't
                         # know whether the model expects (has been trained on)
                         # binarized or grayscale images; but raw images are likely
                         # always inadequate:
-                        log.warning("Using raw image for line '%s' in region '%s'",
-                                    line.id, region.id)
+                        log.warning("Using raw image for line '%s' in region '%s'", line.id, region.id)

+                    line_image = line_image if all(line_image.size) else [[0]]
                     line_image_np = np.array(line_image, dtype=np.uint8)
+                    line_images_np.append(line_image_np)
+                raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False)
+
+                for line, raw_results in zip(textlines, raw_results_all):

-                    raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]
                     for i, p in enumerate(raw_results):
                         p.prediction.id = "fold_{}".format(i)

                     prediction = self.voter.vote_prediction_result(raw_results)
                     prediction.id = "voted"

-                    line_text = prediction.sentence
-                    line_conf = prediction.avg_char_probability
+                    # Build line text on our own
+                    #
+                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
+                    # on prediction.positions. Do it on our own to have consistency.
+                    #
+                    # XXX Check Calamari's built-in post-processing on prediction.sentence
+
+                    def _sort_chars(p):
+                        """Filter and sort chars of prediction p"""
+                        chars = p.chars
+                        chars = [c for c in chars if c.char]  # XXX Note that omission probabilities are not normalized?!
+                        chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']]
+                        chars = sorted(chars, key=lambda k: k.probability, reverse=True)
+                        return chars
+
+                    def _drop_leading_spaces(positions):
+                        return list(itertools.dropwhile(lambda p: _sort_chars(p)[0].char == " ", positions))
+
+                    def _drop_trailing_spaces(positions):
+                        return list(reversed(_drop_leading_spaces(reversed(positions))))
+
+                    def _drop_double_spaces(positions):
+                        def _drop_double_spaces_generator(positions):
+                            last_was_space = False
+                            for p in positions:
+                                if p.chars[0].char == " ":
+                                    if not last_was_space:
+                                        yield p
+                                    last_was_space = True
+                                else:
+                                    yield p
+                                    last_was_space = False
+                        return list(_drop_double_spaces_generator(positions))
+
+                    positions = prediction.positions
+                    positions = _drop_leading_spaces(positions)
+                    positions = _drop_trailing_spaces(positions)
+                    positions = _drop_double_spaces(positions)
+                    positions = list(positions)
+
+                    line_text = ''.join(_sort_chars(p)[0].char for p in positions)
+                    if line_text != prediction.sentence:
+                        log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
+                                    line_text, prediction.sentence)

+                    # Delete existing results
                     if line.get_TextEquiv():
                         log.warning("Line '%s' already contained text results", line.id)
-                    line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
+                    line.set_TextEquiv([])

                     if line.get_Word():
                         log.warning("Line '%s' already contained word segmentation", line.id)
                     line.set_Word([])

+                    # Save line results
+                    line_conf = prediction.avg_char_probability
+                    line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
+
+                    # Save word results
+                    #
+                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
+                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
+                    # hierarchy of lines > words > glyphs.
+
+                    def _words(s):
+                        """Split words based on spaces and include spaces as 'words'"""
+                        spaces = None
+                        word = ''
+                        for c in s:
+                            if c == ' ' and spaces is True:
+                                word += c
+                            elif c != ' ' and spaces is False:
+                                word += c
+                            else:
+                                if word:
+                                    yield word
+                                word = c
+                                spaces = (c == ' ')
+                        yield word
+
+                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
+                        word_no = 0
+                        i = 0
+
+                        for word_text in _words(line_text):
+                            word_length = len(word_text)
+                            if not all(c == ' ' for c in word_text):
+                                word_positions = positions[i:i+word_length]
+                                word_start = word_positions[0].global_start
+                                word_end = word_positions[-1].global_end
+
+                                polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height])
+                                points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
+                                # XXX Crop to line polygon?
+
+                                word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points))
+                                word.add_TextEquiv(TextEquivType(Unicode=word_text))
+
+                                if self.parameter['textequiv_level'] == 'glyph':
+                                    for glyph_no, p in enumerate(word_positions):
+                                        glyph_start = p.global_start
+                                        glyph_end = p.global_end
+
+                                        polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height])
+                                        points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
+
+                                        glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points))
+
+                                        # Add predictions (= TextEquivs)
+                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
+                                        for char_index, char in enumerate(_sort_chars(p), start=char_index_start):
+                                            glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))
+
+                                        word.add_Glyph(glyph)
+
+                                line.add_Word(word)
+                                word_no += 1
+
+                            i += word_length
+
             _page_update_higher_textequiv_levels('line', pcgts)

-            file_id = self._make_file_id(input_file, n)
+            # Add metadata about this operation and its runtime parameters:
+            metadata = pcgts.get_Metadata()  # ensured by from_file()
+            metadata.add_MetadataItem(
+                MetadataItemType(type_="processingStep",
+                                 name=self.ocrd_tool['steps'][0],
+                                 value=TOOL,
+                                 Labels=[LabelsType(
+                                     externalModel="ocrd-tool",
+                                     externalId="parameters",
+                                     Label=[LabelType(type_=name, value=self.parameter[name])
+                                            for name in self.parameter.keys()])]))
+
+            file_id = make_file_id(input_file, self.output_file_grp)
+            pcgts.set_pcGtsId(file_id)
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
@@ -151,3 +291,5 @@ def _page_update_higher_textequiv_levels(level, pcgts):
                               else u'' for line in lines)
         region.set_TextEquiv(
             [TextEquivType(Unicode=region_unicode)])  # remove old
+
+# vim:tw=120:
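The word segmentation added above hinges on the `_words` helper splitting the line text so that the lengths of the pieces, words and space runs alike, add up to the number of glyph positions, which is what allows `positions[i:i+word_length]` to line up. A standalone sketch of that splitting behaviour; the sample string is illustrative:

```
def words(s):
    """Split on spaces, but keep the space runs so piece lengths sum to len(s)."""
    spaces = None
    word = ''
    for c in s:
        if c == ' ' and spaces is True:
            word += c
        elif c != ' ' and spaces is False:
            word += c
        else:
            if word:
                yield word
            word = c
            spaces = (c == ' ')
    yield word

line_text = "Was ist Aufklärung?"
pieces = list(words(line_text))
print(pieces)                                    # ['Was', ' ', 'ist', ' ', 'Aufklärung?']
assert sum(len(p) for p in pieces) == len(line_text)  # so positions can be sliced by length
```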
@@ -1,6 +1,6 @@
-numpy
-tensorflow-gpu == 1.14.0
-calamari-ocr == 0.3.5
+h5py < 3  # XXX tensorflow 2.4.0rc3 requires h5py~=2.10.0, but you'll have h5py 3.1.0 which is incompatible.
+tensorflow >= 2.3.0rc2
+calamari-ocr == 1.0.*
 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
 click
-ocrd >= 1.0.0b11
+ocrd >= 2.22.0
setup.py: 6 changes

@@ -5,15 +5,15 @@ from setuptools import setup, find_packages

 setup(
     name='ocrd_calamari',
-    version='0.0.3',
+    version='1.0.1',
     description='Calamari bindings',
     long_description=Path('README.md').read_text(),
     long_description_content_type='text/markdown',
     author='Konstantin Baierer, Mike Gerber',
     author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de',
-    url='https://github.com/kba/ocrd_calamari',
+    url='https://github.com/OCR-D/ocrd_calamari',
     license='Apache License 2.0',
-    packages=find_packages(exclude=('tests', 'docs')),
+    packages=find_packages(exclude=('test', 'docs')),
     install_requires=Path('requirements.txt').read_text().split('\n'),
     package_data={
         '': ['*.json', '*.yml', '*.yaml'],
@@ -2,6 +2,8 @@ import os
 import shutil
 import subprocess
 import urllib.request
+from lxml import etree
+from glob import glob

 import pytest
 import logging
@@ -10,9 +12,14 @@ from ocrd.resolver import Resolver
 from ocrd_calamari import CalamariRecognize
 from .base import assets

-METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
-CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
+METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
 WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
+CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
+CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
+
+# Because XML namespace versions are so much fun, we not only use one, we use TWO!
+NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
+NSMAP_GT = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" }


 @pytest.fixture
@@ -32,10 +39,6 @@ def workspace():
             "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
             os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))

-    return workspace
-
-
-def test_recognize(workspace):
     # The binarization options I have are:
     #
     # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
@@ -48,17 +51,49 @@ def test_recognize(workspace):
         ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
         subprocess.call(['convert', ff, '-threshold', '50%', ff])

-    # XXX Should remove GT text to really test this
+    # Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text
+    # XXX Review data again
+    # XXX Make this more robust against namespace version changes
+    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
+        workspace.download_file(of)
+    for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
+        for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
+            tree = etree.parse(ff)
+            for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
+                e.getparent().remove(e)
+            tree.write(ff, xml_declaration=True, encoding="utf-8")

+    return workspace
+
+
+def test_recognize(workspace):
     CalamariRecognize(
         workspace,
         input_file_grp="OCR-D-GT-SEG-LINE",
         output_file_grp="OCR-D-OCR-CALAMARI",
-        parameter={'checkpoint': CHECKPOINT}
+        parameter={
+            "checkpoint": CHECKPOINT,
+        }
     ).process()
     workspace.save_mets()

-    page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
+    assert os.path.exists(page1)
+    with open(page1, "r", encoding="utf-8") as f:
+        assert "verſchuldeten" in f.read()
+
+
+def test_recognize_with_checkpoint_dir(workspace):
+    CalamariRecognize(
+        workspace,
+        input_file_grp="OCR-D-GT-SEG-LINE",
+        output_file_grp="OCR-D-OCR-CALAMARI",
+        parameter={
+            "checkpoint_dir": CHECKPOINT_DIR,
+        }
+    ).process()
+    workspace.save_mets()
+
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
     assert os.path.exists(page1)
     with open(page1, 'r', encoding='utf-8') as f:
         assert 'verſchuldeten' in f.read()
@@ -75,3 +110,61 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works

     interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
     assert len(interesting_log_messages) > 10  # For every line!
+    with open(page1, "r", encoding="utf-8") as f:
+        assert "verſchuldeten" in f.read()
+
+
+def test_word_segmentation(workspace):
+    CalamariRecognize(
+        workspace,
+        input_file_grp="OCR-D-GT-SEG-LINE",
+        output_file_grp="OCR-D-OCR-CALAMARI",
+        parameter={
+            "checkpoint": CHECKPOINT,
+            "textequiv_level": "word",  # Note that we're going down to word level here
+        }
+    ).process()
+    workspace.save_mets()
+
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
+    assert os.path.exists(page1)
+    tree = etree.parse(page1)
+
+    # The result should contain a TextLine that contains the text "December"
+    line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0]
+    assert line
+
+    # The textline should a. contain multiple words and b. these should concatenate fine to produce the same line text
+    words = line.xpath(".//pc:Word", namespaces=NSMAP)
+    assert len(words) >= 2
+    words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words)
+    line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
+    assert words_text == line_text
+
+    # For extra measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word"
+    glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
+    assert len(glyphs) == 0
+
+
+def test_glyphs(workspace):
+    CalamariRecognize(
+        workspace,
+        input_file_grp="OCR-D-GT-SEG-LINE",
+        output_file_grp="OCR-D-OCR-CALAMARI",
+        parameter={
+            "checkpoint": CHECKPOINT,
+            "textequiv_level": "glyph",  # Note that we're going down to glyph level here
+        }
+    ).process()
+    workspace.save_mets()
+
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
+    assert os.path.exists(page1)
+    tree = etree.parse(page1)
+
+    # The result should contain a lot of glyphs
+    glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
+    assert len(glyphs) >= 100
+
+
+# vim:tw=120: