This commit is contained in:
Konstantin Baierer 2025-11-07 11:47:31 +00:00 committed by GitHub
commit 34761d3ab5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
49 changed files with 3153 additions and 2029 deletions

View file

@ -24,61 +24,63 @@ jobs:
sudo rm -rf "$AGENT_TOOLSDIRECTORY" sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h df -h
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- uses: actions/cache/restore@v4
id: seg_model_cache # - name: Lint with ruff
# uses: astral-sh/ruff-action@v3
# with:
# src: "./src"
- name: Try to restore models_eynollah
uses: actions/cache/restore@v4
id: all_model_cache
with: with:
path: models_layout_v0_5_0 path: models_eynollah
key: seg-models key: models_eynollah
- uses: actions/cache/restore@v4
id: ocr_model_cache
with:
path: models_ocr_v0_5_1
key: ocr-models
- uses: actions/cache/restore@v4
id: bin_model_cache
with:
path: default-2021-03-09
key: bin-models
- name: Download models - name: Download models
if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true' || steps.ocr_model_cache.outputs.cache-hit != true if: steps.all_model_cache.outputs.cache-hit != 'true'
run: make models run: |
make models
ls -la models_eynollah
- uses: actions/cache/save@v4 - uses: actions/cache/save@v4
if: steps.seg_model_cache.outputs.cache-hit != 'true' if: steps.all_model_cache.outputs.cache-hit != 'true'
with: with:
path: models_layout_v0_5_0 path: models_eynollah
key: seg-models key: models_eynollah
- uses: actions/cache/save@v4
if: steps.ocr_model_cache.outputs.cache-hit != 'true'
with:
path: models_ocr_v0_5_1
key: ocr-models
- uses: actions/cache/save@v4
if: steps.bin_model_cache.outputs.cache-hit != 'true'
with:
path: default-2021-03-09
key: bin-models
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
# - uses: actions/cache@v4
# with:
# path: |
# path/to/dependencies
# some/other/dependencies
# key: ${{ runner.os }}-${{ hashFiles('**/lockfiles') }}
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
make install-dev EXTRAS=OCR,plotting make install-dev EXTRAS=OCR,plotting
make deps-test EXTRAS=OCR,plotting make deps-test EXTRAS=OCR,plotting
ls -l models_*
- name: Lint with ruff - name: Hard-upgrade torch for debugging
uses: astral-sh/ruff-action@v3 run: |
with: python -m pip install --upgrade torch
src: "./src"
- name: Test with pytest - name: Test with pytest
run: make coverage PYTEST_ARGS="-vv --junitxml=pytest.xml" run: make coverage PYTEST_ARGS="-vv --junitxml=pytest.xml"
- name: Get coverage results - name: Get coverage results
run: | run: |
coverage report --format=markdown >> $GITHUB_STEP_SUMMARY coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
coverage html coverage html
coverage json coverage json
coverage xml coverage xml
- name: Store coverage results - name: Store coverage results
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
@ -88,12 +90,15 @@ jobs:
pytest.xml pytest.xml
coverage.xml coverage.xml
coverage.json coverage.json
- name: Upload coverage results - name: Upload coverage results
uses: codecov/codecov-action@v4 uses: codecov/codecov-action@v4
with: with:
files: coverage.xml files: coverage.xml
fail_ci_if_error: false fail_ci_if_error: false
- name: Test standalone CLI - name: Test standalone CLI
run: make smoke-test run: make smoke-test
- name: Test OCR-D CLI - name: Test OCR-D CLI
run: make ocrd-test run: make ocrd-test

1
.gitignore vendored
View file

@ -11,3 +11,4 @@ output.html
*.tif *.tif
*.sw? *.sw?
TAGS TAGS
uv.lock

View file

@ -6,23 +6,17 @@ EXTRAS ?=
DOCKER_BASE_IMAGE ?= docker.io/ocrd/core-cuda-tf2:latest DOCKER_BASE_IMAGE ?= docker.io/ocrd/core-cuda-tf2:latest
DOCKER_TAG ?= ocrd/eynollah DOCKER_TAG ?= ocrd/eynollah
DOCKER ?= docker DOCKER ?= docker
WGET = wget -O
#SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz #SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz
#SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz #SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz
# SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz # SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz
SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1 #SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1
SEG_MODELFILE = $(notdir $(patsubst %?download=1,%,$(SEG_MODEL))) EYNOLLAH_MODELS_URL := https://zenodo.org/records/17417471/files/models_all_v0_7_0.zip
SEG_MODELNAME = $(SEG_MODELFILE:%.tar.gz=%) EYNOLLAH_MODELS_ZIP = $(notdir $(EYNOLLAH_MODELS_URL))
EYNOLLAH_MODELS_DIR = $(EYNOLLAH_MODELS_ZIP:%.zip=%)
BIN_MODEL := https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip
BIN_MODELFILE = $(notdir $(BIN_MODEL))
BIN_MODELNAME := default-2021-03-09
OCR_MODEL := https://zenodo.org/records/17236998/files/models_ocr_v0_5_1.tar.gz?download=1
OCR_MODELFILE = $(notdir $(patsubst %?download=1,%,$(OCR_MODEL)))
OCR_MODELNAME = $(OCR_MODELFILE:%.tar.gz=%)
PYTEST_ARGS ?= -vv --isolate PYTEST_ARGS ?= -vv --isolate
@ -38,7 +32,7 @@ help:
@echo " install-dev Install editable with pip" @echo " install-dev Install editable with pip"
@echo " deps-test Install test dependencies with pip" @echo " deps-test Install test dependencies with pip"
@echo " models Download and extract models to $(CURDIR):" @echo " models Download and extract models to $(CURDIR):"
@echo " $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)" @echo " $(EYNOLLAH_MODELS_DIR)"
@echo " smoke-test Run simple CLI check" @echo " smoke-test Run simple CLI check"
@echo " ocrd-test Run OCR-D CLI check" @echo " ocrd-test Run OCR-D CLI check"
@echo " test Run unit tests" @echo " test Run unit tests"
@ -47,34 +41,22 @@ help:
@echo " EXTRAS comma-separated list of features (like 'OCR,plotting') for 'install' [$(EXTRAS)]" @echo " EXTRAS comma-separated list of features (like 'OCR,plotting') for 'install' [$(EXTRAS)]"
@echo " DOCKER_TAG Docker image tag for 'docker' [$(DOCKER_TAG)]" @echo " DOCKER_TAG Docker image tag for 'docker' [$(DOCKER_TAG)]"
@echo " PYTEST_ARGS pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. [$(PYTEST_ARGS)]" @echo " PYTEST_ARGS pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. [$(PYTEST_ARGS)]"
@echo " SEG_MODEL URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]" @echo " ALL_MODELS URL of archive of all models [$(ALL_MODELS)]"
@echo " SEG_MODEL URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]" @echo " EYNOLLAH_MODELS_URL URL of archive of all models [$(EYNOLLAH_MODELS_URL)]"
@echo " BIN_MODEL URL of 'models' archive to download for binarization 'test' [$(BIN_MODEL)]"
@echo " OCR_MODEL URL of 'models' archive to download for OCR 'test' [$(OCR_MODEL)]"
@echo "" @echo ""
# END-EVAL # END-EVAL
# Download and extract models to $(PWD)/$(EYNOLLAH_MODELS_DIR)
# Download and extract models to $(PWD)/models_layout_v0_5_0 models: $(EYNOLLAH_MODELS_DIR)
models: $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)
# do not download these files if we already have the directories # do not download these files if we already have the directories
.INTERMEDIATE: $(BIN_MODELFILE) $(SEG_MODELFILE) $(OCR_MODELFILE) .INTERMEDIATE: $(EYNOLLAH_MODELS_ZIP)
$(BIN_MODELFILE): $(EYNOLLAH_MODELS_ZIP):
wget -O $@ $(BIN_MODEL) $(WGET) $@ $(EYNOLLAH_MODELS_URL)
$(SEG_MODELFILE):
wget -O $@ $(SEG_MODEL)
$(OCR_MODELFILE):
wget -O $@ $(OCR_MODEL)
$(BIN_MODELNAME): $(BIN_MODELFILE) $(EYNOLLAH_MODELS_DIR): $(EYNOLLAH_MODELS_ZIP)
mkdir $@ unzip $<
unzip -d $@ $<
$(SEG_MODELNAME): $(SEG_MODELFILE)
tar zxf $<
$(OCR_MODELNAME): $(OCR_MODELFILE)
tar zxf $<
build: build:
$(PIP) install build $(PIP) install build
@ -88,34 +70,28 @@ install:
install-dev: install-dev:
$(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)]) $(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)])
ifeq (OCR,$(findstring OCR, $(EXTRAS))) deps-test:
deps-test: $(OCR_MODELNAME)
endif
deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME)
$(PIP) install -r requirements-test.txt $(PIP) install -r requirements-test.txt
ifeq (OCR,$(findstring OCR, $(EXTRAS)))
ln -rs $(OCR_MODELNAME)/* $(SEG_MODELNAME)/
endif
smoke-test: TMPDIR != mktemp -d smoke-test: TMPDIR != mktemp -d
smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif
# layout analysis: # layout analysis:
eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/$(SEG_MODELNAME) eynollah -m $(CURDIR)/models_eynollah layout -i $< -o $(TMPDIR)
fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(<F)).xml fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(<F)).xml
fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$(basename $(<F)).xml fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$(basename $(<F)).xml
# layout, directory mode (skip one, add one): # layout, directory mode (skip one, add one):
eynollah layout -di $(<D) -o $(TMPDIR) -m $(CURDIR)/$(SEG_MODELNAME) eynollah -m $(CURDIR)/models_eynollah layout -di $(<D) -o $(TMPDIR)
test -s $(TMPDIR)/euler_rechenkunst01_1738_0025.xml test -s $(TMPDIR)/euler_rechenkunst01_1738_0025.xml
# mbreorder, directory mode (overwrite): # mbreorder, directory mode (overwrite):
eynollah machine-based-reading-order -di $(<D) -o $(TMPDIR) -m $(CURDIR)/$(SEG_MODELNAME) eynollah -m $(CURDIR)/models_eynollah machine-based-reading-order -di $(<D) -o $(TMPDIR)
fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(<F)).xml fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(<F)).xml
fgrep -c -e RegionRefIndexed $(TMPDIR)/$(basename $(<F)).xml fgrep -c -e RegionRefIndexed $(TMPDIR)/$(basename $(<F)).xml
# binarize: # binarize:
eynollah binarization -m $(CURDIR)/$(BIN_MODELNAME) -i $< -o $(TMPDIR)/$(<F) eynollah -m $(CURDIR)/models_eynollah/eynollah-binarization_20210425 binarization -i $< -o $(TMPDIR)/$(<F)
test -s $(TMPDIR)/$(<F) test -s $(TMPDIR)/$(<F)
@set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))" @set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))"
# enhance: # enhance:
eynollah enhancement -m $(CURDIR)/$(SEG_MODELNAME) -sos -i $< -o $(TMPDIR) -O eynollah -m $(CURDIR)/models_eynollah enhancement -sos -i $< -o $(TMPDIR) -O
test -s $(TMPDIR)/$(<F) test -s $(TMPDIR)/$(<F)
@set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))" @set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))"
$(RM) -r $(TMPDIR) $(RM) -r $(TMPDIR)
@ -126,18 +102,16 @@ ocrd-test: tests/resources/kant_aufklaerung_1784_0020.tif
cp $< $(TMPDIR) cp $< $(TMPDIR)
ocrd workspace -d $(TMPDIR) init ocrd workspace -d $(TMPDIR) init
ocrd workspace -d $(TMPDIR) add -G OCR-D-IMG -g PHYS_0020 -i OCR-D-IMG_0020 $(<F) ocrd workspace -d $(TMPDIR) add -G OCR-D-IMG -g PHYS_0020 -i OCR-D-IMG_0020 $(<F)
ocrd-eynollah-segment -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-SEG -P models $(CURDIR)/$(SEG_MODELNAME) ocrd-eynollah-segment -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-SEG -P models $(CURDIR)/models_eynollah
result=$$(ocrd workspace -d $(TMPDIR) find -G OCR-D-SEG); \ result=$$(ocrd workspace -d $(TMPDIR) find -G OCR-D-SEG); \
fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$$result && \ fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$$result && \
fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$$result fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$$result
ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-BIN -P model $(CURDIR)/$(BIN_MODELNAME) ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-BIN -P model $(CURDIR)/models_eynollah/eynollah-binarization_20210425
ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-SEG -O OCR-D-SEG-BIN -P model $(CURDIR)/$(BIN_MODELNAME) -P operation_level region ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-SEG -O OCR-D-SEG-BIN -P model $(CURDIR)/models_eynollah/eynollah-binarization_20210425 -P operation_level region
$(RM) -r $(TMPDIR) $(RM) -r $(TMPDIR)
# Run unit tests # Run unit tests
test: export MODELS_LAYOUT=$(CURDIR)/$(SEG_MODELNAME) test: export EYNOLLAH_MODELS_DIR := $(CURDIR)
test: export MODELS_OCR=$(CURDIR)/$(OCR_MODELNAME)
test: export MODELS_BIN=$(CURDIR)/$(BIN_MODELNAME)
test: test:
$(PYTHON) -m pytest tests --durations=0 --continue-on-collection-errors $(PYTEST_ARGS) $(PYTHON) -m pytest tests --durations=0 --continue-on-collection-errors $(PYTEST_ARGS)

159
README.md
View file

@ -2,6 +2,7 @@
> Document Layout Analysis, Binarization and OCR with Deep Learning and Heuristics > Document Layout Analysis, Binarization and OCR with Deep Learning and Heuristics
[![Python Versions](https://img.shields.io/pypi/pyversions/eynollah.svg)](https://pypi.python.org/pypi/eynollah)
[![PyPI Version](https://img.shields.io/pypi/v/eynollah)](https://pypi.org/project/eynollah/) [![PyPI Version](https://img.shields.io/pypi/v/eynollah)](https://pypi.org/project/eynollah/)
[![GH Actions Test](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml/badge.svg)](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml) [![GH Actions Test](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml/badge.svg)](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml)
[![GH Actions Deploy](https://github.com/qurator-spk/eynollah/actions/workflows/build-docker.yml/badge.svg)](https://github.com/qurator-spk/eynollah/actions/workflows/build-docker.yml) [![GH Actions Deploy](https://github.com/qurator-spk/eynollah/actions/workflows/build-docker.yml/badge.svg)](https://github.com/qurator-spk/eynollah/actions/workflows/build-docker.yml)
@ -11,24 +12,22 @@
![](https://user-images.githubusercontent.com/952378/102350683-8a74db80-3fa5-11eb-8c7e-f743f7d6eae2.jpg) ![](https://user-images.githubusercontent.com/952378/102350683-8a74db80-3fa5-11eb-8c7e-f743f7d6eae2.jpg)
## Features ## Features
* Support for 10 distinct segmentation classes: * Document layout analysis using pixelwise segmentation models with support for 10 segmentation classes:
* background, [page border](https://ocr-d.de/en/gt-guidelines/trans/lyRand.html), [text region](https://ocr-d.de/en/gt-guidelines/trans/lytextregion.html#textregionen__textregion_), [text line](https://ocr-d.de/en/gt-guidelines/pagexml/pagecontent_xsd_Complex_Type_pc_TextLineType.html), [header](https://ocr-d.de/en/gt-guidelines/trans/lyUeberschrift.html), [image](https://ocr-d.de/en/gt-guidelines/trans/lyBildbereiche.html), [separator](https://ocr-d.de/en/gt-guidelines/trans/lySeparatoren.html), [marginalia](https://ocr-d.de/en/gt-guidelines/trans/lyMarginalie.html), [initial](https://ocr-d.de/en/gt-guidelines/trans/lyInitiale.html), [table](https://ocr-d.de/en/gt-guidelines/trans/lyTabellen.html) * background, [page border](https://ocr-d.de/en/gt-guidelines/trans/lyRand.html), [text region](https://ocr-d.de/en/gt-guidelines/trans/lytextregion.html#textregionen__textregion_), [text line](https://ocr-d.de/en/gt-guidelines/pagexml/pagecontent_xsd_Complex_Type_pc_TextLineType.html), [header](https://ocr-d.de/en/gt-guidelines/trans/lyUeberschrift.html), [image](https://ocr-d.de/en/gt-guidelines/trans/lyBildbereiche.html), [separator](https://ocr-d.de/en/gt-guidelines/trans/lySeparatoren.html), [marginalia](https://ocr-d.de/en/gt-guidelines/trans/lyMarginalie.html), [initial](https://ocr-d.de/en/gt-guidelines/trans/lyInitiale.html), [table](https://ocr-d.de/en/gt-guidelines/trans/lyTabellen.html)
* Support for various image optimization operations:
* cropping (border detection), binarization, deskewing, dewarping, scaling, enhancing, resizing
* Textline segmentation to bounding boxes or polygons (contours) including for curved lines and vertical text * Textline segmentation to bounding boxes or polygons (contours) including for curved lines and vertical text
* Text recognition (OCR) using either CNN-RNN or Transformer models * Document image binarization with pixelwise segmentation or hybrid CNN-Transformer models
* Detection of reading order (left-to-right or right-to-left) using either heuristics or trainable models * Text recognition (OCR) with CNN-RNN or TrOCR models
* Detection of reading order (left-to-right or right-to-left) using heuristics or trainable models
* Output in [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML) * Output in [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML)
* [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface * [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface
:warning: Development is focused on achieving the best quality of results for a wide variety of historical :warning: Development is focused on achieving the best quality of results for a wide variety of historical
documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome. documents using a combination of multiple deep learning models and heuristics; therefore processing can be slow.
## Installation ## Installation
Python `3.8-3.11` with Tensorflow `<2.13` on Linux are currently supported. Python `3.8-3.11` with Tensorflow `<2.13` on Linux are currently supported.
For (limited) GPU support the CUDA toolkit needs to be installed.
For (limited) GPU support the CUDA toolkit needs to be installed. A known working config is CUDA `11` with cuDNN `8.6`. A working config is CUDA `11.8` with cuDNN `8.6`.
You can either install from PyPI You can either install from PyPI
@ -53,31 +52,41 @@ pip install "eynollah[OCR]"
make install EXTRAS=OCR make install EXTRAS=OCR
``` ```
### Docker
Use
```
docker pull ghcr.io/qurator-spk/eynollah:latest
```
When using Eynollah with Docker, see [`docker.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/docker.md).
## Models ## Models
Pretrained models can be downloaded from [zenodo](https://zenodo.org/records/17194824) or [huggingface](https://huggingface.co/SBB?search_models=eynollah). Pretrained models can be downloaded from [Zenodo](https://zenodo.org/records/17194824) or [Hugging Face](https://huggingface.co/SBB?search_models=eynollah).
For documentation on models, have a look at [`models.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/models.md). For model documentation and model cards, see [`models.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/models.md).
Model cards are also provided for our trained models.
## Training ## Training
In case you want to train your own model with Eynollah, see the To train your own model with Eynollah, see [`train.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/train.md) and use the tools in the [`train`](https://github.com/qurator-spk/eynollah/tree/main/train) folder.
documentation in [`train.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/train.md) and use the
tools in the [`train` folder](https://github.com/qurator-spk/eynollah/tree/main/train).
## Usage ## Usage
Eynollah supports five use cases: layout analysis (segmentation), binarization, Eynollah supports five use cases:
image enhancement, text recognition (OCR), and reading order detection. 1. [layout analysis (segmentation)](#layout-analysis),
2. [binarization](#binarization),
3. [image enhancement](#image-enhancement),
4. [text recognition (OCR)](#ocr), and
5. [reading order detection](#reading-order-detection).
Some example outputs can be found in [`examples.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/examples.md).
### Layout Analysis ### Layout Analysis
The layout analysis module is responsible for detecting layout elements, identifying text lines, and determining reading The layout analysis module is responsible for detecting layout elements, identifying text lines, and determining reading
order using either heuristic methods or a [pretrained reading order detection model](https://github.com/qurator-spk/eynollah#machine-based-reading-order). order using heuristic methods or a [pretrained model](https://github.com/qurator-spk/eynollah#machine-based-reading-order).
Reading order detection can be performed either as part of layout analysis based on image input, or, currently under
development, based on pre-existing layout analysis results in PAGE-XML format as input.
The command-line interface for layout analysis can be called like this: The command-line interface for layout analysis can be called like this:
@ -91,29 +100,42 @@ eynollah layout \
The following options can be used to further configure the processing: The following options can be used to further configure the processing:
| option | description | | option | description |
|-------------------|:-------------------------------------------------------------------------------| |-------------------|:--------------------------------------------------------------------------------------------|
| `-fl` | full layout analysis including all steps and segmentation classes | | `-fl` | full layout analysis including all steps and segmentation classes (recommended) |
| `-light` | lighter and faster but simpler method for main region detection and deskewing | | `-light` | lighter and faster but simpler method for main region detection and deskewing (recommended) |
| `-tll` | this indicates the light textline and should be passed with light version | | `-tll` | this indicates the light textline and should be passed with light version (recommended) |
| `-tab` | apply table detection | | `-tab` | apply table detection |
| `-ae` | apply enhancement (the resulting image is saved to the output directory) | | `-ae` | apply enhancement (the resulting image is saved to the output directory) |
| `-as` | apply scaling | | `-as` | apply scaling |
| `-cl` | apply contour detection for curved text lines instead of bounding boxes | | `-cl` | apply contour detection for curved text lines instead of bounding boxes |
| `-ib` | apply binarization (the resulting image is saved to the output directory) | | `-ib` | apply binarization (the resulting image is saved to the output directory) |
| `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) | | `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) |
| `-eoi` | extract only images to output directory (other processing will not be done) | | `-eoi` | extract only images to output directory (other processing will not be done) |
| `-ho` | ignore headers for reading order detection | | `-ho` | ignore headers for reading order detection |
| `-si <directory>` | save image regions detected to this directory | | `-si <directory>` | save image regions detected to this directory |
| `-sd <directory>` | save deskewed image to this directory | | `-sd <directory>` | save deskewed image to this directory |
| `-sl <directory>` | save layout prediction as plot to this directory | | `-sl <directory>` | save layout prediction as plot to this directory |
| `-sp <directory>` | save cropped page image to this directory | | `-sp <directory>` | save cropped page image to this directory |
| `-sa <directory>` | save all (plot, enhanced/binary image, layout) to this directory | | `-sa <directory>` | save all (plot, enhanced/binary image, layout) to this directory |
| `-thart` | threshold of artificial class in the case of textline detection. The default value is 0.1 |
| `-tharl` | threshold of artificial class in the case of layout detection. The default value is 0.1 |
| `-ocr` | do ocr |
| `-tr` | apply transformer ocr. Default model is a CNN-RNN model |
| `-bs_ocr` | ocr inference batch size. Default bs for trocr and cnn_rnn models are 2 and 8 respectively |
| `-ncu` | upper limit of columns in document image |
| `-ncl` | lower limit of columns in document image |
| `-slro` | skip layout detection and reading order |
| `-romb` | apply machine based reading order detection |
| `-ipe` | ignore page extraction |
If no further option is set, the tool performs layout detection of main regions (background, text, images, separators If no further option is set, the tool performs layout detection of main regions (background, text, images, separators
and marginals). and marginals).
The best output quality is achieved when RGB images are used as input rather than greyscale or binarized images. The best output quality is achieved when RGB images are used as input rather than greyscale or binarized images.
Additional documentation can be found in [`usage.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/usage.md).
### Binarization ### Binarization
The binarization module performs document image binarization using pretrained pixelwise segmentation models. The binarization module performs document image binarization using pretrained pixelwise segmentation models.
@ -124,9 +146,12 @@ The command-line interface for binarization can be called like this:
eynollah binarization \ eynollah binarization \
-i <single image file> | -di <directory containing image files> \ -i <single image file> | -di <directory containing image files> \
-o <output directory> \ -o <output directory> \
-m <directory containing model files> \ -m <directory containing model files>
``` ```
### Image Enhancement
TODO
### OCR ### OCR
The OCR module performs text recognition using either a CNN-RNN model or a Transformer model. The OCR module performs text recognition using either a CNN-RNN model or a Transformer model.
@ -138,12 +163,29 @@ eynollah ocr \
-i <single image file> | -di <directory containing image files> \ -i <single image file> | -di <directory containing image files> \
-dx <directory of xmls> \ -dx <directory of xmls> \
-o <output directory> \ -o <output directory> \
-m <directory containing model files> | --model_name <path to specific model> \ -m <directory containing model files> | --model_name <path to specific model>
``` ```
### Machine-based-reading-order The following options can be used to further configure the ocr processing:
The machine-based reading-order module employs a pretrained model to identify the reading order from layouts represented in PAGE-XML files. | option | description |
|-------------------|:-------------------------------------------------------------------------------------------|
| `-dib` | directory of binarized images (file type must be '.png'), prediction with both RGB and bin |
| `-doit` | directory for output images rendered with the predicted text |
| `--model_name` | file path to use specific model for OCR |
| `-trocr` | use transformer ocr model (otherwise cnn_rnn model is used) |
| `-etit` | export textline images and text in xml to output dir (OCR training data) |
| `-nmtc` | cropped textline images will not be masked with textline contour |
| `-bs` | ocr inference batch size. Default batch size is 2 for trocr and 8 for cnn_rnn models |
| `-ds_pref` | add an abbreviation of the dataset name to generated training data |
| `-min_conf` | minimum OCR confidence value. OCR with textline conf lower than this will be ignored |
### Reading Order Detection
Reading order detection can be performed either as part of layout analysis based on image input, or, currently under
development, based on pre-existing layout analysis data in PAGE-XML format as input.
The reading order detection module employs a pretrained model to identify the reading order from layouts represented in PAGE-XML files.
The command-line interface for machine based reading order can be called like this: The command-line interface for machine based reading order can be called like this:
@ -155,36 +197,9 @@ eynollah machine-based-reading-order \
-o <output directory> -o <output directory>
``` ```
#### Use as OCR-D processor ## Use as OCR-D processor
Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) [processor](https://ocr-d.de/en/spec/cli), See [`ocrd.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/ocrd.md).
formally described in [`ocrd-tool.json`](https://github.com/qurator-spk/eynollah/tree/main/src/eynollah/ocrd-tool.json).
In this case, the source image file group with (preferably) RGB images should be used as input like this:
ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models eynollah_layout_v0_5_0
If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynollah behaves as follows:
- existing regions are kept and ignored (i.e. in effect they might overlap segments from Eynollah results)
- existing annotation (and respective `AlternativeImage`s) are partially _ignored_:
- previous page frame detection (`cropped` images)
- previous derotation (`deskewed` images)
- previous thresholding (`binarized` images)
- if the page-level image nevertheless deviates from the original (`@imageFilename`)
(because some other preprocessing step was in effect like `denoised`), then
the output PAGE-XML will be based on that as new top-level (`@imageFilename`)
ocrd-eynollah-segment -I OCR-D-XYZ -O OCR-D-SEG -P models eynollah_layout_v0_5_0
In general, it makes more sense to add other workflow steps **after** Eynollah.
There is also an OCR-D processor for binarization:
ocrd-sbb-binarize -I OCR-D-IMG -O OCR-D-BIN -P models default-2021-03-09
#### Additional documentation
Additional documentation is available in the [docs](https://github.com/qurator-spk/eynollah/tree/main/docs) directory.
## How to cite ## How to cite

43
docs/docker.md Normal file
View file

@ -0,0 +1,43 @@
## Inference with Docker
docker pull ghcr.io/qurator-spk/eynollah:latest
### 1. ocrd resource manager
(just once, to get the models and install them into a named volume for later re-use)
vol_models=ocrd-resources:/usr/local/share/ocrd-resources
docker run --rm -v $vol_models ocrd/eynollah ocrd resmgr download ocrd-eynollah-segment default
Now, each time you want to use Eynollah, pass the same resources volume again.
Also, bind-mount some data directory, e.g. current working directory $PWD (/data is default working directory in the container).
Either use standalone CLI (2) or OCR-D CLI (3):
### 2. standalone CLI
(follow self-help, cf. readme)
docker run --rm -v $vol_models -v $PWD:/data ocrd/eynollah eynollah binarization --help
docker run --rm -v $vol_models -v $PWD:/data ocrd/eynollah eynollah layout --help
docker run --rm -v $vol_models -v $PWD:/data ocrd/eynollah eynollah ocr --help
### 3. OCR-D CLI
(follow self-help, cf. readme and https://ocr-d.de/en/spec/cli)
docker run --rm -v $vol_models -v $PWD:/data ocrd/eynollah ocrd-eynollah-segment -h
docker run --rm -v $vol_models -v $PWD:/data ocrd/eynollah ocrd-sbb-binarize -h
Alternatively, just "log in" to the container once and use the commands there:
docker run --rm -v $vol_models -v $PWD:/data -it ocrd/eynollah bash
## Training with Docker
Build the Docker training image
cd train
docker build -t model-training .
Run the Docker training image
cd train
docker run --gpus all -v $PWD:/entry_point_dir model-training

18
docs/examples.md Normal file
View file

@ -0,0 +1,18 @@
# Examples
Example outputs of various Eynollah models
# Binarisation
<img src="https://user-images.githubusercontent.com/952378/63592437-e433e400-c5b1-11e9-9c2d-889c6e93d748.jpg" width="45%"><img src="https://user-images.githubusercontent.com/952378/63592435-e433e400-c5b1-11e9-88e4-3e441b61fa67.jpg" width="45%">
<img src="https://user-images.githubusercontent.com/952378/63592440-e4cc7a80-c5b1-11e9-8964-2cd1b22c87be.jpg" width="45%"><img src="https://user-images.githubusercontent.com/952378/63592438-e4cc7a80-c5b1-11e9-86dc-a9e9f8555422.jpg" width="45%">
# Reading Order Detection
<img src="https://github.com/user-attachments/assets/42df2582-4579-415e-92f1-54858a02c830" alt="Input Image" width="45%">
<img src="https://github.com/user-attachments/assets/77fc819e-6302-4fc9-967c-ee11d10d863e" alt="Output Image" width="45%">
# OCR
<img src="https://github.com/user-attachments/assets/71054636-51c6-4117-b3cf-361c5cda3528" alt="Input Image" width="45%"><img src="https://github.com/user-attachments/assets/cfb3ce38-007a-4037-b547-21324a7d56dd" alt="Output Image" width="45%">
<img src="https://github.com/user-attachments/assets/343b2ed8-d818-4d4a-b301-f304cbbebfcd" alt="Input Image" width="45%"><img src="https://github.com/user-attachments/assets/accb5ba7-e37f-477e-84aa-92eafa0d136e" alt="Output Image" width="45%">

View file

@ -18,7 +18,8 @@ Two Arabic/Persian terms form the name of the model suite: عين الله, whic
See the flowchart below for the different stages and how they interact: See the flowchart below for the different stages and how they interact:
![](https://user-images.githubusercontent.com/952378/100619946-1936f680-331e-11eb-9297-6e8b4cab3c16.png) <img width="810" height="691" alt="eynollah_flowchart" src="https://github.com/user-attachments/assets/42dd55bc-7b85-4b46-9afe-15ff712607f0" />
## Models ## Models
@ -151,15 +152,75 @@ This model is used for the task of illustration detection only.
Model card: [Reading Order Detection]() Model card: [Reading Order Detection]()
TODO The model extracts the reading order of text regions from the layout by classifying pairwise relationships between them. A sorting algorithm then determines the overall reading sequence.
### OCR
We have trained three OCR models: two CNN-RNN-based models and one transformer-based TrOCR model. The CNN-RNN models are generally faster and provide better results in most cases, though their performance decreases with heavily degraded images. The TrOCR model, on the other hand, is computationally expensive and slower during inference, but it may produce better results on strongly degraded images.
#### CNN-RNN model: model_eynollah_ocr_cnnrnn_20250805
This model is trained on data where most of the samples are in Fraktur german script.
| Dataset | Input | CER | WER |
|-----------------------|:-------|:-----------|:----------|
| OCR-D-GT-Archiveform | BIN | 0.02147 | 0.05685 |
| OCR-D-GT-Archiveform | RGB | 0.01636 | 0.06285 |
#### CNN-RNN model: model_eynollah_ocr_cnnrnn_20250904 (Default)
Compared to the model_eynollah_ocr_cnnrnn_20250805 model, this model is trained on a larger proportion of Antiqua data and achieves superior performance.
| Dataset | Input | CER | WER |
|-----------------------|:------------|:-----------|:----------|
| OCR-D-GT-Archiveform | BIN | 0.01635 | 0.05410 |
| OCR-D-GT-Archiveform | RGB | 0.01471 | 0.05813 |
| BLN600 | RGB | 0.04409 | 0.08879 |
| BLN600 | Enhanced | 0.03599 | 0.06244 |
#### Transformer OCR model: model_eynollah_ocr_trocr_20250919
This transformer OCR model is trained on the same data as the model_eynollah_ocr_cnnrnn_20250904 model.
| Dataset | Input | CER | WER |
|-----------------------|:------------|:-----------|:----------|
| OCR-D-GT-Archiveform | BIN | 0.01841 | 0.05589 |
| OCR-D-GT-Archiveform | RGB | 0.01552 | 0.06177 |
| BLN600 | RGB | 0.06347 | 0.13853 |
##### Qualitative evaluation of the models
| <img width="1600" src="https://github.com/user-attachments/assets/120fec0c-c370-46a6-b132-b0af800607cf"> | <img width="1000" src="https://github.com/user-attachments/assets/d84e6819-0a2a-4b3a-bb7d-ceac941babc4"> | <img width="1000" src="https://github.com/user-attachments/assets/bdd27cdb-bbec-4223-9a86-de7a27c6d018"> | <img width="1000" src="https://github.com/user-attachments/assets/1a507c75-75de-4da3-9545-af3746b9a207"> |
|:---:|:---:|:---:|:---:|
| Image | cnnrnn_20250805 | cnnrnn_20250904 | trocr_20250919 |
| <img width="2000" src="https://github.com/user-attachments/assets/9bc13d48-2a92-45fc-88db-c07ffadba067"> | <img width="1000" src="https://github.com/user-attachments/assets/2b294aeb-1362-4d6e-b70f-8aeffd94c5e7"> | <img width="1000" src="https://github.com/user-attachments/assets/9911317e-632e-4e6a-8839-1fb7e783da11"> | <img width="1000" src="https://github.com/user-attachments/assets/2c5626d9-0d23-49d3-80f5-a95f629c9c76"> |
|:---:|:---:|:---:|:---:|
| Image | cnnrnn_20250805 | cnnrnn_20250904 | trocr_20250919 |
| <img width="2000" src="https://github.com/user-attachments/assets/d54d8510-5c6a-4ab0-9ba7-f6ec4ad452c6"> | <img width="1000" src="https://github.com/user-attachments/assets/a418b25b-00dc-493a-b3a3-b325b9b0cb85"> | <img width="1000" src="https://github.com/user-attachments/assets/df6e2b9e-a821-4b4c-8868-0c765700c341"> | <img width="1000" src="https://github.com/user-attachments/assets/b90277f5-40f4-4c99-80a2-da400f7d3640"> |
|:---:|:---:|:---:|:---:|
| Image | cnnrnn_20250805 | cnnrnn_20250904 | trocr_20250919 |
| <img width="2000" src="https://github.com/user-attachments/assets/7ec49211-099f-4c21-9e60-47bfdf21f1b6"> | <img width="1000" src="https://github.com/user-attachments/assets/00ef9785-8885-41b3-bf6e-21eab743df71"> | <img width="1000" src="https://github.com/user-attachments/assets/13eb9f62-4d5a-46dc-befc-b02eb4f31fc1"> | <img width="1000" src="https://github.com/user-attachments/assets/a5c078d1-6d15-4d12-9040-526d7063d459"> |
|:---:|:---:|:---:|:---:|
| Image | cnnrnn_20250805 | cnnrnn_20250904 | trocr_20250919 |
## Heuristic methods ## Heuristic methods
Additionally, some heuristic methods are employed to further improve the model predictions: Additionally, some heuristic methods are employed to further improve the model predictions:
* After border detection, the largest contour is determined by a bounding box, and the image cropped to these coordinates. * After border detection, the largest contour is determined by a bounding box, and the image cropped to these coordinates.
* For text region detection, the image is scaled up to make it easier for the model to detect background space between text regions. * Unlike the non-light version, where the image is scaled up to help the model better detect the background spaces between text regions, the light version uses down-scaled images. In this case, introducing an artificial class along the boundaries of text regions and text lines has helped to isolate and separate the text regions more effectively.
* A minimum area is defined for text regions in relation to the overall image dimensions, so that very small regions that are noise can be filtered out. * A minimum area is defined for text regions in relation to the overall image dimensions, so that very small regions that are noise can be filtered out.
* Deskewing is applied on the text region level (due to regions having different degrees of skew) in order to improve the textline segmentation result. * In the non-light version, deskewing is applied at the text-region level (since regions may have different degrees of skew) to improve text-line segmentation results. In contrast, the light version performs deskewing only at the page level to enhance margin detection and heuristic reading-order estimation.
* After deskewing, a calculation of the pixel distribution on the X-axis allows the separation of textlines (foreground) and background pixels. * After deskewing, a calculation of the pixel distribution on the X-axis allows the separation of textlines (foreground) and background pixels (only in non-light version).
* Finally, using the derived coordinates, bounding boxes are determined for each textline. * Finally, using the derived coordinates, bounding boxes are determined for each textline (only in non-light version).
* As mentioned above, the reading order can be determined using a model; however, this approach is computationally expensive, time-consuming, and less accurate due to the limited amount of ground-truth data available for training. Therefore, our tool uses a heuristic reading-order detection method as the default. The heuristic approach relies on headers and separators to determine the reading order of text regions.

26
docs/ocrd.md Normal file
View file

@ -0,0 +1,26 @@
## Use as OCR-D processor
Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) [processor](https://ocr-d.de/en/spec/cli),
formally described in [`ocrd-tool.json`](https://github.com/qurator-spk/eynollah/tree/main/src/eynollah/ocrd-tool.json).
When using Eynollah in OCR-D, the source image file group with (preferably) RGB images should be used as input like this:
ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models eynollah_layout_v0_5_0
If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynollah behaves as follows:
- existing regions are kept and ignored (i.e. in effect they might overlap segments from Eynollah results)
- existing annotation (and respective `AlternativeImage`s) are partially _ignored_:
- previous page frame detection (`cropped` images)
- previous derotation (`deskewed` images)
- previous thresholding (`binarized` images)
- if the page-level image nevertheless deviates from the original (`@imageFilename`)
(because some other preprocessing step was in effect like `denoised`), then
the output PAGE-XML will be based on that as new top-level (`@imageFilename`)
ocrd-eynollah-segment -I OCR-D-XYZ -O OCR-D-SEG -P models eynollah_layout_v0_5_0
In general, it makes more sense to add other workflow steps **after** Eynollah.
There is also an OCR-D processor for binarization:
ocrd-sbb-binarize -I OCR-D-IMG -O OCR-D-BIN -P models default-2021-03-09

View file

@ -1,3 +1,41 @@
# Prerequisites
## 1. Install Eynollah with training dependencies
Clone the repository and install eynollah along with the dependencies necessary for training:
```sh
git clone https://github.com/qurator-spk/eynollah
cd eynollah
pip install '.[training]'
```
## 2. Pretrained encoder
Download our pretrained weights and add them to a `train/pretrained_model` folder:
```sh
cd train
wget -O pretrained_model.tar.gz https://zenodo.org/records/17243320/files/pretrained_model_v0_5_1.tar.gz?download=1
tar xf pretrained_model.tar.gz
```
## 3. Example data
### Binarization
A small sample of training data for the binarization experiment can be found on [Zenodo](https://zenodo.org/records/17243320/files/training_data_sample_binarization_v0_5_1.tar.gz?download=1),
which contains `images` and `labels` folders.
## 4. Helpful tools
* [`pagexml2img`](https://github.com/qurator-spk/page2img)
> Tool to extract 2-D or 3-D RGB images from PAGE-XML data. In the former case, the output will be a single 2-D image array in which each class is filled with a distinct pixel value. In the case of a 3-D RGB image,
each class will be assigned an RGB value and, besides the images, a text file listing the classes will also be produced.
* [`cocoSegmentationToPng`](https://github.com/nightrome/cocostuffapi/blob/17acf33aef3c6cc2d6aca46dcf084266c2778cf0/PythonAPI/pycocotools/cocostuffhelper.py#L130)
> Convert COCO GT or results for a single image to a segmentation map and write it to disk.
* [`ocrd-segment-extract-pages`](https://github.com/OCR-D/ocrd_segment/blob/master/ocrd_segment/extract_pages.py)
> Extract region classes and their colours in mask (pseg) images. Allows the color map as free dict parameter, and comes with a default that mimics PageViewer's coloring for quick debugging; it also warns when regions do overlap.
# Training documentation # Training documentation
This document aims to assist users in preparing training datasets, training models, and This document aims to assist users in preparing training datasets, training models, and

View file

@ -11,7 +11,12 @@ description = "Document Layout Analysis"
readme = "README.md" readme = "README.md"
license.file = "LICENSE" license.file = "LICENSE"
requires-python = ">=3.8" requires-python = ">=3.8"
keywords = ["document layout analysis", "image segmentation"] keywords = [
"document layout analysis",
"image segmentation",
"binarization",
"optical character recognition"
]
dynamic = [ dynamic = [
"dependencies", "dependencies",
@ -25,6 +30,10 @@ classifiers = [
"Intended Audience :: Science/Research", "Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License", "License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3 :: Only",
"Topic :: Scientific/Engineering :: Image Processing", "Topic :: Scientific/Engineering :: Image Processing",
] ]

View file

@ -6,3 +6,4 @@ tensorflow < 2.13
numba <= 0.58.1 numba <= 0.58.1
scikit-image scikit-image
biopython biopython
tabulate

View file

@ -1,15 +1,67 @@
import sys from dataclasses import dataclass
import click
import logging import logging
from ocrd_utils import initLogging, getLevelName, getLogger import sys
from eynollah.eynollah import Eynollah, Eynollah_ocr import os
from eynollah.sbb_binarize import SbbBinarizer from typing import Union
from eynollah.image_enhancer import Enhancer
from eynollah.mb_ro_on_layout import machine_based_reading_order_on_layout import click
# NOTE: For debugging/predictable order of imports
from .eynollah_imports import imported_libs
from .model_zoo import EynollahModelZoo
from .cli_models import models_cli
@dataclass()
class EynollahCliCtx:
    """
    Holds options relevant for all eynollah subcommands
    """
    # Shared model zoo, constructed once in the `main` group callback from the
    # --model-basedir / --model-overrides CLI options.
    model_zoo: EynollahModelZoo
    # Log level chosen on the command line; None means "not overridden".
    log_level : Union[str, None] = 'INFO'
@click.group() @click.group()
def main(): @click.option(
pass "--model-basedir",
"-m",
help="directory of models",
type=click.Path(exists=True),
default=f'{os.getcwd()}/models_eynollah',
)
@click.option(
"--model-overrides",
"-mv",
help="override default versions of model categories, syntax is 'CATEGORY VARIANT PATH', e.g 'region light /path/to/model'. See eynollah list-models for the full list",
type=(str, str, str),
multiple=True,
)
@click.option(
"--log_level",
"-l",
type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']),
help="Override log level globally to this",
)
@click.pass_context
def main(ctx, model_basedir, model_overrides, log_level):
"""
eynollah - Document Layout Analysis, Image Enhancement, OCR
"""
# Initialize logging
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.NOTSET)
formatter = logging.Formatter('%(asctime)s.%(msecs)03d %(levelname)s %(name)s - %(message)s', datefmt='%H:%M:%S')
console_handler.setFormatter(formatter)
logging.getLogger('eynollah').addHandler(console_handler)
logging.getLogger('eynollah').setLevel(log_level or logging.INFO)
# Initialize model zoo
model_zoo = EynollahModelZoo(basedir=model_basedir, model_overrides=model_overrides)
# Initialize CLI context
ctx.obj = EynollahCliCtx(
model_zoo=model_zoo,
log_level=log_level,
)
main.add_command(models_cli, 'models')
@main.command() @main.command()
@click.option( @click.option(
@ -31,26 +83,14 @@ def main():
type=click.Path(exists=True, file_okay=False), type=click.Path(exists=True, file_okay=False),
required=True, required=True,
) )
@click.option( @click.pass_context
"--model", def machine_based_reading_order(ctx, input, dir_in, out):
"-m", """
help="directory of models", Generate ReadingOrder with a ML model
type=click.Path(exists=True, file_okay=False), """
required=True, from eynollah.mb_ro_on_layout import machine_based_reading_order_on_layout
)
@click.option(
"--log_level",
"-l",
type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']),
help="Override log level globally to this",
)
def machine_based_reading_order(input, dir_in, out, model, log_level):
assert bool(input) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." assert bool(input) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
orderer = machine_based_reading_order_on_layout(model) orderer = machine_based_reading_order_on_layout(model_zoo=ctx.obj.model_zoo)
if log_level:
orderer.logger.setLevel(getLevelName(log_level))
orderer.run(xml_filename=input, orderer.run(xml_filename=input,
dir_in=dir_in, dir_in=dir_in,
dir_out=out, dir_out=out,
@ -59,7 +99,6 @@ def machine_based_reading_order(input, dir_in, out, model, log_level):
@main.command() @main.command()
@click.option('--patches/--no-patches', default=True, help='by enabling this parameter you let the model to see the image in patches.') @click.option('--patches/--no-patches', default=True, help='by enabling this parameter you let the model to see the image in patches.')
@click.option('--model_dir', '-m', type=click.Path(exists=True, file_okay=False), required=True, help='directory containing models for prediction')
@click.option( @click.option(
"--input-image", "--image", "--input-image", "--image",
"-i", "-i",
@ -80,17 +119,33 @@ def machine_based_reading_order(input, dir_in, out, model, log_level):
required=True, required=True,
) )
@click.option( @click.option(
"--log_level", '-M',
"-l", '--mode',
type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), type=click.Choice(['single', 'multi']),
help="Override log level globally to this", default='single',
help="Whether to use the (newer and faster) single-model binarization or the (slightly better) multi-model binarization"
) )
def binarization(patches, model_dir, input_image, dir_in, output, log_level): @click.pass_context
def binarization(
ctx,
patches,
input_image,
mode,
dir_in,
output,
):
"""
Binarize images with a ML model
"""
from eynollah.sbb_binarize import SbbBinarizer
assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
binarizer = SbbBinarizer(model_dir) binarizer = SbbBinarizer(model_zoo=ctx.obj.model_zoo, mode=mode)
if log_level: binarizer.run(
binarizer.log.setLevel(getLevelName(log_level)) image_path=input_image,
binarizer.run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in) use_patches=patches,
output=output,
dir_in=dir_in
)
@main.command() @main.command()
@ -120,14 +175,6 @@ def binarization(patches, model_dir, input_image, dir_in, output, log_level):
help="directory of input images (instead of --image)", help="directory of input images (instead of --image)",
type=click.Path(exists=True, file_okay=False), type=click.Path(exists=True, file_okay=False),
) )
@click.option(
"--model",
"-m",
help="directory of models",
type=click.Path(exists=True, file_okay=False),
required=True,
)
@click.option( @click.option(
"--num_col_upper", "--num_col_upper",
"-ncu", "-ncu",
@ -144,24 +191,19 @@ def binarization(patches, model_dir, input_image, dir_in, output, log_level):
is_flag=True, is_flag=True,
help="if this parameter set to true, this tool will save the enhanced image in org scale.", help="if this parameter set to true, this tool will save the enhanced image in org scale.",
) )
@click.option( @click.pass_context
"--log_level", def enhancement(ctx, image, out, overwrite, dir_in, num_col_upper, num_col_lower, save_org_scale):
"-l", """
type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), Enhance image
help="Override log level globally to this", """
)
def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_lower, save_org_scale, log_level):
assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
initLogging() from .image_enhancer import Enhancer
enhancer = Enhancer( enhancer = Enhancer(
model, model_zoo=ctx.obj.model_zoo,
num_col_upper=num_col_upper, num_col_upper=num_col_upper,
num_col_lower=num_col_lower, num_col_lower=num_col_lower,
save_org_scale=save_org_scale, save_org_scale=save_org_scale,
) )
if log_level:
enhancer.logger.setLevel(getLevelName(log_level))
enhancer.run(overwrite=overwrite, enhancer.run(overwrite=overwrite,
dir_in=dir_in, dir_in=dir_in,
image_filename=image, image_filename=image,
@ -195,20 +237,6 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low
help="directory of input images (instead of --image)", help="directory of input images (instead of --image)",
type=click.Path(exists=True, file_okay=False), type=click.Path(exists=True, file_okay=False),
) )
@click.option(
"--model",
"-m",
help="directory of models",
type=click.Path(exists=True, file_okay=False),
required=True,
)
@click.option(
"--model_version",
"-mv",
help="override default versions of model categories",
type=(str, str),
multiple=True,
)
@click.option( @click.option(
"--save_images", "--save_images",
"-si", "-si",
@ -366,30 +394,45 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low
is_flag=True, is_flag=True,
help="if this parameter set to true, this tool will ignore layout detection and reading order. It means that textline detection will be done within printspace and contours of textline will be written in xml output file.", help="if this parameter set to true, this tool will ignore layout detection and reading order. It means that textline detection will be done within printspace and contours of textline will be written in xml output file.",
) )
# TODO move to top-level CLI context @click.pass_context
@click.option( def layout(
"--log_level", ctx,
"-l", image,
type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), out,
help="Override 'eynollah' log level globally to this", overwrite,
) dir_in,
# save_images,
@click.option( save_layout,
"--setup-logging", save_deskewed,
is_flag=True, save_all,
help="Setup a basic console logger", extract_only_images,
) save_page,
enable_plotting,
def layout(image, out, overwrite, dir_in, model, model_version, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, transformer_ocr, batch_size_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level, setup_logging): allow_enhancement,
if setup_logging: curved_line,
console_handler = logging.StreamHandler(sys.stdout) textline_light,
console_handler.setLevel(logging.INFO) full_layout,
formatter = logging.Formatter('%(message)s') tables,
console_handler.setFormatter(formatter) right2left,
getLogger('eynollah').addHandler(console_handler) input_binary,
getLogger('eynollah').setLevel(logging.INFO) allow_scaling,
else: headers_off,
initLogging() light_version,
reading_order_machine_based,
do_ocr,
transformer_ocr,
batch_size_ocr,
num_col_upper,
num_col_lower,
threshold_art_class_textline,
threshold_art_class_layout,
skip_layout_and_reading_order,
ignore_page_extraction,
):
"""
Detect Layout (with optional image enhancement and reading order detection)
"""
from .eynollah import Eynollah
assert enable_plotting or not save_layout, "Plotting with -sl also requires -ep" assert enable_plotting or not save_layout, "Plotting with -sl also requires -ep"
assert enable_plotting or not save_deskewed, "Plotting with -sd also requires -ep" assert enable_plotting or not save_deskewed, "Plotting with -sd also requires -ep"
assert enable_plotting or not save_all, "Plotting with -sa also requires -ep" assert enable_plotting or not save_all, "Plotting with -sa also requires -ep"
@ -410,8 +453,7 @@ def layout(image, out, overwrite, dir_in, model, model_version, save_images, sav
assert not extract_only_images or not headers_off, "Image extraction -eoi can not be set alongside headers_off -ho" assert not extract_only_images or not headers_off, "Image extraction -eoi can not be set alongside headers_off -ho"
assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
eynollah = Eynollah( eynollah = Eynollah(
model, model_zoo=ctx.obj.model_zoo,
model_versions=model_version,
extract_only_images=extract_only_images, extract_only_images=extract_only_images,
enable_plotting=enable_plotting, enable_plotting=enable_plotting,
allow_enhancement=allow_enhancement, allow_enhancement=allow_enhancement,
@ -435,8 +477,6 @@ def layout(image, out, overwrite, dir_in, model, model_version, save_images, sav
threshold_art_class_textline=threshold_art_class_textline, threshold_art_class_textline=threshold_art_class_textline,
threshold_art_class_layout=threshold_art_class_layout, threshold_art_class_layout=threshold_art_class_layout,
) )
if log_level:
eynollah.logger.setLevel(getLevelName(log_level))
eynollah.run(overwrite=overwrite, eynollah.run(overwrite=overwrite,
image_filename=image, image_filename=image,
dir_in=dir_in, dir_in=dir_in,
@ -493,17 +533,6 @@ def layout(image, out, overwrite, dir_in, model, model_version, save_images, sav
help="overwrite (instead of skipping) if output xml exists", help="overwrite (instead of skipping) if output xml exists",
is_flag=True, is_flag=True,
) )
@click.option(
"--model",
"-m",
help="directory of models",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--model_name",
help="Specific model file path to use for OCR",
type=click.Path(exists=True, file_okay=False),
)
@click.option( @click.option(
"--tr_ocr", "--tr_ocr",
"-trocr/-notrocr", "-trocr/-notrocr",
@ -537,35 +566,42 @@ def layout(image, out, overwrite, dir_in, model, model_version, save_images, sav
"-min_conf", "-min_conf",
help="minimum OCR confidence value. Text lines with a confidence value lower than this threshold will not be included in the output XML file.", help="minimum OCR confidence value. Text lines with a confidence value lower than this threshold will not be included in the output XML file.",
) )
@click.option( @click.pass_context
"--log_level", def ocr(
"-l", ctx,
type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), image,
help="Override log level globally to this", dir_in,
) dir_in_bin,
dir_xmls,
def ocr(image, dir_in, dir_in_bin, dir_xmls, out, dir_out_image_text, overwrite, model, model_name, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, batch_size, dataset_abbrevation, min_conf_value_of_textline_text, log_level): out,
initLogging() dir_out_image_text,
overwrite,
assert bool(model) != bool(model_name), "Either -m (model directory) or --model_name (specific model name) must be provided." tr_ocr,
export_textline_images_and_text,
do_not_mask_with_textline_contour,
batch_size,
dataset_abbrevation,
min_conf_value_of_textline_text,
):
"""
Recognize text with a CNN/RNN or transformer ML model.
"""
assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text -etit can not be set alongside transformer ocr -tr_ocr" assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text -etit can not be set alongside transformer ocr -tr_ocr"
assert not export_textline_images_and_text or not model, "Exporting textline and text -etit can not be set alongside model -m" # FIXME: refactor: move export_textline_images_and_text out of eynollah.py
# assert not export_textline_images_and_text or not model, "Exporting textline and text -etit can not be set alongside model -m"
assert not export_textline_images_and_text or not batch_size, "Exporting textline and text -etit can not be set alongside batch size -bs" assert not export_textline_images_and_text or not batch_size, "Exporting textline and text -etit can not be set alongside batch size -bs"
assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text -etit can not be set alongside directory of bin images -dib" assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text -etit can not be set alongside directory of bin images -dib"
assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit" assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit"
assert bool(image) != bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both." assert bool(image) != bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both."
from .eynollah_ocr import Eynollah_ocr
eynollah_ocr = Eynollah_ocr( eynollah_ocr = Eynollah_ocr(
dir_models=model, model_zoo=ctx.obj.model_zoo,
model_name=model_name,
tr_ocr=tr_ocr, tr_ocr=tr_ocr,
export_textline_images_and_text=export_textline_images_and_text, export_textline_images_and_text=export_textline_images_and_text,
do_not_mask_with_textline_contour=do_not_mask_with_textline_contour, do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
batch_size=batch_size, batch_size=batch_size,
pref_of_dataset=dataset_abbrevation, pref_of_dataset=dataset_abbrevation,
min_conf_value_of_textline_text=min_conf_value_of_textline_text, min_conf_value_of_textline_text=min_conf_value_of_textline_text)
)
if log_level:
eynollah_ocr.logger.setLevel(getLevelName(log_level))
eynollah_ocr.run(overwrite=overwrite, eynollah_ocr.run(overwrite=overwrite,
dir_in=dir_in, dir_in=dir_in,
dir_in_bin=dir_in_bin, dir_in_bin=dir_in_bin,

View file

@ -0,0 +1,69 @@
from pathlib import Path
from typing import Set, Tuple
import click
from eynollah.model_zoo.default_specs import MODELS_VERSION
@click.group()
@click.pass_context
def models_cli(
    ctx,
):
    """
    Organize models for the various runners in eynollah.
    """
    # The parent `main` group is expected to have populated ctx.obj with a
    # context object carrying the model zoo; fail fast if that did not happen.
    assert ctx.obj.model_zoo
@models_cli.command('list')
@click.pass_context
def list_models(
    ctx,
):
    """
    List all the models in the zoo
    """
    # Print where models are resolved from and any per-category overrides,
    # then the zoo's own string representation listing the model specs.
    print(f"Model basedir: {ctx.obj.model_zoo.model_basedir}")
    print(f"Model overrides: {ctx.obj.model_zoo.model_overrides}")
    print(ctx.obj.model_zoo)
@models_cli.command('package')
@click.option(
    '--set-version', '-V', 'version', help="Version to use for packaging", default=MODELS_VERSION, show_default=True
)
@click.argument('output_dir')
@click.pass_context
def package(
    ctx,
    version,
    output_dir,
):
    """
    Generate shell code to copy all the models in the zoo into properly named folders in OUTPUT_DIR for distribution.

        eynollah models -m SRC package OUTPUT_DIR

    SRC should contain a directory "models_eynollah" containing all the models.
    """
    # Directories the generated script must create, and (src, dst) copy pairs.
    # Sets are used so that models shared between dists are only copied once.
    mkdirs: Set[Path] = set([])
    copies: Set[Tuple[Path, Path]] = set([])
    for spec in ctx.obj.model_zoo.specs.specs:
        # skip these as they are dependent on the ocr model
        if spec.category in ('num_to_char', 'characters'):
            continue
        src: Path = ctx.obj.model_zoo.model_path(spec.category, spec.variant)
        # Only copy the top-most directory relative to models_eynollah
        orig_src = src
        while src.parent.name != 'models_eynollah':
            # Guard against walking past the filesystem root: on POSIX,
            # Path('/').parent == Path('/'), so without this check a model
            # path outside a 'models_eynollah' directory would loop forever.
            if src.parent == src:
                raise ValueError(
                    f"Model path {orig_src} is not inside a 'models_eynollah' directory")
            src = src.parent
        for dist in spec.dists:
            dist_dir = Path(f"{output_dir}/models_{dist}_{version}/models_eynollah")
            copies.add((src, dist_dir))
            mkdirs.add(dist_dir)
    # Emit shell commands (not executed here): create target dirs, copy the
    # model folders, then zip each dist directory next to its parent.
    for dist_dir in mkdirs:
        print(f"mkdir -vp {dist_dir}")
    for (src, dst) in copies:
        print(f"cp -vr {src} {dst}")
    for dist_dir in mkdirs:
        zip_path = Path(f'../{dist_dir.parent.name}.zip')
        print(f"(cd {dist_dir}/..; zip -vr {zip_path} models_eynollah)")

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,8 @@
"""
Load libraries with possible race conditions once. This must be imported as the first module of eynollah.
"""
from torch import *
import tensorflow.keras
from shapely import *
imported_libs = True
__all__ = ['imported_libs']

1001
src/eynollah/eynollah_ocr.py Normal file

File diff suppressed because it is too large Load diff

View file

@ -2,27 +2,32 @@
Image enhancer. The output can be written as same scale of input or in new predicted scale. Image enhancer. The output can be written as same scale of input or in new predicted scale.
""" """
from logging import Logger # FIXME: fix all of those...
# pyright: reportUnboundVariable=false
# pyright: reportCallIssue=false
# pyright: reportArgumentType=false
import logging
import os import os
import time import time
from typing import Optional from typing import Dict, Optional
from pathlib import Path from pathlib import Path
import gc import gc
import cv2 import cv2
from keras.models import Model
import numpy as np import numpy as np
from ocrd_utils import getLogger, tf_disable_interactive_logs
import tensorflow as tf import tensorflow as tf
from skimage.morphology import skeletonize from skimage.morphology import skeletonize
from tensorflow.keras.models import load_model
from .model_zoo import EynollahModelZoo
from .utils.resize import resize_image from .utils.resize import resize_image
from .utils.pil_cv2 import pil2cv from .utils.pil_cv2 import pil2cv
from .utils import ( from .utils import (
is_image_filename, is_image_filename,
crop_image_inside_box crop_image_inside_box
) )
from .eynollah import PatchEncoder, Patches from .patch_encoder import PatchEncoder, Patches
DPI_THRESHOLD = 298 DPI_THRESHOLD = 298
KERNEL = np.ones((5, 5), np.uint8) KERNEL = np.ones((5, 5), np.uint8)
@ -31,11 +36,11 @@ KERNEL = np.ones((5, 5), np.uint8)
class Enhancer: class Enhancer:
def __init__( def __init__(
self, self,
dir_models : str, *,
model_zoo: EynollahModelZoo,
num_col_upper : Optional[int] = None, num_col_upper : Optional[int] = None,
num_col_lower : Optional[int] = None, num_col_lower : Optional[int] = None,
save_org_scale : bool = False, save_org_scale : bool = False,
logger : Optional[Logger] = None,
): ):
self.input_binary = False self.input_binary = False
self.light_version = False self.light_version = False
@ -49,12 +54,10 @@ class Enhancer:
else: else:
self.num_col_lower = num_col_lower self.num_col_lower = num_col_lower
self.logger = logger if logger else getLogger('enhancement') self.logger = logging.getLogger('eynollah.enhance')
self.dir_models = dir_models self.model_zoo = model_zoo
self.model_dir_of_binarization = dir_models + "/eynollah-binarization_20210425" for v in ['binarization', 'enhancement', 'col_classifier', 'page']:
self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" self.model_zoo.load_model(v)
self.model_dir_of_col_classifier = dir_models + "/eynollah-column-classifier_20210425"
self.model_page_dir = dir_models + "/model_eynollah_page_extraction_20250915"
try: try:
for device in tf.config.list_physical_devices('GPU'): for device in tf.config.list_physical_devices('GPU'):
@ -62,11 +65,6 @@ class Enhancer:
except: except:
self.logger.warning("no GPU device available") self.logger.warning("no GPU device available")
self.model_page = self.our_load_model(self.model_page_dir)
self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier)
self.model_enhancement = self.our_load_model(self.model_dir_of_enhancement)
self.model_bin = self.our_load_model(self.model_dir_of_binarization)
def cache_images(self, image_filename=None, image_pil=None, dpi=None): def cache_images(self, image_filename=None, image_pil=None, dpi=None):
ret = {} ret = {}
if image_filename: if image_filename:
@ -103,23 +101,11 @@ class Enhancer:
def isNaN(self, num): def isNaN(self, num):
return num != num return num != num
@staticmethod
def our_load_model(model_file):
if model_file.endswith('.h5') and Path(model_file[:-3]).exists():
# prefer SavedModel over HDF5 format if it exists
model_file = model_file[:-3]
try:
model = load_model(model_file, compile=False)
except:
model = load_model(model_file, compile=False, custom_objects={
"PatchEncoder": PatchEncoder, "Patches": Patches})
return model
def predict_enhancement(self, img): def predict_enhancement(self, img):
self.logger.debug("enter predict_enhancement") self.logger.debug("enter predict_enhancement")
img_height_model = self.model_enhancement.layers[-1].output_shape[1] img_height_model = self.model_zoo.get('enhancement', Model).layers[-1].output_shape[1]
img_width_model = self.model_enhancement.layers[-1].output_shape[2] img_width_model = self.model_zoo.get('enhancement', Model).layers[-1].output_shape[2]
if img.shape[0] < img_height_model: if img.shape[0] < img_height_model:
img = cv2.resize(img, (img.shape[1], img_width_model), interpolation=cv2.INTER_NEAREST) img = cv2.resize(img, (img.shape[1], img_width_model), interpolation=cv2.INTER_NEAREST)
if img.shape[1] < img_width_model: if img.shape[1] < img_width_model:
@ -160,7 +146,7 @@ class Enhancer:
index_y_d = img_h - img_height_model index_y_d = img_h - img_height_model
img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :] img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :]
label_p_pred = self.model_enhancement.predict(img_patch, verbose=0) label_p_pred = self.model_zoo.get('enhancement', Model).predict(img_patch, verbose='0')
seg = label_p_pred[0, :, :, :] * 255 seg = label_p_pred[0, :, :, :] * 255
if i == 0 and j == 0: if i == 0 and j == 0:
@ -246,7 +232,7 @@ class Enhancer:
else: else:
img = self.imread() img = self.imread()
img = cv2.GaussianBlur(img, (5, 5), 0) img = cv2.GaussianBlur(img, (5, 5), 0)
img_page_prediction = self.do_prediction(False, img, self.model_page) img_page_prediction = self.do_prediction(False, img, self.model_zoo.get('page'))
imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(imgray, 0, 255, 0) _, thresh = cv2.threshold(imgray, 0, 255, 0)
@ -291,7 +277,7 @@ class Enhancer:
self.logger.info("Detected %s DPI", dpi) self.logger.info("Detected %s DPI", dpi)
if self.input_binary: if self.input_binary:
img = self.imread() img = self.imread()
prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5) prediction_bin = self.do_prediction(True, img, self.model_zoo.get('binarization'), n_batch_inference=5)
prediction_bin = 255 * (prediction_bin[:,:,0]==0) prediction_bin = 255 * (prediction_bin[:,:,0]==0)
prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8)
img= np.copy(prediction_bin) img= np.copy(prediction_bin)
@ -332,7 +318,7 @@ class Enhancer:
img_in[0, :, :, 1] = img_1ch[:, :] img_in[0, :, :, 1] = img_1ch[:, :]
img_in[0, :, :, 2] = img_1ch[:, :] img_in[0, :, :, 2] = img_1ch[:, :]
label_p_pred = self.model_classifier.predict(img_in, verbose=0) label_p_pred = self.model_zoo.get('col_classifier').predict(img_in, verbose=0)
num_col = np.argmax(label_p_pred[0]) + 1 num_col = np.argmax(label_p_pred[0]) + 1
elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower): elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower):
if self.input_binary: if self.input_binary:
@ -352,7 +338,7 @@ class Enhancer:
img_in[0, :, :, 1] = img_1ch[:, :] img_in[0, :, :, 1] = img_1ch[:, :]
img_in[0, :, :, 2] = img_1ch[:, :] img_in[0, :, :, 2] = img_1ch[:, :]
label_p_pred = self.model_classifier.predict(img_in, verbose=0) label_p_pred = self.model_zoo.get('col_classifier').predict(img_in, verbose=0)
num_col = np.argmax(label_p_pred[0]) + 1 num_col = np.argmax(label_p_pred[0]) + 1
if num_col > self.num_col_upper: if num_col > self.num_col_upper:
@ -685,7 +671,7 @@ class Enhancer:
t0 = time.time() t0 = time.time()
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(light_version=False) img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(light_version=False)
return img_res return img_res, is_image_enhanced
def run(self, def run(self,
@ -723,9 +709,18 @@ class Enhancer:
self.logger.warning("will skip input for existing output file '%s'", self.output_filename) self.logger.warning("will skip input for existing output file '%s'", self.output_filename)
continue continue
image_enhanced = self.run_single() did_resize = False
image_enhanced, did_enhance = self.run_single()
if self.save_org_scale: if self.save_org_scale:
image_enhanced = resize_image(image_enhanced, self.h_org, self.w_org) image_enhanced = resize_image(image_enhanced, self.h_org, self.w_org)
did_resize = True
self.logger.info(
"Image %s was %senhanced%s.",
img_filename,
'' if did_enhance else 'not ',
'and resized' if did_resize else ''
)
cv2.imwrite(self.output_filename, image_enhanced) cv2.imwrite(self.output_filename, image_enhanced)

View file

@ -1,8 +1,12 @@
""" """
Image enhancer. The output can be written as same scale of input or in new predicted scale. Machine learning based reading order detection
""" """
from logging import Logger # pyright: reportCallIssue=false
# pyright: reportUnboundVariable=false
# pyright: reportArgumentType=false
import logging
import os import os
import time import time
from typing import Optional from typing import Optional
@ -10,12 +14,12 @@ from pathlib import Path
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import cv2 import cv2
from keras.models import Model
import numpy as np import numpy as np
from ocrd_utils import getLogger
import statistics import statistics
import tensorflow as tf import tensorflow as tf
from tensorflow.keras.models import load_model
from .model_zoo import EynollahModelZoo
from .utils.resize import resize_image from .utils.resize import resize_image
from .utils.contour import ( from .utils.contour import (
find_new_features_of_contours, find_new_features_of_contours,
@ -23,7 +27,6 @@ from .utils.contour import (
return_parent_contours, return_parent_contours,
) )
from .utils import is_xml_filename from .utils import is_xml_filename
from .eynollah import PatchEncoder, Patches
DPI_THRESHOLD = 298 DPI_THRESHOLD = 298
KERNEL = np.ones((5, 5), np.uint8) KERNEL = np.ones((5, 5), np.uint8)
@ -32,12 +35,12 @@ KERNEL = np.ones((5, 5), np.uint8)
class machine_based_reading_order_on_layout: class machine_based_reading_order_on_layout:
def __init__( def __init__(
self, self,
dir_models : str, *,
logger : Optional[Logger] = None, model_zoo: EynollahModelZoo,
logger : Optional[logging.Logger] = None,
): ):
self.logger = logger if logger else getLogger('mbreorder') self.logger = logger or logging.getLogger('eynollah.mbreorder')
self.dir_models = dir_models self.model_zoo = model_zoo
self.model_reading_order_dir = dir_models + "/model_eynollah_reading_order_20250824"
try: try:
for device in tf.config.list_physical_devices('GPU'): for device in tf.config.list_physical_devices('GPU'):
@ -45,21 +48,10 @@ class machine_based_reading_order_on_layout:
except: except:
self.logger.warning("no GPU device available") self.logger.warning("no GPU device available")
self.model_reading_order = self.our_load_model(self.model_reading_order_dir) self.model_zoo.load_model('reading_order')
# FIXME: light_version is always true, no need for checks in the code
self.light_version = True self.light_version = True
@staticmethod
def our_load_model(model_file):
if model_file.endswith('.h5') and Path(model_file[:-3]).exists():
# prefer SavedModel over HDF5 format if it exists
model_file = model_file[:-3]
try:
model = load_model(model_file, compile=False)
except:
model = load_model(model_file, compile=False, custom_objects={
"PatchEncoder": PatchEncoder, "Patches": Patches})
return model
def read_xml(self, xml_file): def read_xml(self, xml_file):
tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8')) tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8'))
root1=tree1.getroot() root1=tree1.getroot()
@ -69,6 +61,7 @@ class machine_based_reading_order_on_layout:
index_tot_regions = [] index_tot_regions = []
tot_region_ref = [] tot_region_ref = []
y_len, x_len = 0, 0
for jj in root1.iter(link+'Page'): for jj in root1.iter(link+'Page'):
y_len=int(jj.attrib['imageHeight']) y_len=int(jj.attrib['imageHeight'])
x_len=int(jj.attrib['imageWidth']) x_len=int(jj.attrib['imageWidth'])
@ -81,13 +74,13 @@ class machine_based_reading_order_on_layout:
co_printspace = [] co_printspace = []
if link+'PrintSpace' in alltags: if link+'PrintSpace' in alltags:
region_tags_printspace = np.unique([x for x in alltags if x.endswith('PrintSpace')]) region_tags_printspace = np.unique([x for x in alltags if x.endswith('PrintSpace')])
elif link+'Border' in alltags: else:
region_tags_printspace = np.unique([x for x in alltags if x.endswith('Border')]) region_tags_printspace = np.unique([x for x in alltags if x.endswith('Border')])
for tag in region_tags_printspace: for tag in region_tags_printspace:
if link+'PrintSpace' in alltags: if link+'PrintSpace' in alltags:
tag_endings_printspace = ['}PrintSpace','}printspace'] tag_endings_printspace = ['}PrintSpace','}printspace']
elif link+'Border' in alltags: else:
tag_endings_printspace = ['}Border','}border'] tag_endings_printspace = ['}Border','}border']
if tag.endswith(tag_endings_printspace[0]) or tag.endswith(tag_endings_printspace[1]): if tag.endswith(tag_endings_printspace[0]) or tag.endswith(tag_endings_printspace[1]):
@ -683,7 +676,7 @@ class machine_based_reading_order_on_layout:
tot_counter += 1 tot_counter += 1
batch.append(j) batch.append(j)
if tot_counter % inference_bs == 0 or tot_counter == len(ij_list): if tot_counter % inference_bs == 0 or tot_counter == len(ij_list):
y_pr = self.model_reading_order.predict(input_1 , verbose=0) y_pr = self.model_zoo.get('reading_order', Model).predict(input_1 , verbose='0')
for jb, j in enumerate(batch): for jb, j in enumerate(batch):
if y_pr[jb][0]>=0.5: if y_pr[jb][0]>=0.5:
post_list.append(j) post_list.append(j)
@ -802,6 +795,7 @@ class machine_based_reading_order_on_layout:
alltags=[elem.tag for elem in root_xml.iter()] alltags=[elem.tag for elem in root_xml.iter()]
ET.register_namespace("",name_space) ET.register_namespace("",name_space)
assert dir_out
tree_xml.write(os.path.join(dir_out, file_name+'.xml'), tree_xml.write(os.path.join(dir_out, file_name+'.xml'),
xml_declaration=True, xml_declaration=True,
method='xml', method='xml',

View file

@ -0,0 +1,4 @@
__all__ = [
'EynollahModelZoo',
]
from .model_zoo import EynollahModelZoo

View file

@ -0,0 +1,313 @@
from .specs import EynollahModelSpec, EynollahModelSpecSet
# NOTE: This needs to change whenever models/versions change
ZENODO = "https://zenodo.org/records/17295988/files"
MODELS_VERSION = "v0_7_0"


def dist_url(dist_name: str) -> str:
    """Return the Zenodo download URL for the ZIP of the given distribution name."""
    return "{base}/models_{name}_{version}.zip".format(
        base=ZENODO, name=dist_name, version=MODELS_VERSION)
# Registry of every model eynollah knows about. Each spec maps a
# (category, variant) pair to a filename below the models_eynollah tree and
# lists which distribution ZIPs ("dists") ship it. Commented-out filenames
# are earlier model generations kept for reference.
DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
    # --- enhancement ---
    EynollahModelSpec(
        category="enhancement",
        variant='',
        filename="models_eynollah/eynollah-enhancement_20210425",
        dists=['enhancement', 'layout', 'ci'],
        dist_url=dist_url("enhancement"),
        type='Keras',
    ),
    # --- binarization (hybrid is the default used by the layout pipeline) ---
    EynollahModelSpec(
        category="binarization",
        variant='hybrid',
        filename="models_eynollah/eynollah-binarization-hybrid_20230504/model_bin_hybrid_trans_cnn_sbb_ens",
        dists=['layout', 'binarization', ],
        dist_url=dist_url("binarization"),
        type='Keras',
    ),
    EynollahModelSpec(
        category="binarization",
        variant='20210309',
        filename="models_eynollah/eynollah-binarization_20210309",
        dists=['binarization'],
        dist_url=dist_url("binarization"),
        type='Keras',
    ),
    EynollahModelSpec(
        category="binarization",
        variant='',
        filename="models_eynollah/eynollah-binarization_20210425",
        dists=['binarization'],
        dist_url=dist_url("binarization"),
        type='Keras',
    ),
    # Four members of the multi-model binarization ensemble
    EynollahModelSpec(
        category="binarization_multi_1",
        variant='',
        filename="models_eynollah/eynollah-binarization-multi_2020_01_16/model_bin1",
        dist_url=dist_url("binarization"),
        dists=['binarization'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="binarization_multi_2",
        variant='',
        filename="models_eynollah/eynollah-binarization-multi_2020_01_16/model_bin2",
        dist_url=dist_url("binarization"),
        dists=['binarization'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="binarization_multi_3",
        variant='',
        filename="models_eynollah/eynollah-binarization-multi_2020_01_16/model_bin3",
        dist_url=dist_url("binarization"),
        dists=['binarization'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="binarization_multi_4",
        variant='',
        filename="models_eynollah/eynollah-binarization-multi_2020_01_16/model_bin4",
        dist_url=dist_url("binarization"),
        dists=['binarization'],
        type='Keras',
    ),
    # --- layout pipeline models ---
    EynollahModelSpec(
        category="col_classifier",
        variant='',
        filename="models_eynollah/eynollah-column-classifier_20210425",
        dist_url=dist_url("layout"),
        dists=['layout'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="page",
        variant='',
        filename="models_eynollah/model_eynollah_page_extraction_20250915",
        dist_url=dist_url("layout"),
        dists=['layout'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="region",
        variant='',
        filename="models_eynollah/eynollah-main-regions-ensembled_20210425",
        dist_url=dist_url("layout"),
        dists=['layout'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="region",
        variant='extract_only_images',
        filename="models_eynollah/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18",
        dist_url=dist_url("layout"),
        dists=['layout'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="region",
        variant='light',
        filename="models_eynollah/eynollah-main-regions_20220314",
        dist_url=dist_url("layout"),
        help="early layout",
        dists=['layout'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="region_p2",
        variant='',
        filename="models_eynollah/eynollah-main-regions-aug-rotation_20210425",
        dist_url=dist_url("layout"),
        help="early layout, non-light, 2nd part",
        dists=['layout'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="region_1_2",
        variant='',
        #filename="models_eynollah/modelens_12sp_elay_0_3_4__3_6_n",
        #filename="models_eynollah/modelens_earlylayout_12spaltige_2_3_5_6_7_8",
        #filename="models_eynollah/modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18",
        #filename="models_eynollah/modelens_1_2_4_5_early_lay_1_2_spaltige",
        #filename="models_eynollah/model_3_eraly_layout_no_patches_1_2_spaltige",
        filename="models_eynollah/modelens_e_l_all_sp_0_1_2_3_4_171024",
        dist_url=dist_url("layout"),
        dists=['layout'],
        help="early layout, light, 1-or-2-column",
        type='Keras',
    ),
    EynollahModelSpec(
        category="region_fl_np",
        variant='',
        #'filename="models_eynollah/modelens_full_lay_1_3_031124",
        #'filename="models_eynollah/modelens_full_lay_13__3_19_241024",
        #'filename="models_eynollah/model_full_lay_13_241024",
        #'filename="models_eynollah/modelens_full_lay_13_17_231024",
        #'filename="models_eynollah/modelens_full_lay_1_2_221024",
        #'filename="models_eynollah/eynollah-full-regions-1column_20210425",
        filename="models_eynollah/modelens_full_lay_1__4_3_091124",
        dist_url=dist_url("layout"),
        help="full layout / no patches",
        dists=['layout'],
        type='Keras',
    ),
    # FIXME: Why is region_fl and region_fl_np the same model?
    EynollahModelSpec(
        category="region_fl",
        variant='',
        # filename="models_eynollah/eynollah-full-regions-3+column_20210425",
        # filename="models_eynollah/model_2_full_layout_new_trans",
        # filename="models_eynollah/modelens_full_lay_1_3_031124",
        # filename="models_eynollah/modelens_full_lay_13__3_19_241024",
        # filename="models_eynollah/model_full_lay_13_241024",
        # filename="models_eynollah/modelens_full_lay_13_17_231024",
        # filename="models_eynollah/modelens_full_lay_1_2_221024",
        # filename="models_eynollah/modelens_full_layout_24_till_28",
        # filename="models_eynollah/model_2_full_layout_new_trans",
        filename="models_eynollah/modelens_full_lay_1__4_3_091124",
        dist_url=dist_url("layout"),
        help="full layout / with patches",
        dists=['layout'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="reading_order",
        variant='',
        #filename="models_eynollah/model_mb_ro_aug_ens_11",
        #filename="models_eynollah/model_step_3200000_mb_ro",
        #filename="models_eynollah/model_ens_reading_order_machine_based",
        #filename="models_eynollah/model_mb_ro_aug_ens_8",
        #filename="models_eynollah/model_ens_reading_order_machine_based",
        filename="models_eynollah/model_eynollah_reading_order_20250824",
        dist_url=dist_url("reading_order"),
        dists=['layout', 'reading_order'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="textline",
        variant='',
        #filename="models_eynollah/modelens_textline_1_4_16092024",
        #filename="models_eynollah/model_textline_ens_3_4_5_6_artificial",
        #filename="models_eynollah/modelens_textline_1_3_4_20240915",
        #filename="models_eynollah/model_textline_ens_3_4_5_6_artificial",
        #filename="models_eynollah/modelens_textline_9_12_13_14_15",
        #filename="models_eynollah/eynollah-textline_20210425",
        filename="models_eynollah/modelens_textline_0_1__2_4_16092024",
        dist_url=dist_url("layout"),
        dists=['layout'],
        type='Keras',
    ),
    # NOTE: '' and 'light' textline variants currently point at the same file
    EynollahModelSpec(
        category="textline",
        variant='light',
        #filename="models_eynollah/eynollah-textline_light_20210425",
        filename="models_eynollah/modelens_textline_0_1__2_4_16092024",
        dist_url=dist_url("layout"),
        dists=['layout'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="table",
        variant='',
        filename="models_eynollah/eynollah-tables_20210319",
        dist_url=dist_url("layout"),
        dists=['layout'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="table",
        variant='light',
        filename="models_eynollah/modelens_table_0t4_201124",
        dist_url=dist_url("layout"),
        dists=['layout'],
        type='Keras',
    ),
    # --- OCR models and their character tables ---
    EynollahModelSpec(
        category="ocr",
        variant='',
        filename="models_eynollah/model_eynollah_ocr_cnnrnn_20250930",
        dist_url=dist_url("ocr"),
        dists=['layout', 'ocr'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="ocr",
        variant='degraded',
        filename="models_eynollah/model_eynollah_ocr_cnnrnn__degraded_20250805/",
        help="slightly better at degraded Fraktur",
        dist_url=dist_url("ocr"),
        dists=['ocr'],
        type='Keras',
    ),
    # num_to_char/characters are resolved relative to the OCR model directory
    # (see EynollahModelZoo.model_path)
    EynollahModelSpec(
        category="num_to_char",
        variant='',
        filename="characters_org.txt",
        dist_url=dist_url("ocr"),
        dists=['ocr'],
        type='decoder',
    ),
    EynollahModelSpec(
        category="characters",
        variant='',
        filename="characters_org.txt",
        dist_url=dist_url("ocr"),
        dists=['ocr'],
        type='List[str]',
    ),
    # --- transformer-based OCR (TrOCR) ---
    EynollahModelSpec(
        category="ocr",
        variant='tr',
        filename="models_eynollah/model_eynollah_ocr_trocr_20250919",
        dist_url=dist_url("trocr"),
        help='much slower transformer-based',
        dists=['trocr'],
        type='Keras',
    ),
    EynollahModelSpec(
        category="trocr_processor",
        variant='',
        filename="models_eynollah/model_eynollah_ocr_trocr_20250919",
        dist_url=dist_url("trocr"),
        dists=['trocr'],
        type='TrOCRProcessor',
    ),
    EynollahModelSpec(
        category="trocr_processor",
        variant='htr',
        filename="models_eynollah/microsoft/trocr-base-handwritten",
        dist_url=dist_url("trocr"),
        dists=['trocr'],
        type='TrOCRProcessor',
    ),
])

View file

@ -0,0 +1,204 @@
import json
import logging
from copy import deepcopy
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Type, Union
from ocrd_utils import tf_disable_interactive_logs
tf_disable_interactive_logs()
from keras.layers import StringLookup
from keras.models import Model as KerasModel
from keras.models import load_model
from tabulate import tabulate
from ..patch_encoder import PatchEncoder, Patches
from .specs import EynollahModelSpecSet
from .default_specs import DEFAULT_MODEL_SPECS
from .types import AnyModel, T
class EynollahModelZoo:
    """
    Wrapper class that handles storage and loading of models for all eynollah runners.
    """

    # Directory expected to contain the "models_eynollah" tree
    model_basedir: Path
    # Mutable copy of the default model specs (may be altered by overrides)
    specs: EynollahModelSpecSet

    def __init__(
        self,
        basedir: str,
        model_overrides: Optional[List[Tuple[str, str, str]]] = None,
    ) -> None:
        """
        Args:
            basedir: directory containing the models (the "models_eynollah" tree).
            model_overrides: optional (category, variant, filename) triples that
                replace the default filename of individual model specs.
        """
        self.model_basedir = Path(basedir)
        self.logger = logging.getLogger('eynollah.model_zoo')
        if not self.model_basedir.exists():
            self.logger.warning(
                "Model basedir does not exist: %s. Set eynollah --model-basedir to the correct directory.",
                basedir)
        # deepcopy so that overrides never mutate the shared module-level defaults
        self.specs = deepcopy(DEFAULT_MODEL_SPECS)
        self._overrides = []
        if model_overrides:
            self.override_models(*model_overrides)
        # Cache of loaded models, keyed by category (see note in load_model)
        self._loaded: Dict[str, AnyModel] = {}

    @property
    def model_overrides(self):
        # All (category, variant, filename) overrides applied so far.
        return self._overrides

    def override_models(
        self,
        *model_overrides: Tuple[str, str, str],
    ):
        """
        Override the default model versions

        Each override is a (category, variant, filename) triple; the spec's
        filename is replaced in-place and the override is recorded.
        """
        for model_category, model_variant, model_filename in model_overrides:
            spec = self.specs.get(model_category, model_variant)
            self.logger.warning("Overriding filename for model spec %s to %s", spec, model_filename)
            spec.filename = model_filename
        self._overrides += model_overrides

    def model_path(
        self,
        model_category: str,
        model_variant: str = '',
        absolute: bool = True,
    ) -> Path:
        """
        Translate model_{type,variant} tuple into an absolute (or relative) Path
        """
        spec = self.specs.get(model_category, model_variant)
        # character tables live inside the OCR model's directory
        if spec.category in ('characters', 'num_to_char'):
            return self.model_path('ocr') / spec.filename
        if not Path(spec.filename).is_absolute() and absolute:
            model_path = Path(self.model_basedir).joinpath(spec.filename)
        else:
            model_path = Path(spec.filename)
        return model_path

    def load_models(
        self,
        *all_load_args: Union[str, Tuple[str], Tuple[str, str], Tuple[str, str, str]],
    ) -> Dict:
        """
        Load all models by calling load_model and return a dictionary mapping model_category to loaded model
        """
        ret = {}
        for load_args in all_load_args:
            if isinstance(load_args, str):
                ret[load_args] = self.load_model(load_args)
            else:
                ret[load_args[0]] = self.load_model(*load_args)
        return ret

    def load_model(
        self,
        model_category: str,
        model_variant: str = '',
        model_path_override: Optional[str] = None,
    ) -> AnyModel:
        """
        Load any model

        Raises whatever the underlying loader raises if the model file is
        missing or unreadable.
        """
        if model_path_override:
            self.override_models((model_category, model_variant, model_path_override))
        model_path = self.model_path(model_category, model_variant)
        if model_path.suffix == '.h5' and model_path.with_suffix('').exists():
            # prefer SavedModel over HDF5 format if it exists
            # BUGFIX: Path(model_path.stem) dropped the parent directories,
            # so the SavedModel was looked up relative to the CWD;
            # with_suffix('') keeps the full path.
            model_path = model_path.with_suffix('')
        if model_category == 'ocr':
            model = self._load_ocr_model(variant=model_variant)
        elif model_category == 'num_to_char':
            model = self._load_num_to_char()
        elif model_category == 'characters':
            model = self._load_characters()
        elif model_category == 'trocr_processor':
            # imported lazily: transformers is expensive and only needed here
            from transformers import TrOCRProcessor
            model = TrOCRProcessor.from_pretrained(model_path)
        else:
            try:
                model = load_model(model_path, compile=False)
            except Exception as e:
                # retry with the custom layers used by the transformer-based models
                self.logger.exception(e)
                model = load_model(
                    model_path, compile=False, custom_objects={"PatchEncoder": PatchEncoder, "Patches": Patches}
                )
        # NOTE: keyed by category only — loading another variant of the same
        # category replaces the previously cached model.
        self._loaded[model_category] = model
        return model  # type: ignore

    def get(self, model_category: str, model_type: Optional[Type[T]] = None) -> T:
        """
        Return a previously loaded model, optionally asserting its concrete type.
        """
        if model_category not in self._loaded:
            raise ValueError(f'Model "{model_category}" not previously loaded with "load_model(..)"')
        ret = self._loaded[model_category]
        if model_type:
            assert isinstance(ret, model_type)
        return ret  # type: ignore # FIXME: convince typing that we're returning generic type

    def _load_ocr_model(self, variant: str) -> AnyModel:
        """
        Load OCR model
        """
        ocr_model_dir = self.model_path('ocr', variant)
        if variant == 'tr':
            from transformers import VisionEncoderDecoderModel
            ret = VisionEncoderDecoderModel.from_pretrained(ocr_model_dir)
            assert isinstance(ret, VisionEncoderDecoderModel)
            return ret
        else:
            ocr_model = load_model(ocr_model_dir, compile=False)
            assert isinstance(ocr_model, KerasModel)
            # expose only the image -> dense2 sub-network used for inference
            return KerasModel(
                ocr_model.get_layer(name="image").input,  # type: ignore
                ocr_model.get_layer(name="dense2").output,  # type: ignore
            )

    def _load_characters(self) -> List[str]:
        """
        Load encoding for OCR
        """
        with open(self.model_path('num_to_char'), "r") as config_file:
            return json.load(config_file)

    def _load_num_to_char(self) -> StringLookup:
        """
        Load decoder for OCR
        """
        characters = self._load_characters()
        # Mapping characters to integers.
        char_to_num = StringLookup(vocabulary=characters, mask_token=None)
        # Mapping integers back to original characters.
        return StringLookup(vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True)

    def __str__(self):
        # Render all specs as a github-flavoured table, marking which model
        # files are actually installed under model_basedir.
        return tabulate(
            [
                [
                    spec.type,
                    spec.category,
                    spec.variant,
                    spec.help,
                    ', '.join(spec.dists),
                    f'Yes, at {self.model_path(spec.category, spec.variant)}'
                    if self.model_path(spec.category, spec.variant).exists()
                    else f'No, download {spec.dist_url}',
                ]
                for spec in self.specs.specs
            ],
            headers=[
                'Type',
                'Category',
                'Variant',
                'Help',
                'Used in',
                'Installed',
            ],
            tablefmt='github',
        )

    def shutdown(self):
        """
        Drop all references to loaded models so they can be garbage-collected.
        """
        if hasattr(self, '_loaded') and getattr(self, '_loaded'):
            for needle in list(self._loaded.keys()):
                del self._loaded[needle]

View file

@ -0,0 +1,54 @@
from dataclasses import dataclass
from typing import Dict, List, Set, Tuple


@dataclass
class EynollahModelSpec():
    """
    Describing a single model abstractly.
    """
    category: str
    # Relative filename to the models_eynollah directory in the dists
    filename: str
    # basename of the ZIP files that should contain this model
    dists: List[str]
    # URL to the smallest model distribution containing this model (link to Zenodo)
    dist_url: str
    # kind of artefact, e.g. 'Keras', 'decoder', 'TrOCRProcessor'
    type: str
    variant: str = ''
    help: str = ''


class EynollahModelSpecSet():
    """
    List of all used models for eynollah.

    Provides deterministic ordering and (category, variant) lookup.
    """
    specs: List[EynollahModelSpec]

    def __init__(self, specs: List[EynollahModelSpec]) -> None:
        # Sort by category, then variant ('0' separator keeps the empty
        # variant before any named variant of the same category).
        self.specs = sorted(specs, key=lambda x: x.category + '0' + x.variant)
        self.categories: Set[str] = set([spec.category for spec in self.specs])
        # category -> set of known variants for that category
        self.variants: Dict[str, Set[str]] = {
            spec.category: set([x.variant for x in self.specs if x.category == spec.category])
            for spec in self.specs
        }
        # (category, variant) -> spec, for O(1) lookup in get()
        self._index_category_variant: Dict[Tuple[str, str], EynollahModelSpec] = {
            (spec.category, spec.variant): spec
            for spec in self.specs
        }

    def asdict(self) -> Dict[str, Dict[str, str]]:
        """
        Map each category to a dict of all its variants' filenames.

        BUGFIX: the previous dict comprehension built a fresh single-entry
        dict per spec, so for categories with several variants only the last
        variant survived. Aggregate per category instead.
        """
        ret: Dict[str, Dict[str, str]] = {}
        for spec in self.specs:
            ret.setdefault(spec.category, {})[spec.variant] = spec.filename
        return ret

    def get(self, category: str, variant: str) -> EynollahModelSpec:
        """
        Look up a spec by (category, variant).

        Raises:
            ValueError: if the category or variant is unknown.
        """
        if category not in self.categories:
            raise ValueError(f"Unknown category '{category}', must be one of {self.categories}")
        if variant not in self.variants[category]:
            raise ValueError(f"Unknown variant {variant} for {category}. Known variants: {self.variants[category]}")
        return self._index_category_variant[(category, variant)]

View file

@ -0,0 +1,7 @@
from typing import TypeVar

# NOTE: Creating an actual union type requires loading transformers which is expensive and error-prone
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# AnyModel = Union[VisionEncoderDecoderModel, TrOCRProcessor, KerasModel, List]
# Placeholder alias: any loaded model artefact (Keras model, processor, list, ...)
AnyModel = object
# Generic type variable used by EynollahModelZoo.get() to narrow the return type
T = TypeVar('T')

View file

@ -83,10 +83,10 @@
}, },
"resources": [ "resources": [
{ {
"url": "https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1", "url": "https://zenodo.org/records/17295988/files/models_layout_v0_6_0.tar.gz?download=1",
"name": "models_layout_v0_5_0", "name": "models_layout_v0_6_0",
"type": "archive", "type": "archive",
"path_in_archive": "models_layout_v0_5_0", "path_in_archive": "models_layout_v0_6_0",
"size": 3525684179, "size": 3525684179,
"description": "Models for layout detection, reading order detection, textline detection, page extraction, column classification, table detection, binarization, image enhancement", "description": "Models for layout detection, reading order detection, textline detection, page extraction, column classification, table detection, binarization, image enhancement",
"version_range": ">= v0.5.0" "version_range": ">= v0.5.0"

View file

@ -34,6 +34,7 @@ class SbbBinarizeProcessor(Processor):
Set up the model prior to processing. Set up the model prior to processing.
""" """
# resolve relative path via OCR-D ResourceManager # resolve relative path via OCR-D ResourceManager
assert isinstance(self.parameter, dict)
model_path = self.resolve_resource(self.parameter['model']) model_path = self.resolve_resource(self.parameter['model'])
self.binarizer = SbbBinarizer(model_dir=model_path, logger=self.logger) self.binarizer = SbbBinarizer(model_dir=model_path, logger=self.logger)

View file

@ -0,0 +1,52 @@
from keras import layers
import tensorflow as tf
# ViT-style patch-embedding hyperparameters. These are module-level constants
# baked into the layer classes below, so they must match the trained model.
projection_dim = 64
# with patch_size == 1 every "patch" is a single pixel of the feature map
patch_size = 1
# 21x21 grid; the commented alternatives are grids used by other model variants
num_patches =21*21#14*14#28*28#14*14#28*28
class PatchEncoder(layers.Layer):
    """Project flattened patches to `projection_dim` and add a learned
    position embedding (ViT-style patch encoding).

    Uses the module-level constants `projection_dim` and `num_patches`.
    """

    def __init__(self, **kwargs):
        # get_config() below emits 'num_patches' and the two sub-layers;
        # accept and discard those keys here so that keras can round-trip
        # this layer through from_config() (the original signature took no
        # arguments at all, which broke deserialization). Remaining kwargs
        # (name, dtype, ...) are forwarded to the base Layer.
        for config_only in ('num_patches', 'projection', 'position_embedding'):
            kwargs.pop(config_only, None)
        super().__init__(**kwargs)
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(input_dim=num_patches, output_dim=projection_dim)

    def call(self, patch):
        # positions = [0 .. num_patches-1]; each patch projection gets its
        # learned positional offset added.
        positions = tf.range(start=0, limit=num_patches, delta=1)
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

    def get_config(self):
        # Keys kept identical to the original so previously saved models
        # still load.
        config = super().get_config().copy()
        config.update({
            'num_patches': num_patches,
            'projection': self.projection,
            'position_embedding': self.position_embedding,
        })
        return config
class Patches(layers.Layer):
    """Split an image batch into flattened (patch_size x patch_size) patches,
    returning a tensor of shape (batch, num_patches, patch_dims)."""

    def __init__(self, **kwargs):
        # 'patch_size' is written by get_config(); swallow it on reload and
        # use the module-level constant as the authoritative value. Forward
        # the remaining kwargs (name, dtype, ...) to the base Layer — the
        # original `super(Patches, self).__init__()` dropped them entirely.
        kwargs.pop('patch_size', None)
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        batch_size = tf.shape(images)[0]
        # Non-overlapping patches (stride == size); with patch_size == 1 this
        # effectively flattens each pixel into its own patch.
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        patch_dims = patches.shape[-1]
        # Collapse the spatial patch grid into a single sequence axis.
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'patch_size': self.patch_size,
        })
        return config

View file

@ -40,8 +40,8 @@ class EynollahPlotter:
self.image_filename_stem = image_filename_stem self.image_filename_stem = image_filename_stem
# XXX TODO hacky these cannot be set at init time # XXX TODO hacky these cannot be set at init time
self.image_org = image_org self.image_org = image_org
self.scale_x = scale_x self.scale_x : float = scale_x
self.scale_y = scale_y self.scale_y : float = scale_y
def save_plot_of_layout_main(self, text_regions_p, image_page): def save_plot_of_layout_main(self, text_regions_p, image_page):
if self.dir_of_layout is not None: if self.dir_of_layout is not None:

View file

@ -32,8 +32,8 @@ class EynollahProcessor(Processor):
allow_scaling=self.parameter['allow_scaling'], allow_scaling=self.parameter['allow_scaling'],
headers_off=self.parameter['headers_off'], headers_off=self.parameter['headers_off'],
tables=self.parameter['tables'], tables=self.parameter['tables'],
logger=self.logger
) )
self.eynollah.logger = self.logger
self.eynollah.plotter = None self.eynollah.plotter = None
def shutdown(self): def shutdown(self):

View file

@ -2,18 +2,24 @@
Tool to load model and binarize a given image. Tool to load model and binarize a given image.
""" """
import sys # pyright: reportIndexIssue=false
from glob import glob # pyright: reportCallIssue=false
# pyright: reportArgumentType=false
# pyright: reportPossiblyUnboundVariable=false
import os import os
import logging import logging
from pathlib import Path
from typing import Dict, Optional
import numpy as np import numpy as np
from PIL import Image
import cv2 import cv2
from ocrd_utils import tf_disable_interactive_logs from ocrd_utils import tf_disable_interactive_logs
from eynollah.model_zoo import EynollahModelZoo
from eynollah.model_zoo.types import AnyModel
tf_disable_interactive_logs() tf_disable_interactive_logs()
import tensorflow as tf import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.python.keras import backend as tensorflow_backend from tensorflow.python.keras import backend as tensorflow_backend
from .utils import is_image_filename from .utils import is_image_filename
@ -23,40 +29,40 @@ def resize_image(img_in, input_height, input_width):
class SbbBinarizer: class SbbBinarizer:
def __init__(self, model_dir, logger=None): def __init__(
self.model_dir = model_dir self,
self.log = logger if logger else logging.getLogger('SbbBinarizer') *,
model_zoo: EynollahModelZoo,
self.start_new_session() mode: str,
logger: Optional[logging.Logger] = None,
self.model_files = glob(self.model_dir+"/*/", recursive = True) ):
self.logger = logger if logger else logging.getLogger('eynollah.binarization')
self.models = [] self.model_zoo = model_zoo
for model_file in self.model_files: self.models = self.setup_models(mode)
self.models.append(self.load_model(model_file)) self.session = self.start_new_session()
def start_new_session(self): def start_new_session(self):
config = tf.compat.v1.ConfigProto() config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True config.gpu_options.allow_growth = True
self.session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() session = tf.compat.v1.Session(config=config) # tf.InteractiveSession()
tensorflow_backend.set_session(self.session) tensorflow_backend.set_session(session)
return session
def setup_models(self, mode: str) -> Dict[Path, AnyModel]:
return {
self.model_zoo.model_path(v): self.model_zoo.load_model(v)
for v in (['binarization'] if mode == 'single' else [f'binarization_multi_{i}' for i in range(1, 5)])
}
def end_session(self): def end_session(self):
tensorflow_backend.clear_session() tensorflow_backend.clear_session()
self.session.close() self.session.close()
del self.session del self.session
def load_model(self, model_name): def predict(self, model, img, use_patches, n_batch_inference=5):
model = load_model(os.path.join(self.model_dir, model_name), compile=False)
model_height = model.layers[len(model.layers)-1].output_shape[1] model_height = model.layers[len(model.layers)-1].output_shape[1]
model_width = model.layers[len(model.layers)-1].output_shape[2] model_width = model.layers[len(model.layers)-1].output_shape[2]
n_classes = model.layers[len(model.layers)-1].output_shape[3]
return model, model_height, model_width, n_classes
def predict(self, model_in, img, use_patches, n_batch_inference=5):
tensorflow_backend.set_session(self.session)
model, model_height, model_width, n_classes = model_in
img_org_h = img.shape[0] img_org_h = img.shape[0]
img_org_w = img.shape[1] img_org_w = img.shape[1]
@ -324,9 +330,8 @@ class SbbBinarizer:
if image_path is not None: if image_path is not None:
image = cv2.imread(image_path) image = cv2.imread(image_path)
img_last = 0 img_last = 0
for n, (model, model_file) in enumerate(zip(self.models, self.model_files)): for n, (model_file, model) in enumerate(self.models.items()):
self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files))) self.logger.info('Predicting %s with model %s [%s/%s]', image_path if image_path else '[image]', model_file, n + 1, len(self.models.keys()))
res = self.predict(model, image, use_patches) res = self.predict(model, image, use_patches)
img_fin = np.zeros((res.shape[0], res.shape[1], 3)) img_fin = np.zeros((res.shape[0], res.shape[1], 3))
@ -345,17 +350,19 @@ class SbbBinarizer:
img_last[:, :][img_last[:, :] > 0] = 255 img_last[:, :][img_last[:, :] > 0] = 255
img_last = (img_last[:, :] == 0) * 255 img_last = (img_last[:, :] == 0) * 255
if output: if output:
self.logger.info('Writing binarized image to %s', output)
cv2.imwrite(output, img_last) cv2.imwrite(output, img_last)
return img_last return img_last
else: else:
ls_imgs = list(filter(is_image_filename, os.listdir(dir_in))) ls_imgs = list(filter(is_image_filename, os.listdir(dir_in)))
for image_name in ls_imgs: self.logger.info("Found %d image files to binarize in %s", len(ls_imgs), dir_in)
for i, image_name in enumerate(ls_imgs):
image_stem = image_name.split('.')[0] image_stem = image_name.split('.')[0]
print(image_name,'image_name') self.logger.info('Binarizing [%3d/%d] %s', i + 1, len(ls_imgs), image_name)
image = cv2.imread(os.path.join(dir_in,image_name) ) image = cv2.imread(os.path.join(dir_in,image_name) )
img_last = 0 img_last = 0
for n, (model, model_file) in enumerate(zip(self.models, self.model_files)): for n, (model_file, model) in enumerate(self.models.items()):
self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files))) self.logger.info('Predicting %s with model %s [%s/%s]', image_name, model_file, n + 1, len(self.models.keys()))
res = self.predict(model, image, use_patches) res = self.predict(model, image, use_patches)
@ -375,4 +382,6 @@ class SbbBinarizer:
img_last[:, :][img_last[:, :] > 0] = 255 img_last[:, :][img_last[:, :] > 0] = 255
img_last = (img_last[:, :] == 0) * 255 img_last = (img_last[:, :] == 0) * 255
cv2.imwrite(os.path.join(output, image_stem + '.png'), img_last) output_filename = os.path.join(output, image_stem + '.png')
self.logger.info('Writing binarized image to %s', output_filename)
cv2.imwrite(output_filename, img_last)

View file

@ -19,7 +19,6 @@ from .contour import (contours_in_same_horizon,
find_new_features_of_contours, find_new_features_of_contours,
return_contours_of_image, return_contours_of_image,
return_parent_contours) return_parent_contours)
def pairwise(iterable): def pairwise(iterable):
# pairwise('ABCDEFG') → AB BC CD DE EF FG # pairwise('ABCDEFG') → AB BC CD DE EF FG
@ -393,7 +392,12 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8):
z = gaussian_filter1d(regions_without_separators_0, sigma_) z = gaussian_filter1d(regions_without_separators_0, sigma_)
return np.std(z) return np.std(z)
def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): def find_num_col(
regions_without_separators,
num_col_classifier,
tables,
multiplier=3.8,
):
if not regions_without_separators.any(): if not regions_without_separators.any():
return 0, [] return 0, []
#plt.imshow(regions_without_separators) #plt.imshow(regions_without_separators)

View file

@ -357,7 +357,7 @@ def join_polygons(polygons: Sequence[Polygon], scale=20) -> Polygon:
assert jointp.geom_type == 'Polygon', jointp.wkt assert jointp.geom_type == 'Polygon', jointp.wkt
# follow-up calculations will necessarily be integer; # follow-up calculations will necessarily be integer;
# so anticipate rounding here and then ensure validity # so anticipate rounding here and then ensure validity
jointp2 = set_precision(jointp, 1.0) jointp2 = set_precision(jointp, 1.0, mode="keep_collapsed")
if jointp2.geom_type != 'Polygon' or not jointp2.is_valid: if jointp2.geom_type != 'Polygon' or not jointp2.is_valid:
jointp2 = Polygon(np.round(jointp.exterior.coords)) jointp2 = Polygon(np.round(jointp.exterior.coords))
jointp2 = make_valid(jointp2) jointp2 = make_valid(jointp2)

View file

@ -2,15 +2,15 @@
# pylint: disable=import-error # pylint: disable=import-error
from pathlib import Path from pathlib import Path
import os.path import os.path
from typing import Optional
import logging
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from .utils.xml import create_page_xml, xml_reading_order from .utils.xml import create_page_xml, xml_reading_order
from .utils.counter import EynollahIdCounter from .utils.counter import EynollahIdCounter
from ocrd_utils import getLogger
from ocrd_models.ocrd_page import ( from ocrd_models.ocrd_page import (
BorderType, BorderType,
CoordsType, CoordsType,
PcGtsType,
TextLineType, TextLineType,
TextEquivType, TextEquivType,
TextRegionType, TextRegionType,
@ -24,7 +24,7 @@ import numpy as np
class EynollahXmlWriter: class EynollahXmlWriter:
def __init__(self, *, dir_out, image_filename, curved_line,textline_light, pcgts=None): def __init__(self, *, dir_out, image_filename, curved_line,textline_light, pcgts=None):
self.logger = getLogger('eynollah.writer') self.logger = logging.getLogger('eynollah.writer')
self.counter = EynollahIdCounter() self.counter = EynollahIdCounter()
self.dir_out = dir_out self.dir_out = dir_out
self.image_filename = image_filename self.image_filename = image_filename
@ -32,10 +32,10 @@ class EynollahXmlWriter:
self.curved_line = curved_line self.curved_line = curved_line
self.textline_light = textline_light self.textline_light = textline_light
self.pcgts = pcgts self.pcgts = pcgts
self.scale_x = None # XXX set outside __init__ self.scale_x: Optional[float] = None # XXX set outside __init__
self.scale_y = None # XXX set outside __init__ self.scale_y: Optional[float] = None # XXX set outside __init__
self.height_org = None # XXX set outside __init__ self.height_org: Optional[int] = None # XXX set outside __init__
self.width_org = None # XXX set outside __init__ self.width_org: Optional[int] = None # XXX set outside __init__
@property @property
def image_filename_stem(self): def image_filename_stem(self):
@ -135,6 +135,7 @@ class EynollahXmlWriter:
# create the file structure # create the file structure
pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org)
page = pcgts.get_Page() page = pcgts.get_Page()
assert page
page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page))))
counter = EynollahIdCounter() counter = EynollahIdCounter()
@ -152,6 +153,7 @@ class EynollahXmlWriter:
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord, Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord,
skip_layout_reading_order)) skip_layout_reading_order))
) )
assert textregion.Coords
if conf_contours_textregions: if conf_contours_textregions:
textregion.Coords.set_conf(conf_contours_textregions[mm]) textregion.Coords.set_conf(conf_contours_textregions[mm])
page.add_TextRegion(textregion) page.add_TextRegion(textregion)
@ -168,6 +170,7 @@ class EynollahXmlWriter:
id=counter.next_region_id, type_='heading', id=counter.next_region_id, type_='heading',
Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord))
) )
assert textregion.Coords
if conf_contours_textregions_h: if conf_contours_textregions_h:
textregion.Coords.set_conf(conf_contours_textregions_h[mm]) textregion.Coords.set_conf(conf_contours_textregions_h[mm])
page.add_TextRegion(textregion) page.add_TextRegion(textregion)

View file

View file

@ -0,0 +1,47 @@
from typing import List
import pytest
import logging
from click.testing import CliRunner, Result
from eynollah.cli import main as eynollah_cli
@pytest.fixture
def run_eynollah_ok_and_check_logs(
    pytestconfig,
    caplog,
    model_dir,
    eynollah_subcommands,
    eynollah_log_filter,
):
    """
    Returns a helper that runs an eynollah CLI subcommand through a Click
    Runner, injecting the model directory (and, when pytest runs verbosely,
    DEBUG logging) into `args`, asserts a zero exit code, and checks that at
    least one captured log message starts with one of the fragments in
    `expected_logs`.

    NOTE(review): the assertion uses "any fragment matched" semantics, not
    "every fragment matched" — confirm callers do not rely on all-of
    semantics before tightening.
    """
    def _run_click_ok_logs(
        subcommand: 'str',
        args: List[str],
        expected_logs: List[str],
    ) -> Result:
        # Guards against typos in test code, not a property of the CLI.
        assert subcommand in eynollah_subcommands, f'subcommand {subcommand} must be one of {eynollah_subcommands}'
        # Model dir is an option of the top-level group, so it must precede
        # the subcommand name.
        args = [
            '-m', model_dir,
            subcommand,
            *args
        ]
        if pytestconfig.getoption('verbose') > 0:
            args = ['-l', 'DEBUG'] + args
        caplog.set_level(logging.INFO)
        runner = CliRunner()
        # Only records accepted by eynollah_log_filter are captured.
        with caplog.filtering(eynollah_log_filter):
            result = runner.invoke(eynollah_cli, args, catch_exceptions=False)
        assert result.exit_code == 0, result.stdout
        if expected_logs:
            logmsgs = [logrec.message for logrec in caplog.records]
            # "any" semantics: a single matching prefix anywhere suffices.
            assert any(logmsg.startswith(needle) for needle in expected_logs for logmsg in logmsgs), f'{expected_logs} not in {logmsgs}'
        return result
    return _run_click_ok_logs

View file

@ -0,0 +1,53 @@
import pytest
from PIL import Image
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        ["--no-patches"],
    ], ids=str)
def test_run_eynollah_binarization_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    tests_dir,
    options,
):
    """Binarize a single TIFF; the output PNG must exist and keep the input size."""
    infile = tests_dir / 'resources' / '2files' / 'kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.png'
    cli_args = ['-i', str(infile), '-o', str(outfile), *options]
    run_eynollah_ok_and_check_logs(
        'binarization',
        cli_args,
        ['Predicting'],
    )
    assert outfile.exists()
    # Binarization must not change the image dimensions.
    with Image.open(infile) as original_img:
        original_size = original_img.size
    with Image.open(outfile) as binarized_img:
        binarized_size = binarized_img.size
    assert original_size == binarized_size
def test_run_eynollah_binarization_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    image_resources,
):
    """Binarize every image in a directory; expect one output file per input."""
    expected_logs = [
        f'Predicting {image_resources[0].name}',
        f'Predicting {image_resources[1].name}',
    ]
    run_eynollah_ok_and_check_logs(
        'binarization',
        ['-di', str(resources_dir), '-o', str(tmp_path)],
        expected_logs,
    )
    assert len(list(tmp_path.iterdir())) == 2

View file

@ -0,0 +1,52 @@
import pytest
from PIL import Image
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        ["-sos"],
    ], ids=str)
def test_run_eynollah_enhancement_filename(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
    options,
):
    """
    Run the `enhancement` subcommand on a single TIFF and compare the output
    image dimensions against the input.
    """
    infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
    run_eynollah_ok_and_check_logs(
        'enhancement',
        [
            '-i', str(infile),
            '-o', str(outfile.parent),
        ] + options,
        [
            'Image was enhanced',
        ]
    )
    with Image.open(infile) as original_img:
        original_size = original_img.size
    with Image.open(outfile) as enhanced_img:
        enhanced_size = enhanced_img.size
    # "-sos" presumably means "save original scale" (keep input dimensions);
    # without it the enhanced image is expected to differ in size.
    # NOTE(review): confirm the option semantics against the CLI help.
    assert (original_size == enhanced_size) == ("-sos" in options)
def test_run_eynollah_enhancement_directory(
    tmp_path,
    resources_dir,
    image_resources,
    run_eynollah_ok_and_check_logs,
):
    """Enhance every image in a directory; expect one output file per input."""
    run_eynollah_ok_and_check_logs(
        'enhancement',
        ['-di', str(resources_dir), '-o', str(tmp_path)],
        [
            f'Image {image_resources[0]} was enhanced',
            f'Image {image_resources[1]} was enhanced',
        ],
    )
    assert len(list(tmp_path.iterdir())) == 2

View file

@ -0,0 +1,128 @@
import pytest
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        #["--allow_scaling", "--curved-line"],
        ["--allow_scaling", "--curved-line", "--full-layout"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
         "--textline_light", "--light_version"],
        # -ep ...
        # -eoi ...
        # FIXME: find out whether OCR extra was installed, otherwise skip these
        ["--do_ocr"],
        ["--do_ocr", "--light_version", "--textline_light"],
        ["--do_ocr", "--transformer_ocr"],
        #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
        ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
        # --skip_layout_and_reading_order
    ], ids=str)
def test_run_eynollah_layout_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """
    Run the `layout` subcommand on a single TIFF under several option
    combinations and sanity-check region/line counts in the resulting
    PAGE-XML.
    """
    infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    run_eynollah_ok_and_check_logs(
        'layout',
        [
            '-i', str(infile),
            '-o', str(outfile.parent),
        ] + options,
        [
            str(infile)
        ]
    )
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    lines = tree.xpath("//page:TextLine", namespaces=NS)
    # exact count expected to be stable for this page across all option sets
    assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line
@pytest.mark.parametrize(
    "options",
    [
        ["--tables"],
        ["--tables", "--full-layout"],
        ["--tables", "--full-layout", "--textline_light", "--light_version"],
    ], ids=str)
def test_run_eynollah_layout_filename2(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
    options,
):
    """
    Run the `layout` subcommand with table detection enabled on a page that
    contains a table, and sanity-check the detected region/line counts.
    """
    infile = resources_dir / 'euler_rechenkunst01_1738_0025.tif'
    outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
    run_eynollah_ok_and_check_logs(
        'layout',
        [
            '-i', str(infile),
            '-o', str(outfile.parent),
        ] + options,
        [
            str(infile)
        ]
    )
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    regions = tree.xpath("//page:TableRegion", namespaces=NS)
    # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
    assert len(regions) >= 1, "result is inaccurate"
    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    lines = tree.xpath("//page:TextLine", namespaces=NS)
    assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line
def test_run_eynollah_layout_directory(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """Run layout analysis over a whole directory; expect one output per input."""
    run_eynollah_ok_and_check_logs(
        'layout',
        ['-di', str(resources_dir), '-o', str(tmp_path)],
        ['Job done in', 'All jobs done in'],
    )
    assert len(list(tmp_path.iterdir())) == 2
# def test_run_eynollah_layout_marginalia(
# tmp_path,
# resources_dir,
# run_eynollah_ok_and_check_logs,
# ):
# outdir = tmp_path
# outfile = outdir / 'estor_rechtsgelehrsamkeit02_1758_0880_800px.xml'
# run_eynollah_ok_and_check_logs(
# 'layout',
# [
# '-i', str(resources_dir / 'estor_rechtsgelehrsamkeit02_1758_0880_800px.jpg'),
# '-o', str(outdir),
# ],
# [
# 'Job done in',
# 'All jobs done in',
# ]
# )
# assert outfile.exists()
# tree = page_from_file(str(outfile)).etree
# regions = tree.xpath('//page:TextRegion[type="marginalia"]', namespaces=NS)
# assert len(regions) == 5, "expected 5 marginalia regions"

View file

@ -0,0 +1,47 @@
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
def test_run_eynollah_mbreorder_filename(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """
    Run machine-based reading-order detection on a single PAGE-XML file and
    check the resulting region order against a known-good sequence.
    """
    infile = resources_dir / 'kant_aufklaerung_1784_0020.xml'
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    run_eynollah_ok_and_check_logs(
        'machine-based-reading-order',
        [
            '-i', str(infile),
            '-o', str(outfile.parent),
        ],
        [
            # FIXME: mbreorder has no logging!
        ]
    )
    assert outfile.exists()
    #in_tree = page_from_file(str(infile)).etree
    #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    out_tree = page_from_file(str(outfile)).etree
    out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    #assert len(out_order) >= 2, "result is inaccurate"
    #assert in_order != out_order
    # Hard-coded expected order for this fixture page; update if the model
    # or the input PAGE-XML changes.
    assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
def test_run_eynollah_mbreorder_directory(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """Reorder all PAGE-XML files in a directory; expect one output per input."""
    run_eynollah_ok_and_check_logs(
        'machine-based-reading-order',
        ['-di', str(resources_dir), '-o', str(tmp_path)],
        [],  # FIXME: mbreorder has no logging!
    )
    assert len(list(tmp_path.iterdir())) == 2

View file

@ -0,0 +1,64 @@
import pytest
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@pytest.mark.parametrize(
    "options",
    [
        ["-trocr"],
        [], # defaults
        # the render output path is inserted after "-doit" at test run time
        ["-doit",  # str(outrenderfile.parent)
         ],
    ], ids=str)
def test_run_eynollah_ocr_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """
    Run the `ocr` subcommand on a single TIFF (optionally with the
    transformer model or rendered-image output) and sanity-check the
    extracted text in the resulting PAGE-XML.
    """
    infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    outrenderfile = tmp_path / 'render' / 'kant_aufklaerung_1784_0020.png'
    outrenderfile.parent.mkdir()
    if "-doit" in options:
        # Work on a copy: `options` is the shared list object from the
        # parametrize table above; insert()ing into it in place would leak
        # this tmp_path into every later use of the same parameter set.
        options = options.copy()
        options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
    run_eynollah_ok_and_check_logs(
        'ocr',
        [
            '-i', str(infile),
            '-dx', str(infile.parent),
            '-o', str(outfile.parent),
        ] + options,
        [
            # FIXME: ocr has no logging!
        ]
    )
    assert outfile.exists()
    if "-doit" in options:
        assert outrenderfile.exists()
    #in_tree = page_from_file(str(infile)).etree
    #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    out_tree = page_from_file(str(outfile)).etree
    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
    # Loose sanity bounds: enough lines and enough characters overall.
    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
def test_run_eynollah_ocr_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
):
    """OCR every image in a directory; expect one PAGE-XML output per input."""
    run_eynollah_ok_and_check_logs(
        'ocr',
        [
            '-di', str(resources_dir),
            '-dx', str(resources_dir),
            '-o', str(tmp_path),
        ],
        [],  # FIXME: ocr has no logging!
    )
    assert len(list(tmp_path.iterdir())) == 2

37
tests/conftest.py Normal file
View file

@ -0,0 +1,37 @@
from glob import glob
import os
import pytest
from pathlib import Path
@pytest.fixture()
def tests_dir():
    # Absolute path of the directory containing this conftest.py
    return Path(__file__).parent.resolve()
@pytest.fixture()
def model_dir(tests_dir):
    """Model directory: $EYNOLLAH_MODELS_DIR if set, else the repo root."""
    default_dir = str(tests_dir.joinpath('..').resolve())
    return os.environ.get('EYNOLLAH_MODELS_DIR', default_dir)
@pytest.fixture()
def resources_dir(tests_dir):
    """Directory holding the two-file sample image/PAGE-XML resources."""
    return tests_dir.joinpath('resources', '2files')
@pytest.fixture()
def image_resources(resources_dir):
    """All TIFF images in the resources directory, in sorted order.

    Sorting makes tests that index into this list (image_resources[0], [1])
    deterministic — glob() returns entries in arbitrary filesystem order.
    """
    return sorted(Path(x) for x in glob(str(resources_dir / '*.tif')))
@pytest.fixture()
def eynollah_log_filter():
    # Predicate for caplog.filtering(): keep only records emitted by
    # loggers in the 'eynollah' hierarchy.
    return lambda logrec: logrec.name.startswith('eynollah')
@pytest.fixture
def eynollah_subcommands():
    """Names of all eynollah CLI subcommands the test helper accepts."""
    return ('binarization layout ocr enhancement '
            'machine-based-reading-order models').split()

Binary file not shown.

After

Width:  |  Height:  |  Size: 221 KiB

View file

@ -0,0 +1,235 @@
<?xml version="1.0" encoding="UTF-8"?>
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
<pc:Metadata>
<pc:Creator>SBB_QURATOR</pc:Creator>
<pc:Created>2025-10-30T16:38:21.180191</pc:Created>
<pc:LastChange>2025-10-30T16:38:21.180191</pc:LastChange>
</pc:Metadata>
<pc:Page imageFilename="/home/vahid/Downloads/estor_rechtsgelehrsamkeit02_1758_0880_800px.jpg" imageWidth="800" imageHeight="1376" readingDirection="left-to-right" textLineOrder="top-to-bottom">
<pc:Border>
<pc:Coords points="780,40 780,42 780,43 736,43 736,45 735,46 675,46 675,48 674,49 627,49 627,52 626,52 570,52 570,55 569,56 530,56 530,57 529,58 472,58 472,60 471,61 296,61 296,60 296,58 66,58 65,57 65,56 50,56 50,57 49,58 47,58 47,60 46,61 44,61 44,64 46,64 47,64 47,79 48,79 48,80 48,162 48,163 47,163 47,340 48,340 48,341 48,364 48,365 47,365 47,444 48,444 48,445 48,1052 49,1052 50,1053 50,1182 51,1182 52,1183 52,1332 74,1332 74,1330 75,1329 76,1329 76,1330 76,1332 78,1332 78,1330 79,1329 80,1329 80,1330 80,1332 87,1332 87,1330 88,1329 272,1329 272,1327 273,1326 380,1326 380,1324 380,1323 401,1323 401,1320 402,1320 423,1320 423,1318 424,1317 444,1317 444,1315 444,1314 458,1314 458,1312 459,1311 472,1311 472,1308 473,1308 512,1308 512,1305 512,1304 569,1304 569,1302 570,1301 601,1301 601,1300 602,1299 624,1299 624,1296 625,1296 644,1296 644,1293 644,1292 708,1292 708,1290 709,1289 790,1289 790,1287 791,1286 792,1286 792,1272 793,1271 794,1271 794,1152 793,1152 792,1151 792,1128 793,1127 794,1127 794,1108 793,1108 792,1108 792,1103 793,1102 794,1102 794,1075 793,1075 792,1074 792,829 793,828 794,828 794,820 793,820 792,820 792,789 791,789 790,788 790,786 791,785 792,785 792,784 791,784 790,783 790,697 791,696 792,696 792,685 791,685 790,684 790,632 789,632 788,632 788,525 788,525 787,524 787,212 786,212 785,211 785,43 784,43 783,42 783,40"/>
</pc:Border>
<pc:ReadingOrder>
<pc:OrderedGroup id="ro357564684568544579089">
<pc:RegionRefIndexed index="0" regionRef="region_0013"/>
<pc:RegionRefIndexed index="1" regionRef="region_0014"/>
<pc:RegionRefIndexed index="2" regionRef="region_0015"/>
<pc:RegionRefIndexed index="3" regionRef="region_0016"/>
<pc:RegionRefIndexed index="4" regionRef="region_0017"/>
<pc:RegionRefIndexed index="5" regionRef="region_0012"/>
<pc:RegionRefIndexed index="6" regionRef="region_0010"/>
<pc:RegionRefIndexed index="7" regionRef="region_0006"/>
<pc:RegionRefIndexed index="8" regionRef="region_0011"/>
<pc:RegionRefIndexed index="9" regionRef="region_0003"/>
<pc:RegionRefIndexed index="10" regionRef="region_0009"/>
<pc:RegionRefIndexed index="11" regionRef="region_0005"/>
<pc:RegionRefIndexed index="12" regionRef="region_0007"/>
<pc:RegionRefIndexed index="13" regionRef="region_0004"/>
<pc:RegionRefIndexed index="14" regionRef="region_0008"/>
<pc:RegionRefIndexed index="15" regionRef="region_0002"/>
<pc:RegionRefIndexed index="16" regionRef="region_0001"/>
</pc:OrderedGroup>
</pc:ReadingOrder>
<pc:TextRegion id="region_0001" orientation="1.241379310344827" type="paragraph">
<pc:Coords points="743,1172 740,1172 740,1172 716,1170 714,1172 709,1172 707,1172 702,1172 697,1177 697,1175 692,1175 690,1175 671,1175 668,1180 668,1177 668,1177 664,1180 664,1180 664,1201 666,1204 664,1206 668,1208 666,1208 666,1206 668,1211 668,1211 685,1213 690,1208 690,1211 690,1211 692,1208 697,1211 700,1208 704,1208 709,1208 714,1208 719,1204 719,1206 731,1206 733,1204 733,1204 748,1206 750,1204 750,1204 748,1204 752,1206 755,1201 755,1201 752,1204 757,1201 755,1201 760,1194 760,1196 757,1196 760,1175 757,1172 757,1172 755,1170 755,1170 757,1172" conf="0.21158782745235"/>
<pc:TextLine id="region_0001_line_0001">
<pc:Coords points="751,1167 747,1169 745,1168 741,1170 744,1168 740,1171 736,1169 732,1172 735,1170 731,1172 731,1171 727,1173 728,1172 724,1174 723,1172 719,1175 719,1173 715,1176 716,1174 712,1176 712,1175 708,1177 688,1176 684,1178 675,1176 671,1179 672,1177 668,1180 671,1178 667,1180 665,1185 666,1183 664,1194 667,1198 665,1204 668,1208 666,1207 676,1212 680,1209 681,1211 685,1208 684,1210 688,1207 688,1208 692,1206 711,1208 715,1205 714,1207 718,1204 718,1206 722,1204 720,1205 718,1204 730,1206 734,1204 732,1205 736,1203 734,1204 738,1202 738,1204 742,1201 740,1203 744,1200 741,1202 746,1199 746,1200 750,1198 748,1200 752,1196 752,1198 757,1194 759,1190 758,1192 760,1188 759,1191 760,1183 758,1179 760,1174 754,1169 756,1170 758,1172 756,1168 751,1167"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0002" orientation="1.241379310344827" type="paragraph">
<pc:Coords points="724,1086 719,1084 700,1084 697,1086 690,1084 685,1088 685,1088 680,1086 678,1088 671,1088 666,1088 652,1088 649,1093 649,1091 642,1091 640,1091 628,1091 623,1096 623,1093 623,1093 623,1093 623,1100 625,1100 625,1100 611,1093 606,1096 606,1096 592,1093 587,1096 587,1096 589,1096 568,1093 563,1096 560,1096 556,1100 556,1100 556,1100 558,1098 558,1100 558,1100 558,1100 558,1100 551,1096 546,1096 534,1096 529,1098 520,1096 515,1100 515,1100 498,1098 496,1100 500,1100 469,1098 467,1100 342,1100 340,1100 330,1100 330,1100 328,1103 328,1103 330,1103 330,1103 323,1100 323,1100 323,1100 320,1105 323,1105 323,1105 308,1100 308,1100 294,1100 289,1100 289,1100 287,1129 287,1129 287,1129 284,1132 284,1132 284,1129 287,1129 287,1132 263,1129 258,1132 258,1132 258,1129 260,1129 260,1132 253,1129 251,1132 251,1132 251,1129 248,1132 246,1158 248,1160 248,1160 248,1163 248,1163 282,1165 287,1160 287,1163 287,1163 284,1163 284,1163 289,1165 292,1163 292,1165 289,1165 342,1165 344,1160 344,1163 352,1163 352,1163 349,1160 356,1165 356,1165 378,1165 380,1160 380,1163 404,1163 407,1160 436,1163 433,1163 448,1163 450,1160 457,1163 462,1160 464,1160 469,1160 467,1160 469,1156 469,1158 472,1134 472,1136 469,1136 472,1136 474,1132 474,1134 479,1134 484,1132 481,1134 484,1132 498,1132 500,1129 500,1129 508,1132 508,1132 508,1132 510,1127 508,1127 508,1127 515,1132 520,1129 541,1129 546,1129 575,1129 577,1124 577,1127 611,1127 613,1124 623,1127 625,1124 637,1124 642,1124 652,1124 654,1120 654,1122 656,1122 659,1120 659,1120 664,1122 666,1120 680,1120 685,1117 685,1117 685,1120 683,1120 683,1117 707,1120 712,1117 712,1117 721,1120 719,1120 719,1117 728,1120 733,1117 733,1117 731,1120 733,1117 736,1117 738,1117 738,1117 740,1112 736,1110 736,1110 738,1112 738,1112 738,1112 736,1110 736,1110 736,1110 736,1110 736,1110 733,1110 733,1110 745,1115 750,1112 752,1115 757,1112 757,1108 757,1110 760,1105 760,1105 757,1108 760,1098 760,1100 762,1096 760,1093 760,1096 762,1088 760,1084 
760,1086" conf="0.285657570399826"/>
<pc:TextLine id="region_0002_line_0001">
<pc:Coords points="741,1081 737,1084 735,1082 731,1084 719,1083 715,1085 712,1084 714,1084 708,1083 711,1084 702,1082 697,1085 696,1089 696,1087 696,1085 692,1088 688,1086 684,1088 687,1087 682,1092 683,1090 684,1088 680,1091 680,1089 676,1092 667,1090 663,1092 644,1091 640,1093 639,1092 634,1095 626,1093 626,1093 626,1093 624,1098 624,1098 624,1098 601,1093 597,1096 603,1095 579,1093 581,1094 573,1092 569,1095 571,1093 567,1096 566,1094 562,1096 560,1095 555,1099 556,1097 559,1098 555,1096 551,1099 506,1097 502,1100 499,1098 495,1100 464,1099 466,1100 380,1098 375,1101 375,1101 374,1103 376,1101 379,1102 379,1102 360,1100 356,1102 348,1100 344,1103 342,1101 344,1102 346,1104 340,1100 343,1100 330,1099 330,1099 326,1102 328,1100 330,1101 330,1101 323,1099 323,1099 323,1099 320,1103 320,1103 320,1103 308,1099 310,1100 293,1098 289,1100 292,1099 288,1101 286,1108 287,1105 285,1117 288,1121 286,1122 288,1126 287,1126 289,1130 288,1128 293,1132 291,1132 299,1133 296,1132 311,1134 315,1132 318,1133 322,1131 326,1132 330,1130 341,1132 346,1126 345,1128 374,1130 372,1129 379,1128 373,1129 378,1131 376,1131 375,1129 391,1134 395,1132 395,1133 399,1131 396,1132 401,1126 401,1128 428,1129 432,1127 507,1128 511,1126 509,1128 507,1127 512,1128 516,1126 557,1128 561,1125 572,1127 576,1124 591,1126 595,1124 612,1125 616,1123 624,1124 628,1122 626,1124 630,1121 628,1123 632,1120 645,1122 649,1120 657,1121 661,1119 665,1120 669,1118 675,1120 679,1117 682,1119 686,1116 694,1118 698,1116 720,1117 724,1115 727,1116 731,1114 729,1116 734,1110 733,1112 735,1108 735,1109 732,1111 736,1108 736,1110 740,1108 739,1109 743,1107 741,1108 745,1106 744,1108 748,1105 746,1107 750,1104 748,1106 752,1104 749,1105 747,1104 751,1106 756,1102 758,1096 757,1099 759,1093 758,1096 760,1087 757,1083 741,1081"/>
</pc:TextLine>
<pc:TextLine id="region_0002_line_0002">
<pc:Coords points="412,1126 408,1128 400,1127 395,1130 397,1128 393,1131 394,1129 390,1132 387,1130 383,1132 376,1131 379,1132 373,1130 376,1131 360,1129 355,1132 356,1131 352,1133 353,1132 356,1132 350,1131 352,1132 348,1130 348,1129 339,1128 335,1130 327,1128 323,1131 320,1129 316,1132 308,1130 304,1132 296,1131 298,1132 290,1130 292,1131 292,1131 292,1131 292,1131 288,1134 288,1134 288,1134 291,1133 284,1128 286,1128 262,1127 262,1127 258,1130 258,1128 260,1129 256,1128 252,1130 253,1128 249,1131 252,1129 248,1132 250,1130 245,1158 248,1162 246,1160 248,1164 247,1162 283,1167 283,1167 287,1163 286,1164 284,1164 288,1165 292,1163 312,1164 316,1162 328,1164 325,1163 335,1164 332,1164 336,1165 340,1163 338,1164 342,1162 341,1164 345,1161 345,1163 349,1160 351,1156 350,1159 348,1160 352,1158 358,1160 362,1157 405,1159 403,1158 409,1160 407,1159 411,1160 408,1160 414,1161 412,1160 421,1162 425,1160 435,1161 432,1160 444,1162 449,1159 451,1155 452,1156 459,1157 463,1155 463,1156 467,1154 464,1156 469,1148 467,1144 461,1142 462,1141 464,1138 460,1133 431,1132 433,1132 428,1131 430,1132 423,1130 425,1131 421,1129 424,1130 425,1132 423,1128 419,1127 421,1128 412,1126"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0003" orientation="1.241379310344827" type="paragraph">
<pc:Coords points="738,520 731,520 728,520 726,520 726,520 721,522 714,520 712,524 712,522 700,522 695,527 695,524 635,524 635,524 637,524 630,522 628,527 628,524 620,524 616,524 606,524 601,529 601,527 572,527 570,527 568,527 568,527 568,527 565,529 565,529 565,529 563,527 563,527 553,527 551,527 522,527 517,529 390,527 388,532 388,532 373,529 371,532 361,532 364,532 349,529 347,532 335,532 332,532 335,532 335,532 313,532 313,532 292,529 289,532 287,536 289,539 289,551 289,548 287,553 289,558 289,558 289,556 287,558 287,558 287,560 287,560 287,558 284,560 282,560 280,560 272,560 268,565 268,563 263,563 260,565 260,565 258,563 260,563 253,563 251,565 251,563 248,568 248,568 248,577 248,575 246,599 246,596 246,618 248,620 253,620 253,620 349,623 354,620 354,620 354,620 359,623 361,620 385,620 388,620 397,620 400,616 400,618 400,618 402,616 402,618 404,594 404,596 404,592 407,594 404,594 409,592 409,594 407,594 414,594 419,592 428,594 431,592 433,592 436,589 436,589 520,592 522,589 529,589 534,589 551,589 548,589 556,589 558,589 558,589 558,589 611,589 608,589 623,592 625,589 659,589 656,589 695,592 697,589 697,589 700,589 700,589 702,587 709,589 712,584 714,584 712,587 714,584 714,584 719,584 719,584 721,580 721,582 726,582 724,582 724,582 726,584 726,584 728,584 733,584 738,584 740,580 740,582 750,582 752,582 755,582 757,577 757,580 757,577 757,577 757,580 760,575 760,575 762,570 760,568 760,539 760,541 762,527 760,524 760,524 757,522 757,522 760,524 752,520 755,520 748,520 750,520 738,520 740,520" conf="0.410852900020834"/>
<pc:TextLine id="region_0003_line_0001">
<pc:Coords points="735,516 731,519 732,517 728,523 728,520 727,524 728,522 728,520 724,523 720,521 716,524 716,522 712,524 675,523 671,525 640,524 643,524 637,523 640,524 634,522 636,523 638,525 631,520 626,526 627,524 624,522 620,524 617,523 613,525 613,524 609,526 604,524 600,527 598,525 600,526 587,524 583,527 567,525 567,525 567,525 564,528 567,529 567,529 553,525 549,528 544,526 540,528 543,527 539,529 540,528 536,530 533,528 529,531 513,529 509,532 502,530 504,531 500,529 501,529 448,528 451,528 440,527 442,528 428,526 424,528 404,527 400,529 395,528 391,530 376,528 372,531 370,529 366,532 332,530 335,531 312,529 315,530 308,531 297,529 293,532 296,530 292,532 290,536 291,534 289,546 292,550 290,552 292,556 291,556 293,560 318,562 322,560 326,561 330,559 359,560 363,558 365,560 363,559 372,560 372,560 370,559 374,560 372,560 370,557 373,562 388,564 392,561 410,563 414,560 416,556 416,558 420,560 424,557 464,559 468,556 516,558 514,557 521,559 520,559 526,560 530,558 528,560 533,556 531,558 535,556 533,557 537,555 548,556 552,554 591,556 588,555 592,556 591,556 616,558 614,557 619,559 623,556 623,558 627,556 624,557 630,553 667,555 671,552 687,554 684,553 695,555 699,552 704,554 708,552 706,553 710,551 708,552 712,550 717,552 721,549 727,551 731,548 732,550 736,548 736,549 740,547 740,548 744,546 743,548 747,545 746,547 751,544 752,540 752,541 750,543 754,540 753,542 758,536 757,539 759,533 758,536 760,526 757,522 740,520 741,520 743,522 740,518 735,516"/>
</pc:TextLine>
<pc:TextLine id="region_0003_line_0002">
<pc:Coords points="719,548 715,550 717,548 712,552 711,556 711,554 716,553 712,552 708,554 709,552 705,555 705,553 701,556 697,554 693,556 696,555 692,557 694,556 696,556 687,555 683,557 673,556 676,556 666,555 668,556 664,554 667,555 663,553 665,554 640,552 642,553 636,552 639,552 635,551 637,552 632,550 628,552 630,551 626,553 628,552 624,554 625,552 621,555 582,553 578,556 576,554 579,555 574,553 570,556 572,554 574,555 568,553 564,556 560,554 562,555 535,553 531,556 532,554 528,556 528,555 524,557 522,556 524,556 514,555 510,557 512,556 508,558 509,556 505,559 500,557 496,560 472,558 468,560 464,559 465,559 456,557 452,560 455,558 451,560 439,559 435,561 414,560 416,560 412,559 415,560 397,558 400,559 393,557 389,560 386,558 382,560 379,559 381,560 371,558 373,559 358,557 354,560 351,558 347,560 316,559 319,560 293,558 289,560 284,559 280,562 278,566 278,564 280,563 276,565 267,564 263,566 252,564 248,568 246,587 248,591 247,590 259,595 263,592 268,594 272,592 280,593 284,591 300,592 298,592 312,593 316,591 314,592 319,587 319,588 320,590 324,588 341,589 345,587 365,588 369,586 398,588 396,587 419,588 417,588 416,588 425,592 429,590 431,585 430,588 432,584 431,586 475,588 479,585 507,587 504,586 513,588 511,587 521,588 525,586 523,588 527,585 528,581 528,583 534,584 538,582 575,584 579,581 604,583 601,582 608,584 612,581 614,583 612,582 617,584 616,584 614,582 616,586 629,588 633,585 652,587 649,586 666,588 670,585 676,587 673,586 685,588 690,582 689,584 691,580 691,582 705,584 709,581 712,583 716,580 716,582 720,580 721,581 725,579 728,580 732,578 731,580 735,577 736,579 740,576 744,578 748,576 751,577 755,575 752,576 757,573 759,566 758,568 760,562 757,558 759,560 753,556 756,556 752,555 753,555 755,556 752,551 740,549 736,552 732,550 735,551 728,549 731,550 723,548 725,549 719,548"/>
</pc:TextLine>
<pc:TextLine id="region_0003_line_0003">
<pc:Coords points="360,584 356,586 357,584 353,587 342,585 338,588 330,586 326,588 322,587 318,589 309,588 305,590 305,588 308,589 283,588 278,593 279,591 277,596 277,594 257,592 253,595 250,593 246,596 244,615 269,620 273,617 321,619 319,618 326,617 320,618 324,620 324,620 338,622 342,620 349,621 353,619 379,620 383,618 384,614 384,616 389,617 393,615 396,616 400,614 402,608 400,604 392,602 394,603 396,600 393,596 395,597 392,593 392,593 392,593 369,594 369,594 369,594 371,596 372,590 369,585 360,584"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0004" orientation="1.241379310344827" type="paragraph">
<pc:Coords points="740,940 738,940 738,940 733,937 731,940 719,940 714,940 712,940 707,942 692,940 690,944 690,944 685,942 683,944 671,944 668,944 630,944 628,949 628,947 599,947 596,947 553,947 551,952 551,949 532,949 532,949 529,952 529,949 532,949 532,949 527,949 524,952 524,952 515,949 517,949 517,952 467,949 464,952 464,952 448,949 450,949 450,952 438,949 440,949 440,949 431,947 426,952 426,949 412,949 407,952 407,952 385,949 383,952 352,952 349,952 304,952 306,952 294,952 292,952 294,952 289,956 289,956 292,954 289,959 289,956 287,978 287,976 287,980 287,980 287,980 284,980 284,980 282,985 282,983 280,983 275,983 270,983 268,985 263,983 260,985 258,985 260,985 251,983 248,988 248,988 248,1009 248,1007 246,1036 248,1038 248,1038 248,1040 368,1043 373,1040 373,1040 371,1040 378,1043 380,1040 414,1040 412,1040 443,1043 445,1040 481,1040 486,1040 520,1040 522,1036 522,1038 568,1038 570,1036 570,1036 599,1038 601,1036 601,1036 604,1033 604,1033 606,1024 604,1021 604,1012 606,1012 606,1009 606,1009 606,1012 608,1009 623,1009 620,1009 628,1012 630,1009 642,1009 644,1009 659,1009 664,1007 661,1009 664,1004 664,1007 671,1007 676,1004 695,1007 697,1004 697,1004 695,1004 697,1007 702,1004 700,1004 702,1004 709,1004 712,1000 712,1002 714,1002 719,1002 719,1002 721,1000 721,1002 726,1000 726,1000 728,997 728,997 728,1000 731,997 733,997 738,997 740,997 745,995 752,997 757,992 757,995 755,995 757,990 757,992 760,988 760,990 757,990 760,942 757,940 757,940 760,940" conf="0.530462165249498"/>
<pc:TextLine id="region_0004_line_0001">
<pc:Coords points="748,935 744,937 743,936 739,938 735,936 731,939 720,937 716,940 714,945 715,943 713,947 713,945 716,944 712,946 710,944 706,947 700,945 696,948 682,946 678,948 671,947 667,949 646,948 648,948 649,949 646,944 636,943 632,945 635,944 631,946 631,944 627,947 608,945 604,948 599,946 595,948 597,947 592,950 588,948 584,951 569,949 565,952 550,950 552,951 548,949 550,950 498,948 494,951 496,949 492,952 480,950 476,952 476,951 472,953 468,952 471,952 466,951 462,953 461,952 457,954 450,952 452,953 448,952 450,952 444,951 446,952 439,950 440,950 442,952 439,947 430,945 425,951 426,948 416,947 412,949 411,948 407,950 402,948 398,951 365,949 361,952 304,950 307,951 295,949 291,952 293,950 288,957 289,955 292,953 287,959 288,956 286,969 288,973 287,972 289,976 288,978 290,982 294,984 292,983 313,984 311,984 336,985 340,983 338,984 345,979 379,980 383,978 389,980 387,979 394,980 398,978 406,980 404,979 408,980 406,980 411,982 416,979 416,980 420,978 467,980 471,977 524,979 528,976 529,978 527,977 542,979 546,976 604,978 603,978 601,976 608,980 605,980 616,981 620,979 621,975 621,976 619,978 623,976 632,977 636,975 653,976 657,974 660,976 664,973 670,975 674,972 680,974 684,972 690,973 694,971 699,972 703,970 704,972 708,969 709,971 713,968 711,970 715,968 712,969 717,966 717,968 721,965 724,967 728,964 728,966 732,964 732,965 736,963 736,964 740,962 739,964 743,961 742,963 746,960 745,962 749,960 748,961 752,959 749,960 753,958 752,960 756,957 758,953 757,956 759,940 756,936 748,935"/>
</pc:TextLine>
<pc:TextLine id="region_0004_line_0002">
<pc:Coords points="731,963 725,967 728,965 724,968 724,966 720,968 720,967 716,969 713,968 709,970 706,968 702,971 696,969 692,972 689,970 685,972 683,971 679,973 677,972 673,974 674,972 670,975 668,973 664,976 650,974 646,976 640,975 636,977 627,976 623,978 614,976 609,982 607,983 607,983 608,982 609,984 609,984 609,984 610,985 612,981 532,976 533,976 526,975 522,977 520,981 521,979 522,977 518,980 488,978 484,980 482,984 482,983 484,981 480,984 476,982 479,983 475,981 471,984 465,982 468,983 439,981 441,982 426,980 422,983 419,981 421,982 417,980 419,980 420,983 401,978 397,980 397,979 393,981 394,980 389,983 366,981 362,984 363,982 359,984 344,983 340,985 342,984 344,984 346,987 339,982 341,983 333,981 329,984 320,982 323,983 294,981 296,982 290,980 292,981 288,980 290,980 285,979 280,984 281,982 279,980 275,983 270,981 266,984 263,982 259,984 258,983 260,984 253,984 254,983 250,985 252,984 248,986 246,1008 273,1012 271,1012 281,1013 279,1012 289,1014 287,1013 292,1015 290,1014 304,1016 304,1016 304,1016 307,1010 305,1012 305,1012 305,1012 304,1013 314,1015 318,1012 319,1014 323,1012 324,1013 328,1011 330,1012 334,1010 341,1012 346,1006 345,1008 370,1010 368,1010 367,1008 369,1012 380,1013 384,1011 408,1012 405,1012 418,1013 422,1011 449,1012 454,1007 453,1009 455,1005 455,1007 462,1008 466,1006 545,1008 543,1007 541,1004 545,1010 552,1012 556,1008 558,1004 558,1006 556,1008 560,1005 626,1007 625,1008 633,1009 637,1007 641,1008 645,1006 664,1008 668,1005 685,1007 689,1004 694,1006 698,1004 696,1005 700,1003 701,998 702,999 702,1000 706,998 708,1000 712,997 714,999 718,996 717,998 722,995 720,996 724,994 722,996 726,993 724,995 728,992 730,994 734,992 734,993 738,991 739,992 743,990 742,992 746,989 745,991 749,988 747,990 752,987 753,983 752,985 747,986 747,986 751,989 756,986 757,982 756,984 758,978 757,980 759,968 741,964 744,964 731,963"/>
</pc:TextLine>
<pc:TextLine id="region_0004_line_0003">
<pc:Coords points="466,1006 462,1008 452,1007 448,1009 430,1008 426,1010 407,1008 403,1011 392,1009 394,1010 380,1008 382,1009 357,1008 360,1008 346,1007 342,1009 334,1008 330,1010 328,1008 324,1011 324,1009 320,1012 318,1010 314,1012 312,1011 308,1013 306,1012 302,1014 292,1012 294,1013 286,1012 288,1012 278,1011 280,1012 266,1010 268,1011 250,1009 245,1036 248,1040 246,1038 264,1043 261,1042 284,1044 288,1041 299,1043 296,1042 337,1044 342,1040 341,1042 341,1042 345,1036 344,1038 344,1038 344,1038 344,1040 354,1042 358,1040 360,1036 359,1038 356,1040 360,1037 361,1039 365,1036 451,1038 448,1037 447,1035 450,1040 461,1041 466,1038 468,1033 468,1035 496,1036 500,1034 572,1036 576,1033 571,1034 591,1036 595,1033 596,1035 600,1032 597,1034 601,1032 603,1027 600,1023 602,1025 596,1020 599,1021 592,1020 594,1020 596,1016 593,1012 595,1015 528,1010 524,1012 527,1011 529,1012 522,1010 524,1011 526,1012 524,1008 525,1011 504,1006 500,1008 484,1007 487,1008 466,1006"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0005" orientation="1.241379310344827" type="paragraph">
<pc:Coords points="736,664 731,664 728,664 724,664 724,664 719,666 719,664 716,668 716,668 716,666 712,668 656,668 652,668 620,668 618,673 618,671 604,671 606,671 606,671 582,668 584,668 575,668 572,668 467,668 464,673 464,671 450,671 452,671 452,671 419,668 416,673 416,673 414,676 414,676 414,676 416,676 416,676 416,676 419,673 419,673 419,673 412,668 414,668 409,668 407,668 400,668 397,673 397,671 371,671 366,671 335,671 332,673 304,671 301,676 301,676 299,673 301,673 301,676 299,671 294,676 294,676 294,673 289,676 292,676 289,695 289,695 287,700 287,700 287,700 287,700 287,700 284,702 284,702 284,702 277,700 277,700 277,700 277,700 260,700 260,700 277,702 277,702 277,702 292,695 292,695 289,697 289,697 289,678 289,678 292,680 289,676 282,676 284,676 265,676 268,676 253,673 251,676 251,676 248,676 248,683 248,680 246,697 246,695 246,702 246,702 248,700 244,719 248,721 246,721 246,786 248,788 246,817 248,822 248,836 248,839 248,851 253,853 251,853 251,889 251,894 251,904 253,906 251,906 306,908 304,908 318,908 320,908 335,908 337,904 337,906 356,906 359,904 359,904 368,906 371,901 371,904 380,904 383,901 380,904 385,899 385,901 385,892 383,887 385,887 385,889 385,880 385,880 385,877 385,877 385,877 383,880 388,877 421,877 419,877 428,880 431,877 467,877 472,877 472,877 474,875 474,877 476,872 476,875 546,875 546,875 544,872 599,877 596,877 613,877 618,872 618,875 623,875 620,875 620,875 642,877 647,875 644,877 649,872 649,875 690,875 692,872 692,875 695,872 709,872 712,872 712,872 714,868 714,870 721,870 726,870 724,870 726,868 726,870 728,868 726,868 731,865 731,865 750,868 752,865 752,865 757,858 757,860 757,856 757,858 760,856 760,856 757,858 760,750 757,748 757,748 760,743 760,745 757,745 760,666 757,664 757,664" conf="0.750609157462953"/>
<pc:TextLine id="region_0005_line_0001">
<pc:Coords points="734,660 730,663 730,661 726,664 726,662 721,665 720,671 720,671 720,671 721,673 722,671 724,669 720,672 716,670 712,672 707,671 703,673 698,672 694,674 670,672 666,675 664,673 666,674 660,672 662,673 664,675 660,670 655,668 651,671 652,669 648,672 650,670 646,672 647,671 649,672 635,670 631,672 619,671 621,672 614,670 610,672 609,671 612,672 604,670 607,671 603,669 605,670 533,668 536,669 523,668 525,668 513,667 509,669 492,668 495,668 476,667 472,669 467,668 463,670 456,668 452,671 444,669 440,672 418,670 418,670 418,670 415,673 417,674 417,674 417,674 420,674 416,668 411,667 413,668 409,666 405,668 400,667 395,672 396,670 370,668 366,671 343,669 339,672 344,671 336,669 332,672 304,670 304,670 300,674 300,672 302,673 302,673 298,670 293,676 294,674 293,672 288,692 291,696 289,696 295,701 292,700 308,702 312,700 315,701 319,699 336,700 340,698 363,700 367,697 420,699 418,698 426,700 424,700 423,697 425,701 437,703 442,700 475,701 472,700 488,702 493,698 498,700 502,697 683,699 687,696 692,698 696,696 708,697 712,695 718,696 722,694 725,696 729,693 734,695 738,692 736,694 740,692 738,693 742,691 742,692 746,690 748,686 752,683 750,684 748,684 752,685 756,683 757,679 756,681 758,676 757,678 759,668 756,664 747,662 749,663 745,661 748,662 734,660"/>
</pc:TextLine>
<pc:TextLine id="region_0005_line_0002">
<pc:Coords points="740,688 736,690 737,688 732,692 735,690 731,692 730,691 726,693 725,692 721,694 722,692 718,695 718,693 714,696 700,694 696,696 693,695 696,696 687,694 682,697 683,696 679,698 674,696 670,699 660,697 656,700 640,698 636,700 628,699 631,700 626,698 628,699 622,697 618,700 600,698 596,700 591,699 587,701 576,700 579,700 574,699 576,699 544,697 540,700 534,698 529,701 528,706 528,704 431,703 433,704 435,704 432,700 396,698 398,699 393,697 389,700 383,698 379,700 303,699 299,701 289,700 285,702 286,700 282,703 284,701 286,702 276,700 272,703 269,701 270,700 266,699 268,699 260,697 256,700 255,704 256,701 254,705 254,704 253,702 249,704 251,703 247,705 245,709 246,707 244,726 287,731 291,728 329,730 333,728 343,729 347,727 370,728 368,728 408,729 405,728 409,730 407,729 405,727 408,732 418,733 422,731 427,732 431,730 430,732 435,726 434,728 432,730 436,728 443,729 447,727 512,728 509,728 513,729 511,728 509,726 512,730 533,732 532,732 550,733 554,731 561,732 565,730 569,732 567,731 576,728 574,730 572,729 572,729 584,732 589,725 590,726 588,728 585,727 592,726 652,728 649,727 648,724 652,730 659,732 663,729 680,731 684,728 685,730 689,728 704,729 708,727 709,728 707,728 714,727 716,728 720,726 717,728 721,725 723,720 722,723 724,719 723,721 720,723 724,720 727,722 731,720 732,721 736,719 735,720 739,718 740,720 744,717 744,719 748,716 749,712 750,713 752,715 756,712 753,714 758,693 752,688 755,689 740,688"/>
</pc:TextLine>
<pc:TextLine id="region_0005_line_0003">
<pc:Coords points="740,715 736,718 738,716 733,720 733,718 729,720 730,719 726,721 728,720 724,722 727,720 723,723 725,721 721,724 723,722 719,724 720,723 716,725 708,724 703,731 703,731 704,728 704,728 704,728 704,728 705,732 705,732 705,732 705,732 689,724 685,727 660,725 656,728 637,726 632,732 632,732 632,732 632,732 635,736 636,733 636,732 636,732 632,733 577,732 580,732 581,734 578,729 564,728 560,730 530,728 532,729 520,728 522,728 513,727 516,728 496,726 498,727 490,725 486,728 488,726 484,728 481,727 477,729 443,728 445,728 431,727 427,729 387,728 383,730 381,735 382,732 380,736 381,734 383,732 379,735 377,733 380,734 366,732 362,735 325,733 327,733 328,734 326,730 305,728 308,729 298,728 294,730 274,728 270,731 250,729 246,732 244,758 247,762 245,761 259,766 263,764 281,765 285,763 283,764 287,762 287,764 291,761 298,763 302,760 305,762 303,761 316,763 320,760 324,762 328,760 336,761 336,761 335,760 333,759 350,764 354,761 362,763 360,762 376,764 380,761 382,756 382,758 425,760 423,759 427,760 424,760 431,761 428,760 439,762 443,760 443,761 447,759 478,760 476,760 484,761 488,759 485,760 483,760 491,761 495,759 502,760 506,758 504,760 502,759 509,758 512,760 516,757 520,759 524,756 551,758 555,756 613,757 612,758 619,760 623,757 623,759 627,756 624,758 628,756 632,757 636,755 656,756 653,756 664,757 662,756 684,758 689,752 690,753 696,755 700,752 704,754 708,752 707,753 712,750 714,752 718,749 722,751 726,748 727,750 731,748 732,749 736,747 740,748 744,746 744,748 748,745 745,747 750,740 749,743 751,738 752,739 749,740 753,738 752,740 756,737 758,733 757,736 759,724 756,720 746,718 748,719 750,720 748,716 740,715"/>
</pc:TextLine>
<pc:TextLine id="region_0005_line_0004">
<pc:Coords points="744,747 740,749 729,748 725,750 720,748 716,751 717,749 713,752 712,750 708,752 709,751 705,753 706,752 702,754 704,752 698,757 700,756 696,758 697,756 693,759 692,757 688,760 672,758 668,760 637,759 633,761 634,760 636,760 632,759 634,760 636,760 629,755 632,756 628,754 624,756 629,756 624,754 620,756 619,755 615,757 596,756 598,756 584,755 580,758 578,763 578,761 520,760 520,759 510,757 512,758 508,756 504,759 482,757 477,763 478,760 476,764 476,763 454,761 450,764 406,762 408,763 410,763 404,758 407,759 403,757 399,760 401,758 397,760 396,765 396,763 398,761 394,764 394,762 396,763 391,761 387,764 354,762 350,764 351,763 353,764 347,762 349,763 328,761 330,762 323,760 324,760 288,759 284,761 287,760 281,764 268,762 264,764 253,763 256,764 251,762 247,764 249,763 244,785 247,789 254,791 258,788 286,790 284,789 288,791 285,790 295,792 299,789 302,791 306,788 316,790 320,788 331,789 335,787 380,788 377,788 381,789 379,788 377,787 380,792 386,793 390,791 390,792 394,790 408,792 406,791 411,792 415,790 418,792 422,789 426,791 430,788 437,790 435,789 439,791 436,790 443,792 448,786 447,788 444,790 449,784 450,785 452,787 449,786 456,785 483,787 487,784 620,786 617,785 622,787 620,786 624,788 622,787 628,788 635,784 644,785 648,783 670,784 674,782 696,784 700,781 716,783 713,782 720,781 718,783 722,780 720,782 728,776 727,778 731,776 728,777 732,775 731,776 735,774 734,776 738,773 736,775 740,772 738,774 742,772 740,773 744,771 747,772 751,770 748,772 752,769 752,771 756,768 758,760 752,755 754,756 750,754 752,754 753,754 751,750 752,752 750,748 744,747"/>
</pc:TextLine>
<pc:TextLine id="region_0005_line_0005">
<pc:Coords points="751,770 747,772 748,771 744,773 742,772 738,774 736,772 732,775 732,773 728,776 730,774 726,776 728,775 724,777 726,776 722,778 724,776 720,779 721,777 717,780 717,778 713,780 709,779 705,781 685,780 685,780 685,780 685,780 685,780 685,780 691,790 691,790 684,786 684,786 688,788 688,788 688,788 681,780 684,780 679,779 675,781 676,780 672,782 674,780 670,783 668,781 663,784 661,788 661,787 632,785 628,788 625,786 628,787 621,785 617,788 623,787 616,788 616,786 618,787 609,785 612,786 600,784 603,785 599,784 595,786 566,784 562,787 564,785 560,788 548,786 544,788 530,787 532,788 507,786 509,787 496,785 492,788 492,788 491,790 493,788 496,789 481,788 477,790 444,788 440,791 443,789 439,792 439,790 441,791 443,792 438,786 430,784 426,787 420,785 416,788 391,786 386,789 387,788 383,790 380,788 380,788 376,792 376,791 378,792 359,790 355,792 342,791 344,792 346,792 344,788 345,791 317,786 313,788 252,787 248,789 250,788 245,813 248,817 256,819 253,818 282,820 280,819 284,820 282,820 297,822 301,820 325,821 329,819 329,820 327,820 337,821 341,819 340,820 344,818 345,814 345,816 354,817 358,815 392,816 389,816 396,817 394,817 399,819 403,816 401,818 405,816 406,817 410,815 432,816 436,814 478,816 482,813 561,815 559,814 568,816 565,815 569,816 567,816 572,817 569,816 573,818 577,816 576,817 580,815 577,816 582,813 584,815 588,812 640,814 638,813 644,815 642,815 648,816 645,816 655,817 659,815 671,816 675,814 678,816 682,813 701,815 705,812 707,814 711,812 709,813 713,811 713,812 717,810 716,812 720,809 719,811 723,808 725,810 729,808 731,809 735,807 735,808 739,806 736,808 741,804 739,806 744,803 745,799 744,801 746,797 747,798 744,800 748,797 747,799 751,796 752,798 756,796 758,788 757,791 759,777 756,773 758,775 751,770"/>
</pc:TextLine>
<pc:TextLine id="region_0005_line_0006">
<pc:Coords points="748,797 744,800 746,798 742,800 744,799 739,803 741,801 737,804 739,802 735,804 735,803 731,805 731,804 727,806 723,804 719,807 720,805 716,808 719,806 715,808 715,807 711,809 711,808 707,810 703,808 699,811 684,809 680,812 676,810 672,812 664,811 660,813 647,812 649,812 641,811 644,812 639,810 634,813 629,812 625,814 624,819 624,816 623,820 624,818 611,816 607,819 574,817 576,817 577,818 571,813 573,814 556,812 552,815 510,813 506,816 502,814 504,815 490,813 492,814 482,812 478,815 474,813 470,816 468,820 469,817 469,817 468,821 470,820 470,820 433,819 436,820 432,818 428,820 417,819 420,820 414,818 416,819 408,817 410,817 412,820 390,815 392,816 384,814 380,816 365,815 368,816 352,814 348,816 342,815 338,817 320,816 316,818 302,816 298,819 300,817 296,820 264,818 267,819 252,817 247,835 249,839 248,841 255,846 252,845 271,847 275,844 328,846 332,844 359,845 363,843 381,844 379,844 383,845 387,843 392,844 390,844 420,845 420,845 421,844 420,845 419,843 422,848 430,849 435,846 436,842 436,844 436,846 440,844 450,845 454,843 452,844 450,844 454,845 458,843 472,844 470,844 516,845 520,843 520,844 517,844 516,841 519,846 528,848 532,845 533,847 537,844 543,846 547,844 556,845 554,844 586,846 590,844 601,845 599,844 615,846 619,844 624,845 621,844 644,846 648,844 651,845 648,844 653,846 657,844 668,845 672,843 671,844 675,842 676,844 680,841 684,843 688,840 700,842 704,840 706,841 710,839 712,840 716,838 717,840 721,837 724,839 728,836 726,838 732,834 729,836 733,833 731,835 736,832 735,833 739,831 742,832 746,830 746,832 750,829 748,831 752,828 751,830 756,827 757,823 756,825 758,818 757,820 759,814 756,810 758,803 752,798 755,799 748,797"/>
</pc:TextLine>
<pc:TextLine id="region_0005_line_0007">
<pc:Coords points="745,826 741,828 744,827 740,829 741,828 737,830 739,828 735,831 737,829 733,832 733,832 733,832 733,832 734,834 736,834 736,834 732,831 728,833 730,832 726,834 727,832 723,835 724,833 720,836 718,834 714,836 711,835 707,837 703,836 699,838 691,836 693,837 686,838 687,836 683,839 682,837 678,840 679,838 675,840 677,839 672,845 673,843 672,848 672,845 671,844 667,846 625,844 628,845 621,844 623,844 624,846 622,842 592,840 588,843 556,841 559,842 546,840 542,843 534,841 530,844 532,842 528,844 491,843 486,850 487,848 484,846 480,848 468,847 470,848 443,846 439,848 436,847 439,848 440,849 438,845 424,844 427,844 404,843 400,845 399,844 394,847 363,845 364,845 366,848 364,844 360,842 362,843 358,841 360,842 354,840 349,844 352,842 348,844 340,843 336,845 339,844 334,849 335,847 333,852 333,850 275,848 276,848 278,851 276,847 272,845 274,846 254,844 250,847 248,868 251,872 249,870 256,875 254,874 258,876 256,875 260,876 257,876 256,873 258,877 256,875 264,880 268,877 265,879 269,876 267,878 272,875 272,876 276,874 273,876 277,873 317,875 321,872 319,874 316,873 322,875 326,872 370,874 368,873 372,875 370,875 368,872 371,876 379,878 383,876 383,877 387,875 387,876 391,874 415,876 412,875 420,876 418,876 424,877 421,876 435,878 439,876 452,877 456,875 465,876 469,874 471,870 471,872 469,873 473,871 506,872 510,870 604,872 601,871 607,872 604,872 608,873 606,872 610,874 608,873 620,875 624,872 624,874 628,872 650,873 648,872 655,872 649,872 653,874 651,873 664,875 668,872 677,874 681,872 681,873 685,871 684,872 688,870 688,872 692,869 691,871 695,868 702,870 706,868 707,869 704,868 712,870 716,868 716,869 720,867 717,868 722,863 723,864 720,865 724,863 735,864 739,862 741,864 745,861 744,863 748,860 746,862 751,856 750,859 748,860 756,855 757,850 756,852 758,848 757,850 759,835 756,831 758,833 752,828 755,829 751,828 752,828 745,826"/>
</pc:TextLine>
<pc:TextLine id="region_0005_line_0008">
<pc:Coords points="309,872 305,875 308,873 301,878 280,876 276,879 278,877 274,880 256,878 252,880 253,879 248,890 251,894 249,898 252,902 294,904 292,903 305,904 304,904 302,902 304,906 303,904 320,908 324,906 327,908 331,905 329,907 334,900 333,902 331,904 335,901 348,903 352,900 374,902 378,900 377,901 382,895 381,897 383,893 380,889 382,892 380,888 372,886 375,887 376,889 374,885 376,883 372,878 351,876 353,877 349,876 352,876 353,879 350,874 309,872"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0006" orientation="1.241379310344827" type="paragraph">
<pc:Coords points="656,203 652,203 630,203 625,205 620,203 620,203 620,203 618,212 618,212 618,212 616,203 611,208 611,208 572,205 568,208 553,208 553,208 505,205 503,208 467,208 462,208 455,208 450,210 438,208 436,212 436,210 436,210 438,210 438,212 431,208 428,212 428,210 400,210 402,210 402,212 395,208 392,212 392,210 366,210 364,215 364,212 347,212 349,212 349,212 330,210 328,215 328,212 325,212 328,212 328,212 292,210 289,215 289,212 289,212 287,217 287,215 287,227 287,229 287,239 287,236 287,241 287,239 287,239 282,239 284,239 280,241 280,239 277,241 275,241 275,241 272,239 268,241 268,241 270,241 263,239 260,241 253,241 251,244 251,244 251,241 248,248 248,246 248,306 248,304 246,349 248,352 248,407 248,409 248,409 248,409 248,464 248,467 248,464 253,469 251,469 251,467 251,469 263,472 260,472 260,469 301,472 304,469 304,469 344,472 349,469 371,469 373,467 373,467 388,469 388,469 385,467 407,469 412,467 412,467 414,469 414,469 412,467 433,469 438,467 493,469 493,469 491,467 503,469 508,467 508,467 510,469 508,469 508,467 522,469 524,467 524,467 532,469 534,464 534,467 534,440 534,440 534,443 536,438 536,440 534,440 539,436 539,436 606,438 604,438 620,438 618,438 630,438 628,438 628,438 642,440 642,440 642,440 644,438 642,438 642,438 647,440 649,436 649,438 652,438 654,438 659,438 661,436 673,438 678,436 678,436 680,433 680,433 690,436 692,433 709,433 712,433 714,433 716,431 736,433 738,428 738,431 748,431 750,428 750,431 755,428 752,428 757,428 755,428 757,424 757,426 757,426 760,424 762,308 762,311 762,272 762,272 762,248 762,246 762,239 762,241 762,234 762,232 762,232 762,234 762,229 762,227 762,224 762,227 762,217 762,215 762,208 760,203 750,203 752,203 695,203 692,203 690,203 692,203 678,203 676,203 656,203 659,203" conf="0.780629329792061"/>
<pc:TextLine id="region_0006_line_0001">
<pc:Coords points="678,200 674,203 668,201 664,204 664,202 660,204 658,203 658,203 658,203 657,210 657,210 658,206 650,204 646,207 620,205 620,205 620,205 618,211 618,211 618,211 612,205 615,206 572,204 568,207 552,205 555,206 545,204 541,207 527,205 529,206 505,204 501,207 489,205 485,208 488,206 484,208 474,207 470,209 439,208 435,210 436,208 438,209 434,208 430,210 400,208 403,209 404,212 396,207 391,212 392,210 366,208 361,214 362,212 347,210 349,211 351,213 330,208 330,208 326,212 326,210 328,211 328,211 292,208 287,214 288,212 290,210 285,216 286,214 284,227 287,231 285,235 291,240 288,239 293,240 291,240 296,241 294,240 301,242 299,241 308,243 312,240 336,242 340,240 361,241 365,239 407,240 411,238 415,240 419,237 442,239 440,238 452,240 450,239 480,240 484,238 491,240 488,239 496,240 493,240 497,241 495,240 500,242 498,241 506,243 510,240 509,242 513,240 512,241 516,239 516,240 520,238 520,240 524,237 542,239 540,238 560,240 557,239 575,240 579,238 582,240 586,237 596,239 594,238 600,240 598,239 618,240 622,238 638,240 636,239 649,240 653,238 653,240 657,237 672,239 676,236 687,238 691,236 701,237 705,235 704,236 702,236 713,237 717,235 720,236 724,234 725,236 729,233 730,235 734,232 737,234 741,232 743,233 747,231 752,232 756,230 759,232 764,224 763,227 764,217 762,213 764,207 761,203 751,201 753,202 728,200 725,209 724,210 724,210 724,210 724,210 724,210 724,210 723,210 723,210 723,210 723,210 723,210 723,210 728,204 728,204 720,201 716,204 711,202 706,208 706,208 706,208 706,208 711,212 712,209 712,209 710,205 693,204 693,204 693,204 693,204 693,204 692,208 692,208 692,208 694,208 694,208 692,203 688,201 690,202 678,200"/>
</pc:TextLine>
<pc:TextLine id="region_0006_line_0002">
<pc:Coords points="751,228 747,230 744,228 740,231 735,229 731,232 732,230 728,232 728,231 724,233 725,232 721,234 696,232 692,235 680,233 676,236 657,234 653,236 654,235 650,237 650,236 646,238 637,236 640,237 622,236 618,238 600,236 602,237 596,236 598,236 591,235 593,236 588,234 584,236 579,235 575,237 573,236 569,238 562,236 564,237 536,236 538,236 525,235 521,237 516,236 512,238 513,236 509,239 509,237 505,240 500,238 503,239 496,237 498,238 492,236 495,237 480,236 476,238 452,236 455,237 449,236 449,236 449,236 447,240 447,240 447,240 440,236 442,236 415,235 411,237 408,236 404,238 348,236 344,239 336,237 332,240 320,238 323,239 316,237 312,240 306,238 302,240 304,239 307,240 296,238 298,239 292,237 295,238 286,236 282,239 284,237 280,240 280,238 276,240 274,239 276,240 272,238 268,240 268,239 270,240 263,238 259,240 256,239 252,241 255,240 251,242 252,240 247,248 248,246 246,270 252,275 256,272 263,274 267,272 269,273 273,271 273,272 277,270 279,272 283,269 295,271 299,268 300,270 297,269 381,271 385,268 404,270 402,269 410,271 408,270 414,272 412,271 432,272 436,270 437,272 441,269 459,271 463,268 468,270 472,268 477,269 481,267 489,268 493,266 516,268 520,265 532,267 536,264 564,266 561,265 570,267 568,266 574,268 572,267 579,268 576,268 585,269 583,268 591,270 588,269 599,271 596,270 607,272 611,269 610,271 614,268 612,270 616,268 615,269 619,267 616,268 620,266 620,268 624,265 632,267 636,264 641,266 639,265 649,267 647,266 654,268 652,267 659,268 656,268 667,269 671,267 678,268 682,266 688,268 692,265 693,267 697,264 699,266 703,264 704,265 708,263 711,264 715,262 728,264 732,261 734,263 738,260 739,262 743,260 759,261 763,259 764,249 762,245 764,240 763,242 764,235 762,231 764,232 751,228"/>
</pc:TextLine>
<pc:TextLine id="region_0006_line_0003">
<pc:Coords points="740,257 736,260 733,258 729,260 724,259 720,261 709,260 705,262 706,260 702,263 700,261 696,264 693,262 689,264 689,263 685,265 680,264 676,266 666,264 662,267 659,265 661,266 651,264 653,265 645,264 648,264 639,263 641,264 632,262 628,264 622,263 618,265 620,264 616,266 618,264 614,267 616,265 612,268 613,266 609,268 592,267 595,268 584,266 587,267 578,265 580,266 572,264 574,265 568,264 571,264 565,263 568,264 525,262 521,264 512,263 508,265 510,264 512,264 498,263 494,265 490,264 486,266 477,264 473,267 467,265 463,268 456,266 452,268 452,267 454,268 440,266 436,268 436,267 432,269 411,268 413,268 404,267 407,268 398,266 400,267 389,265 385,268 382,266 378,268 314,267 316,268 282,266 278,268 279,267 275,269 275,268 271,270 270,268 266,271 264,269 260,272 255,270 251,272 251,271 246,294 270,299 268,298 275,300 272,299 277,300 275,300 280,301 277,300 285,302 289,300 308,301 306,300 316,302 314,301 337,303 341,300 362,302 366,300 368,301 372,299 371,300 375,298 372,300 376,297 374,299 378,296 377,298 381,296 381,297 385,295 412,296 409,296 422,297 420,296 428,298 426,297 435,299 432,298 444,300 448,297 448,299 452,296 453,298 457,296 464,297 461,296 468,298 466,297 474,299 472,298 478,300 476,299 486,300 484,300 495,301 499,299 500,300 504,298 504,300 508,297 510,299 514,296 517,298 515,297 523,299 520,298 526,300 530,297 532,299 536,296 536,298 540,296 543,297 547,295 547,296 544,296 554,297 552,296 561,298 559,297 563,299 567,296 571,298 575,296 578,297 582,295 586,296 590,294 592,296 596,293 597,295 601,292 604,294 608,292 614,293 612,292 625,294 623,293 632,295 629,294 656,296 660,293 670,295 668,294 673,296 671,295 676,296 674,296 679,297 676,296 684,298 682,297 688,299 692,296 704,298 708,296 711,297 715,295 714,296 718,294 717,296 721,293 720,295 724,292 724,294 728,292 726,293 730,291 730,292 734,290 746,292 750,289 756,291 760,288 758,290 763,284 762,287 764,272 763,274 764,268 762,264 764,264 761,260 763,263 756,258 759,259 740,257"/>
</pc:TextLine>
<pc:TextLine id="region_0006_line_0004">
<pc:Coords points="755,286 751,288 740,287 736,289 732,288 728,290 728,288 724,291 727,289 723,292 724,290 720,292 720,291 716,293 717,292 713,294 714,292 710,295 702,293 698,296 680,294 682,295 675,293 677,294 672,292 675,293 670,292 672,292 664,291 667,292 661,290 657,292 655,291 651,293 632,292 635,292 625,291 628,292 615,290 617,291 606,289 602,292 600,290 596,292 594,291 590,293 587,292 583,294 580,292 576,295 568,293 564,296 559,294 561,295 548,293 550,294 540,292 536,295 535,293 531,296 529,294 525,296 523,295 525,296 508,294 504,296 506,295 502,297 501,296 497,298 479,296 481,297 472,296 474,296 468,295 470,296 464,294 466,295 452,293 448,296 450,294 446,296 448,295 444,297 433,296 436,296 428,295 431,296 422,294 424,295 410,293 412,294 381,292 377,295 379,293 375,296 377,294 372,297 375,296 371,298 372,296 368,299 364,297 360,300 357,298 360,299 344,297 340,300 342,298 344,299 337,300 327,298 323,300 321,299 324,300 312,298 314,299 305,297 308,298 276,296 279,297 272,296 275,296 268,295 270,296 255,294 251,296 252,295 248,298 246,306 247,304 245,326 276,331 280,328 284,330 288,328 292,329 296,327 304,328 308,326 306,328 304,327 332,328 336,326 340,328 344,325 346,327 350,324 366,326 364,325 383,327 380,326 395,328 399,325 434,327 438,324 445,326 443,325 451,327 448,326 457,328 455,327 464,328 464,328 464,328 464,328 467,321 465,323 465,323 465,323 468,328 472,326 476,328 480,325 483,327 487,324 487,326 491,324 490,325 494,323 495,324 499,322 516,324 514,323 596,324 593,324 612,325 609,324 617,326 615,325 636,327 640,324 642,326 646,324 664,325 668,323 675,324 672,324 683,325 683,325 683,325 686,322 684,321 684,321 684,321 684,324 688,326 686,325 700,327 704,324 704,326 708,324 708,325 712,323 712,324 716,322 716,324 720,321 720,323 724,320 726,322 730,320 731,321 735,319 734,320 738,318 736,320 740,317 738,319 742,316 740,318 744,316 743,317 747,315 746,316 750,314 751,316 755,313 756,315 760,312 758,314 763,308 762,311 764,292 761,288 755,286"/>
</pc:TextLine>
<pc:TextLine id="region_0006_line_0005">
<pc:Coords points="754,310 750,312 751,311 747,313 748,312 744,314 747,312 743,315 744,313 740,316 743,314 739,316 740,315 736,317 736,316 732,318 732,316 728,319 728,317 724,320 724,318 720,320 719,319 715,321 715,320 711,322 712,320 708,323 708,321 704,324 700,322 696,324 690,323 692,324 681,322 684,323 672,321 674,322 664,320 660,323 648,321 644,324 637,322 633,324 621,323 624,324 609,322 612,323 597,321 600,322 584,320 587,321 556,320 552,322 524,320 527,321 506,320 508,320 501,319 497,321 497,320 493,322 494,320 490,323 491,321 487,324 484,322 480,324 475,323 471,325 452,324 455,324 447,323 449,324 432,322 428,324 418,323 420,324 402,322 398,324 372,323 375,324 347,322 343,324 340,323 336,325 328,324 324,326 319,324 321,325 298,324 294,326 292,324 288,327 284,325 280,328 272,326 268,328 251,327 247,329 245,349 248,353 246,351 268,356 265,355 278,356 276,356 289,357 293,355 303,356 307,354 312,356 316,353 323,355 327,352 357,354 355,353 363,355 360,354 367,356 364,355 370,356 368,356 372,357 370,356 382,358 386,356 384,357 388,355 386,356 390,354 389,356 393,353 394,355 398,352 406,354 404,353 432,355 429,354 474,356 478,353 482,355 486,352 489,354 493,352 494,353 498,351 502,352 506,350 512,352 516,349 528,351 526,350 550,352 554,349 585,351 589,348 607,350 604,349 616,351 613,350 628,352 632,349 637,351 641,348 656,350 653,349 674,351 672,350 683,352 687,349 697,351 701,348 704,350 708,348 711,349 715,347 720,348 724,346 728,348 732,345 734,347 738,344 746,346 750,344 756,345 760,343 758,344 763,316 757,311 760,312 754,310"/>
</pc:TextLine>
<pc:TextLine id="region_0006_line_0006">
<pc:Coords points="744,341 740,344 732,342 728,344 727,343 723,345 720,344 716,346 712,344 708,347 707,345 703,348 698,346 694,348 688,347 684,349 668,348 671,348 656,347 659,348 642,346 638,348 632,347 628,349 613,348 616,348 604,347 607,348 588,346 584,348 573,347 576,348 566,346 562,348 556,347 552,349 528,348 531,348 512,347 508,349 504,348 500,350 495,348 491,351 492,349 488,352 489,350 489,350 485,353 485,353 485,353 485,353 487,356 487,356 487,356 487,356 482,350 478,352 473,351 469,353 464,352 466,352 458,351 454,353 421,352 424,352 400,351 403,352 399,350 395,352 392,351 388,353 389,352 385,354 388,352 384,355 384,353 380,356 373,354 376,355 368,353 371,354 366,352 368,353 362,352 364,352 356,351 358,352 345,350 348,351 330,349 326,352 317,350 313,352 309,351 305,353 301,352 297,354 269,352 272,353 260,352 263,352 251,351 246,380 265,384 263,384 292,385 289,384 298,386 296,385 318,387 322,384 328,386 332,384 336,385 333,384 345,386 349,384 354,385 358,383 363,384 367,382 374,384 378,381 376,383 374,382 388,384 386,383 405,384 403,384 422,385 420,384 440,386 444,384 452,385 456,383 456,384 460,382 459,384 463,381 464,383 468,380 474,382 478,380 492,381 490,380 495,382 492,381 500,383 497,382 522,384 526,381 528,383 532,380 533,382 537,380 590,381 588,380 593,382 591,381 599,383 596,382 600,384 604,381 606,383 610,380 610,382 614,380 615,381 619,379 630,380 634,378 640,380 644,377 648,379 652,376 656,378 653,377 661,379 659,378 665,380 663,379 670,380 668,380 680,381 677,380 686,382 684,381 697,383 701,380 722,382 726,380 728,381 732,379 732,380 736,378 735,380 739,377 737,379 741,376 740,378 744,376 743,377 747,375 746,376 750,374 752,376 756,373 756,375 760,372 758,374 763,347 760,343 744,341"/>
</pc:TextLine>
<pc:TextLine id="region_0006_line_0007">
<pc:Coords points="752,371 748,373 749,372 745,374 746,372 742,375 743,373 739,376 739,374 735,376 735,375 731,377 731,376 727,378 725,376 721,379 717,377 713,380 709,378 712,379 707,377 703,380 694,378 690,380 687,379 689,380 683,378 685,379 674,377 676,378 664,376 666,377 660,376 662,376 656,375 659,376 648,374 644,376 641,375 637,377 631,376 627,378 616,376 612,379 612,377 608,380 606,378 602,380 598,379 600,380 592,378 594,379 584,377 586,378 578,376 574,379 534,377 530,380 532,378 528,380 525,379 521,381 498,380 500,380 494,379 496,380 491,378 493,379 485,377 488,378 482,376 478,379 474,377 470,380 468,378 464,380 461,379 457,381 458,380 454,382 452,380 448,383 410,381 412,382 388,380 391,381 375,380 377,380 370,379 366,381 364,380 360,382 356,380 352,383 322,381 318,384 296,382 298,383 294,381 290,384 292,382 295,383 264,381 266,382 251,380 246,407 248,411 247,410 296,415 300,412 304,414 308,412 350,413 348,412 356,414 354,413 369,415 373,412 372,414 376,412 376,413 380,411 380,412 384,410 412,412 409,411 428,412 432,410 445,412 449,409 464,411 468,408 476,410 474,409 502,411 500,410 519,412 523,409 536,411 540,408 541,410 545,408 544,409 548,407 559,408 556,408 560,409 558,408 564,410 561,409 568,411 566,410 572,412 569,411 576,412 573,412 584,413 588,411 589,412 593,410 592,412 596,409 595,411 599,408 598,410 602,408 604,409 608,407 636,408 634,408 641,409 639,408 644,410 648,408 653,409 657,407 666,408 664,408 688,409 692,407 696,408 700,406 707,408 704,407 718,408 722,406 736,408 740,405 740,407 744,404 742,406 746,404 745,405 749,403 749,404 753,402 755,404 759,401 757,403 761,400 763,376 760,372 752,371"/>
</pc:TextLine>
<pc:TextLine id="region_0006_line_0008">
<pc:Coords points="760,400 752,401 751,400 747,402 748,400 744,403 744,401 740,404 739,402 735,404 726,403 722,405 696,404 692,406 692,404 688,407 655,405 657,406 650,407 656,406 648,407 639,405 641,406 634,404 636,405 627,404 623,406 604,404 600,407 600,405 596,408 597,406 593,408 595,407 591,409 590,408 586,410 572,408 574,409 565,408 568,408 561,407 564,408 559,406 561,407 556,405 559,406 554,404 556,405 549,404 545,406 548,404 544,407 544,405 540,408 539,406 535,408 520,407 516,409 498,408 500,408 476,407 478,408 459,406 455,408 436,407 432,409 428,408 424,410 410,408 412,409 379,408 375,410 376,408 372,411 373,409 369,412 371,410 367,412 357,411 360,412 351,410 353,411 337,409 340,410 309,408 305,411 303,409 299,412 295,410 291,412 251,411 246,438 260,443 264,440 270,442 274,440 279,441 276,440 283,442 280,441 284,443 282,442 286,444 284,443 289,444 287,444 312,445 316,443 315,444 319,442 317,444 321,441 320,443 324,440 325,442 329,440 339,441 343,439 356,440 354,440 364,441 362,440 368,442 365,441 372,443 369,442 409,444 407,443 425,444 429,442 442,444 446,441 475,443 479,440 491,442 495,440 528,441 525,440 534,442 538,440 540,436 540,437 632,439 630,438 634,440 632,439 636,440 633,440 640,441 644,439 642,440 646,438 648,440 652,437 650,439 654,436 660,438 664,436 676,437 680,435 684,436 688,434 700,436 704,433 713,435 717,432 726,434 730,432 734,433 738,431 739,432 743,430 743,432 747,429 746,431 750,428 751,430 755,428 754,429 759,426 757,428 761,425 763,404 757,400 760,400"/>
</pc:TextLine>
<pc:TextLine id="region_0006_line_0009">
<pc:Coords points="344,437 337,438 329,436 325,439 323,437 319,440 320,438 316,440 319,439 315,441 315,440 311,442 287,440 289,441 284,440 286,440 281,439 284,440 279,438 281,439 266,437 262,440 261,438 257,440 251,439 246,460 252,465 249,464 260,466 257,465 278,467 277,468 276,468 278,472 291,474 296,471 297,464 296,467 298,463 297,465 295,467 299,464 339,466 343,464 357,465 361,463 415,464 412,464 416,465 414,464 412,463 415,467 413,465 427,470 432,466 434,462 434,462 433,460 432,463 451,464 455,462 461,464 459,463 465,464 469,462 475,464 472,463 481,464 485,462 493,464 492,464 490,464 492,468 499,470 503,468 519,469 523,467 520,468 525,463 524,465 526,461 525,464 527,465 531,463 528,464 533,459 532,461 534,457 534,459 536,455 535,457 536,446 534,442 536,443 529,438 532,439 485,437 481,440 470,438 466,440 436,439 432,441 398,440 400,440 396,439 392,441 375,440 377,440 368,439 371,440 365,438 368,439 359,437 361,438 349,436 352,437 343,436 339,438 344,437"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0007" orientation="1.241379310344827" type="header">
<pc:Coords points="467,908 462,911 462,908 464,908 460,935 464,940 462,940 462,937 467,942 464,942 464,940 479,942 481,940 481,940 534,942 532,942 532,940 541,942 546,940 546,940 548,942 548,942 546,940 551,942 556,940 556,940 556,942 558,940 558,940 560,920 560,920 560,916 560,913 560,913 556,911 558,911 558,911 539,908 541,908 536,908 536,908 536,908 534,913 536,913 536,913 505,908 503,913 503,913 503,911 503,911 503,911 503,911 505,913 505,913 505,913 484,908 484,908"/>
<pc:TextLine id="region_0007_line_0001">
<pc:Coords points="470,905 466,908 467,906 462,909 460,916 461,913 460,917 462,921 460,924 463,928 461,933 464,937 462,936 476,940 480,938 480,940 484,937 512,939 509,938 548,940 552,937 552,939 556,936 558,931 557,933 559,917 556,913 552,912 555,912 539,911 541,912 537,910 537,910 533,914 535,912 537,913 539,915 496,910 498,911 480,909 483,910 479,908 480,908 482,911 476,906 479,907 470,905"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0008" orientation="1.241379310344827" type="header">
<pc:Coords points="460,1060 457,1060 455,1060 455,1086 457,1091 457,1091 476,1093 481,1088 481,1091 534,1091 536,1088 548,1091 553,1088 556,1088 558,1088 558,1076 558,1079 558,1079 560,1067 556,1062 558,1062 558,1064 556,1060 556,1060 548,1060 546,1062 546,1060 548,1060 539,1060 536,1062 536,1060 536,1060 524,1060 524,1060 512,1060 515,1060 503,1057 500,1060 491,1060 491,1060 476,1057 479,1057 479,1060"/>
<pc:TextLine id="region_0008_line_0001">
<pc:Coords points="463,1055 459,1057 461,1056 456,1060 454,1075 456,1079 455,1083 457,1087 456,1085 458,1089 466,1091 470,1088 468,1090 465,1089 472,1091 476,1088 525,1090 529,1088 535,1089 539,1087 540,1088 544,1086 549,1088 555,1084 556,1080 557,1080 559,1075 556,1070 557,1071 555,1067 556,1072 557,1068 555,1064 556,1067 554,1063 550,1061 552,1062 473,1060 476,1061 472,1060 474,1060 476,1063 473,1059 475,1060 469,1056 472,1056 463,1055"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0009" orientation="1.241379310344827" type="header">
<pc:Coords points="503,623 498,625 498,623 457,623 455,625 455,623 452,644 457,647 455,647 455,647 455,652 455,652 457,654 455,654 474,656 476,652 476,654 476,654 479,649 479,649 476,652 476,652 476,652 476,649 476,649 476,649 476,649 486,654 491,652 488,654 488,654 522,654 522,654 520,654 527,656 529,652 529,654 553,654 558,652 558,625 556,623 558,623"/>
<pc:TextLine id="region_0009_line_0001">
<pc:Coords points="459,621 455,624 457,622 452,628 453,626 452,633 454,637 452,640 455,644 453,645 456,649 454,647 461,652 459,651 464,652 468,650 486,652 490,649 492,651 490,650 494,652 498,649 498,651 496,650 500,652 497,651 502,652 500,652 514,653 518,651 529,652 533,650 533,652 537,649 535,651 539,648 536,650 542,646 541,648 545,645 550,647 554,644 556,630 553,626 555,628 549,624 552,624 530,623 532,624 528,622 524,624 484,623 487,624 472,622 474,623 459,621"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0010" orientation="1.241379310344827" type="header">
<pc:Coords points="455,167 452,169 452,167 455,167 450,169 450,169 450,196 452,198 450,198 452,200 469,200 472,200 520,200 522,196 522,198 546,198 548,196 551,198 556,196 553,196 556,191 556,193 558,174 556,172 556,174 553,169 553,169 556,172 534,167 536,167 529,167 532,167 493,167 491,167 491,167 493,167 479,167 476,167 476,167 479,167 464,167 464,167 464,167 462,172 462,172 464,167"/>
<pc:TextLine id="region_0010_line_0001">
<pc:Coords points="460,164 456,167 458,165 454,168 455,166 451,168 449,172 450,170 448,193 451,197 449,196 455,200 452,200 472,201 476,199 480,200 485,197 484,199 488,196 491,198 488,197 494,199 494,199 494,199 497,196 495,195 495,195 495,195 492,195 496,200 494,197 519,202 524,196 523,198 537,200 541,197 540,199 544,196 544,198 548,196 548,197 552,195 551,196 555,194 552,196 556,193 558,187 557,189 559,177 556,173 558,176 556,172 557,174 555,170 550,168 552,169 548,168 550,168 536,167 538,168 526,166 522,168 513,167 509,169 500,168 496,170 475,168 477,169 472,168 474,168 470,167 472,168 468,166 470,167 464,165 464,165 464,165 462,169 462,169 466,169 460,164"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0011" orientation="1.241379310344827" type="header">
<pc:Coords points="457,488 455,488 455,486 455,491 455,488 455,488 452,510 457,515 455,515 455,515 455,517 455,517 460,520 457,520 464,522 467,520 546,520 551,517 551,517 551,520 553,517 553,491 553,488 548,488 551,488 551,488 529,486 532,486 532,488 498,484 496,488 496,486 479,486 479,486 481,488"/>
<pc:TextLine id="region_0011_line_0001">
<pc:Coords points="460,484 456,487 459,485 454,488 452,504 455,508 453,513 456,517 460,519 457,518 464,520 468,517 475,519 479,516 484,518 488,516 495,517 499,515 498,516 496,516 500,517 497,516 504,518 501,517 508,519 505,518 511,520 515,517 521,519 525,516 531,518 535,516 536,517 540,515 537,516 541,514 539,516 544,512 544,513 548,511 548,512 553,500 552,503 554,495 548,490 551,491 533,489 536,490 529,488 525,491 511,489 507,492 502,490 504,491 484,489 486,490 476,488 479,489 473,488 475,488 471,486 472,486 460,484"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0012" orientation="1.241379310344827" type="header">
<pc:Coords points="491,107 486,109 467,107 462,112 462,112 402,109 400,112 395,112 397,112 378,109 376,112 344,112 342,112 337,112 340,112 335,112 330,112 332,112 332,112 289,112 289,112 287,109 282,112 263,112 260,112 258,112 256,114 253,112 251,116 251,114 251,119 251,116 251,116 248,121 248,119 248,126 248,124 246,143 248,148 248,145 248,150 248,150 253,152 251,152 272,152 277,152 280,152 277,152 284,152 289,152 294,152 299,148 299,150 306,150 311,148 313,150 311,150 318,150 316,150 316,150 328,152 325,152 337,152 340,152 378,152 376,152 426,152 428,152 460,152 462,148 462,150 469,150 467,150 467,150 479,152 481,148 481,150 491,150 493,148 524,150 522,150 544,150 541,150 553,150 558,150 580,150 582,148 582,150 584,143 584,143 584,143 582,145 582,143 582,143 582,143 582,143 606,150 611,148 642,148 644,145 644,145 666,148 671,145 685,145 690,143 690,143 716,145 721,140 721,143 724,143 726,140 726,143 731,140 728,140 733,121 733,121 731,119 731,119 731,121 731,121 733,114 728,112 731,112 731,114 728,109 731,112 719,107 721,107 680,107 678,109 680,107 680,107 676,107 673,109 575,107 577,107 572,107 568,109 539,107 536,112 536,112 520,109 522,109 522,112 508,107 508,107"/>
<pc:TextLine id="region_0012_line_0001">
<pc:Coords points="689,105 685,108 603,106 599,108 601,107 597,109 599,108 595,110 565,108 561,111 536,109 539,110 520,108 522,109 517,108 513,110 513,108 509,111 512,109 508,112 496,110 492,112 483,111 485,112 472,110 474,111 464,109 466,110 403,108 399,111 395,109 397,110 383,108 379,111 362,109 358,112 356,110 359,111 345,109 341,112 338,110 340,111 334,109 330,112 332,110 334,111 321,109 317,112 320,110 316,112 317,111 313,113 316,112 311,115 300,113 303,114 289,112 292,113 283,112 285,112 274,111 276,112 266,110 262,112 258,111 254,113 254,112 250,114 248,118 249,116 252,114 247,120 248,118 246,125 247,123 245,144 248,148 246,146 252,151 249,150 253,152 251,151 256,152 253,152 261,153 259,152 264,154 261,153 269,155 273,152 288,154 292,152 295,153 300,148 299,150 307,152 311,149 312,151 310,150 318,152 316,151 314,148 323,153 320,152 330,154 328,153 337,155 341,152 348,154 352,152 350,153 354,151 396,152 393,152 415,153 419,151 420,152 424,150 424,152 428,149 432,151 436,148 485,150 489,148 532,149 530,148 556,150 556,150 556,150 558,146 558,146 558,146 555,147 579,152 583,149 581,151 581,151 581,151 581,151 582,145 582,145 582,145 582,145 582,145 607,151 611,148 640,150 640,150 640,150 640,150 639,140 638,143 638,143 640,147 667,148 671,146 686,148 691,142 690,144 711,146 715,144 717,145 721,143 724,144 728,142 727,144 732,140 729,142 734,122 734,122 731,118 732,120 732,122 732,122 734,116 731,111 732,113 730,109 726,108 728,108 722,107 724,108 719,106 721,107 689,105"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0013" orientation="1.241379310344827" type="marginalia">
<pc:Coords points="209,221 209,222 208,223 133,223 133,276 134,276 135,276 135,278 134,279 133,279 133,293 134,293 135,294 135,318 134,319 133,319 133,336 134,336 135,336 135,337 136,337 137,338 137,339 151,339 151,338 152,337 174,337 174,336 175,336 198,336 199,335 219,335 220,334 221,334 222,333 226,333 226,331 227,330 227,328 228,328 228,326 228,325 229,325 229,324 231,323 231,322 232,321 232,322 242,322 243,321 243,320 244,319 244,318 244,317 245,317 245,314 246,313 246,304 245,304 245,302 246,301 246,292 247,292 247,280 248,280 248,268 248,267 249,267 249,256 248,256 248,256 248,240 247,240 247,221 209,221"/>
<pc:TextLine id="region_0013_line_0001">
<pc:Coords points="212,219 207,224 208,222 140,220 136,223 138,221 134,224 136,222 132,224 131,247 140,252 138,251 152,252 149,252 158,253 156,252 169,254 173,252 174,253 178,251 180,252 184,250 187,252 191,249 200,251 204,248 228,250 232,248 235,249 239,247 240,248 244,246 243,248 248,224 212,219"/>
</pc:TextLine>
<pc:TextLine id="region_0013_line_0002">
<pc:Coords points="241,244 237,246 236,244 232,247 202,245 198,248 192,246 188,248 186,247 182,249 180,248 176,250 173,248 169,251 148,249 150,250 137,248 132,252 131,272 136,276 134,276 132,274 148,279 146,278 180,280 178,279 204,280 208,278 209,280 213,277 214,279 218,276 224,278 228,276 240,277 244,275 244,276 248,264 248,266 245,268 250,259 244,254 247,255 248,248 241,244"/>
</pc:TextLine>
<pc:TextLine id="region_0013_line_0003">
<pc:Coords points="220,273 216,276 212,274 208,276 206,275 202,277 174,276 176,276 145,275 141,277 137,276 133,278 136,276 131,289 136,294 134,293 132,305 135,309 140,311 144,308 145,310 149,308 153,309 157,307 165,308 169,306 210,308 208,307 216,308 213,308 217,309 222,306 223,308 227,305 235,307 239,304 237,306 241,304 239,305 245,300 247,278 220,273"/>
</pc:TextLine>
<pc:TextLine id="region_0013_line_0004">
<pc:Coords points="168,303 164,305 160,304 156,306 153,304 149,307 148,305 144,308 139,306 135,308 137,307 132,320 133,318 136,316 131,332 136,336 134,336 132,333 139,338 136,337 135,335 147,340 152,334 151,336 164,338 164,338 168,335 167,336 164,336 169,337 173,335 175,336 179,334 177,336 175,335 180,336 184,334 208,336 212,333 213,335 217,332 215,334 220,331 221,327 220,329 222,322 216,317 219,318 220,309 214,304 216,305 195,304 197,304 188,303 184,305 183,304 185,304 168,303"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0014" orientation="1.241379310344827" type="marginalia">
<pc:Coords points="136,537 136,538 136,539 135,539 135,540 134,540 133,540 133,592 134,592 135,592 135,594 136,596 136,597 147,597 148,598 163,598 164,597 164,597 165,598 177,598 178,597 182,597 182,596 184,593 185,593 185,592 186,591 190,591 190,579 189,578 189,576 188,576 188,576 188,574 188,573 190,573 190,572 191,571 193,571 193,568 194,567 200,567 200,566 200,565 226,565 226,564 227,564 229,564 230,563 230,540 229,540 229,539 228,539 227,538 227,537 176,537 176,538 176,539 148,539 147,538 147,537 136,537"/>
<pc:TextLine id="region_0014_line_0001">
<pc:Coords points="179,535 174,540 175,538 157,536 152,540 138,539 134,541 136,540 132,545 132,543 131,551 133,555 132,559 134,563 132,561 138,566 136,565 148,567 152,564 164,566 168,564 194,565 198,563 199,564 203,562 197,563 204,562 203,564 200,563 210,564 214,562 212,564 216,558 216,560 216,561 220,559 224,560 228,558 229,554 228,556 230,548 228,544 223,542 225,543 217,541 219,541 215,540 217,540 213,539 215,539 216,541 213,536 179,535"/>
</pc:TextLine>
<pc:TextLine id="region_0014_line_0002">
<pc:Coords points="141,564 137,567 138,565 133,572 134,570 132,586 135,590 140,592 137,591 136,588 144,593 141,592 165,594 163,593 175,595 180,592 181,588 180,590 182,580 178,575 179,580 174,572 176,573 172,572 175,572 176,575 173,570 163,568 165,569 160,568 156,570 158,568 160,569 154,568 156,568 158,571 156,567 152,565 154,566 141,564"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0015" orientation="1.241379310344827" type="marginalia">
<pc:Coords points="200,684 199,684 177,684 177,686 176,687 165,687 164,688 149,688 148,688 136,688 135,689 135,691 134,692 133,692 133,714 134,714 135,715 135,719 134,720 133,720 133,803 134,803 135,804 136,804 137,804 137,806 162,806 162,804 163,804 177,804 180,801 193,801 193,800 195,799 202,799 203,798 204,798 204,796 204,796 205,796 207,794 207,790 208,789 208,787 208,786 209,786 210,785 210,784 211,784 211,783 212,782 212,779 212,778 213,779 213,784 214,784 215,784 215,785 216,786 220,786 220,787 220,788 227,788 228,788 228,789 235,789 235,788 236,788 236,786 236,785 237,785 237,784 238,784 238,780 239,779 239,772 240,771 240,768 240,768 241,768 241,747 242,746 242,723 243,722 243,718 244,717 244,689 243,688 243,684 242,684 229,684 228,684 224,684 224,684 200,684"/>
<pc:TextLine id="region_0015_line_0001">
<pc:Coords points="216,683 212,685 180,684 176,686 171,684 167,687 164,685 160,688 147,686 143,688 137,687 132,693 133,691 132,704 134,708 132,708 184,712 188,710 185,712 183,711 189,712 193,710 192,712 189,711 195,712 192,712 191,709 193,713 203,715 207,712 208,708 209,709 214,711 218,708 230,710 234,708 234,708 234,708 234,708 234,708 232,703 232,703 232,703 232,703 232,703 232,706 236,708 240,705 242,695 233,690 233,690 233,690 231,692 233,693 235,692 232,688 234,691 228,686 229,685 225,684 228,684 216,683"/>
</pc:TextLine>
<pc:TextLine id="region_0015_line_0002">
<pc:Coords points="193,713 189,716 175,714 171,716 142,715 138,717 139,716 135,718 133,722 134,720 132,738 135,742 133,740 136,744 145,745 149,743 151,744 156,740 154,742 158,740 170,741 174,739 193,740 197,738 222,740 226,737 228,739 232,736 234,720 232,716 227,715 229,716 223,714 225,715 193,713"/>
</pc:TextLine>
<pc:TextLine id="region_0015_line_0003">
<pc:Coords points="137,744 133,746 132,763 134,767 132,768 136,773 145,775 149,772 151,768 151,769 180,771 184,768 201,770 205,768 213,769 217,767 224,768 228,766 228,768 232,765 233,767 238,761 237,764 239,753 230,748 231,748 185,746 188,747 182,745 184,746 165,744 161,747 150,745 152,746 145,744 148,745 137,744"/>
</pc:TextLine>
<pc:TextLine id="region_0015_line_0004">
<pc:Coords points="157,772 153,775 155,773 155,773 151,780 151,778 148,776 144,779 137,777 133,780 132,797 134,801 132,799 145,804 149,801 171,803 176,789 173,785 175,788 175,788 175,788 169,784 171,786 171,786 171,786 172,788 173,778 171,774 157,772"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0016" orientation="1.241379310344827" type="marginalia">
<pc:Coords points="207,937 207,938 206,939 181,939 180,940 165,940 164,940 149,940 148,941 136,941 136,943 135,944 135,946 134,947 133,947 133,966 134,966 135,967 135,968 134,968 133,968 133,1082 136,1082 136,1083 156,1083 156,1082 190,1082 190,1080 191,1080 194,1080 195,1079 201,1079 202,1078 202,1068 201,1067 201,1065 200,1065 200,1064 200,1064 199,1063 199,1059 200,1058 204,1058 204,1059 208,1059 209,1060 208,1061 208,1063 206,1064 205,1064 204,1065 204,1066 204,1067 204,1077 206,1077 207,1078 208,1078 208,1079 225,1079 227,1077 227,1076 228,1076 228,1064 227,1064 227,1060 226,1060 226,1056 227,1056 227,1051 228,1050 229,1050 230,1049 230,1048 232,1046 233,1046 233,1036 234,1035 236,1035 236,1030 236,1029 237,1029 237,1027 239,1025 239,1024 240,1022 241,1022 241,1012 244,1010 244,1007 244,1006 245,1006 245,1002 246,1001 246,988 247,987 247,966 248,965 248,963 248,962 249,962 249,941 248,941 247,940 247,937 207,937"/>
<pc:TextLine id="region_0016_line_0001">
<pc:Coords points="212,935 208,937 209,936 205,938 184,936 180,939 168,937 164,940 152,938 148,940 140,939 136,941 138,940 133,945 134,943 132,948 133,946 136,944 131,960 133,964 132,962 134,966 132,964 145,969 149,967 174,968 178,966 222,968 220,967 228,968 232,966 242,968 247,964 248,959 248,961 245,963 250,945 248,941 244,940 245,940 241,938 244,939 240,937 242,938 237,936 239,936 212,935"/>
</pc:TextLine>
<pc:TextLine id="region_0016_line_0002">
<pc:Coords points="199,964 195,966 170,964 166,967 147,965 143,968 137,966 133,968 132,972 132,970 131,990 133,994 132,993 134,997 151,999 155,996 158,998 162,996 166,997 170,995 174,996 178,994 184,996 188,993 202,995 200,994 220,996 224,993 237,995 241,992 239,994 243,992 244,988 244,990 245,986 244,988 246,975 244,971 245,971 242,966 209,964 212,965 199,964"/>
</pc:TextLine>
<pc:TextLine id="region_0016_line_0003">
<pc:Coords points="186,992 182,994 176,992 172,995 168,993 164,996 160,994 156,996 152,995 148,997 138,996 134,998 136,996 132,999 131,1020 133,1024 132,1022 138,1027 136,1026 169,1028 173,1025 174,1027 178,1024 177,1026 181,1024 182,1025 186,1023 190,1024 194,1022 219,1024 224,1014 223,1016 224,1009 222,1005 224,1001 221,997 223,997 204,992 206,993 186,992"/>
</pc:TextLine>
<pc:TextLine id="region_0016_line_0004">
<pc:Coords points="215,1020 211,1022 188,1020 184,1023 184,1021 180,1024 180,1022 176,1024 176,1023 172,1025 169,1024 165,1026 136,1024 132,1027 131,1050 136,1055 134,1054 154,1056 158,1053 165,1055 163,1054 172,1056 176,1053 185,1055 189,1052 188,1054 185,1053 201,1055 205,1052 207,1054 207,1054 207,1054 207,1054 207,1054 207,1047 207,1047 207,1051 216,1052 220,1050 221,1052 225,1049 224,1051 228,1048 225,1050 230,1042 229,1044 231,1032 228,1028 230,1029 228,1025 229,1026 226,1021 215,1020"/>
</pc:TextLine>
<pc:TextLine id="region_0016_line_0005">
<pc:Coords points="152,1052 148,1054 136,1052 131,1077 133,1081 139,1083 136,1082 152,1084 156,1081 175,1083 179,1080 184,1082 188,1080 186,1081 190,1079 189,1080 193,1078 195,1073 194,1076 196,1057 193,1053 152,1052"/>
</pc:TextLine>
</pc:TextRegion>
<pc:TextRegion id="region_0017" orientation="1.241379310344827" type="marginalia">
<pc:Coords points="208,1112 207,1113 172,1113 172,1114 172,1115 149,1115 148,1116 133,1116 133,1118 132,1119 132,1172 132,1172 132,1181 132,1182 132,1193 133,1194 134,1194 135,1195 135,1196 142,1196 143,1196 143,1197 194,1197 194,1196 195,1196 198,1196 198,1195 199,1194 204,1194 204,1193 205,1193 205,1188 206,1188 207,1188 207,1191 208,1191 208,1192 216,1192 216,1191 221,1191 222,1190 222,1188 223,1188 223,1180 222,1179 223,1178 224,1178 224,1170 224,1169 225,1169 225,1168 227,1167 228,1167 228,1166 231,1166 231,1164 232,1164 240,1164 240,1163 242,1163 242,1160 243,1160 243,1156 244,1156 244,1155 244,1154 245,1154 245,1152 246,1151 246,1137 245,1136 245,1124 244,1124 244,1124 244,1115 243,1114 243,1113 240,1113 240,1112 208,1112"/>
<pc:TextLine id="region_0017_line_0001">
<pc:Coords points="210,1110 206,1112 175,1111 170,1116 171,1114 152,1112 148,1115 136,1113 131,1120 132,1118 130,1136 132,1140 156,1142 153,1141 191,1143 188,1142 204,1144 208,1141 212,1143 216,1140 220,1142 224,1140 225,1141 225,1141 229,1137 227,1139 224,1138 236,1140 240,1137 238,1139 242,1136 240,1138 244,1136 245,1132 244,1134 246,1127 240,1122 243,1123 244,1117 242,1113 244,1116 236,1111 239,1112 210,1110"/>
</pc:TextLine>
<pc:TextLine id="region_0017_line_0002">
<pc:Coords points="241,1135 237,1137 237,1136 233,1138 232,1136 228,1139 226,1137 222,1140 217,1138 213,1140 208,1139 204,1141 140,1140 142,1140 135,1139 130,1164 159,1168 156,1168 167,1169 164,1168 176,1170 174,1169 187,1171 184,1170 202,1172 206,1169 210,1171 214,1168 221,1170 226,1164 226,1166 224,1168 228,1165 227,1167 232,1160 231,1163 236,1164 240,1162 238,1164 243,1156 242,1159 244,1152 243,1155 244,1151 244,1153 241,1155 246,1148 245,1150 247,1140 241,1135"/>
</pc:TextLine>
<pc:TextLine id="region_0017_line_0003">
<pc:Coords points="135,1165 130,1174 131,1172 129,1177 132,1181 130,1189 132,1193 136,1195 134,1194 140,1196 137,1195 150,1196 148,1196 188,1197 192,1195 191,1196 195,1194 196,1189 196,1192 197,1176 195,1172 196,1173 190,1168 192,1169 179,1168 181,1168 167,1167 169,1168 160,1166 163,1167 135,1165"/>
</pc:TextLine>
</pc:TextRegion>
</pc:Page>
</pc:PcGts>

16
tests/test_model_zoo.py Normal file
View file

@ -0,0 +1,16 @@
from eynollah.model_zoo import EynollahModelZoo
def test_trocr1(
    model_dir,
):
    """Smoke-test loading the TrOCR processor and OCR model from the model zoo.

    Silently a no-op when the optional ``transformers`` dependency (or any
    import it triggers) is unavailable.
    """
    zoo = EynollahModelZoo(model_dir)
    try:
        # transformers is an optional extra; the whole body is guarded so a
        # missing dependency anywhere in the chain skips the checks.
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel

        zoo.load_model('trocr_processor')
        processor = zoo.get('trocr_processor', TrOCRProcessor)
        assert isinstance(processor, TrOCRProcessor)
        zoo.load_model('ocr', 'tr')
        ocr_model = zoo.get('ocr', VisionEncoderDecoderModel)
        assert isinstance(ocr_model, VisionEncoderDecoderModel)
    except ImportError:
        # optional OCR stack not installed
        pass

View file

@ -1,351 +0,0 @@
from os import environ
from pathlib import Path
import pytest
import logging
from PIL import Image
from eynollah.cli import (
layout as layout_cli,
binarization as binarization_cli,
enhancement as enhancement_cli,
machine_based_reading_order as mbreorder_cli,
ocr as ocr_cli,
)
from click.testing import CliRunner
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
# Directory containing this test module; used to resolve test resources.
testdir = Path(__file__).parent.resolve()
# Model directories: overridable via environment variables, defaulting to
# sibling checkouts next to the repository root.
MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_5_0').resolve()))
MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_5_1').resolve()))
MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve()))
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        #["--allow_scaling", "--curved-line"],
        ["--allow_scaling", "--curved-line", "--full-layout"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
         "--textline_light", "--light_version"],
        # -ep ...
        # -eoi ...
        # FIXME: find out whether OCR extra was installed, otherwise skip these
        ["--do_ocr"],
        ["--do_ocr", "--light_version", "--textline_light"],
        ["--do_ocr", "--transformer_ocr"],
        #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
        ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
        # --skip_layout_and_reading_order
    ], ids=str)
def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options):
    """Run the layout CLI on a single image and sanity-check the PAGE output."""
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    cli_args = [
        '-m', MODELS_LAYOUT,
        '-i', str(infile),
        '-o', str(outfile.parent),
    ]
    if pytestconfig.getoption('verbose') > 0:
        cli_args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)

    def is_eynollah_record(logrec):
        # keep only messages from the eynollah logger
        return logrec.name == 'eynollah'

    with caplog.filtering(is_eynollah_record):
        run_result = CliRunner().invoke(layout_cli, cli_args + options, catch_exceptions=False)
    assert run_result.exit_code == 0, run_result.stdout
    messages = [logrec.message for logrec in caplog.records]
    assert str(infile) in messages
    assert outfile.exists()
    page_tree = page_from_file(str(outfile)).etree
    assert len(page_tree.xpath("//page:TextRegion", namespaces=NS)) >= 2, "result is inaccurate"
    assert len(page_tree.xpath("//page:SeparatorRegion", namespaces=NS)) >= 2, "result is inaccurate"
    # 29 paragraph lines, 1 page and 1 catch-word line
    assert len(page_tree.xpath("//page:TextLine", namespaces=NS)) == 31, "result is inaccurate"
@pytest.mark.parametrize(
    "options",
    [
        ["--tables"],
        ["--tables", "--full-layout"],
        ["--tables", "--full-layout", "--textline_light", "--light_version"],
    ], ids=str)
def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options):
    """Run the layout CLI on a page containing a table and check detection."""
    infile = testdir.joinpath('resources/euler_rechenkunst01_1738_0025.tif')
    outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
    cli_args = [
        '-m', MODELS_LAYOUT,
        '-i', str(infile),
        '-o', str(outfile.parent),
    ]
    if pytestconfig.getoption('verbose') > 0:
        cli_args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)

    def is_eynollah_record(logrec):
        return logrec.name == 'eynollah'

    with caplog.filtering(is_eynollah_record):
        run_result = CliRunner().invoke(layout_cli, cli_args + options, catch_exceptions=False)
    assert run_result.exit_code == 0, run_result.stdout
    messages = [logrec.message for logrec in caplog.records]
    assert str(infile) in messages
    assert outfile.exists()
    page_tree = page_from_file(str(outfile)).etree
    assert len(page_tree.xpath("//page:TextRegion", namespaces=NS)) >= 2, "result is inaccurate"
    # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
    assert len(page_tree.xpath("//page:TableRegion", namespaces=NS)) >= 1, "result is inaccurate"
    assert len(page_tree.xpath("//page:SeparatorRegion", namespaces=NS)) >= 2, "result is inaccurate"
    # mostly table (if detected correctly), but 1 page and 1 catch-word line
    assert len(page_tree.xpath("//page:TextLine", namespaces=NS)) >= 2, "result is inaccurate"
def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog):
    """Run the layout CLI over a whole directory and expect one result per input."""
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    cli_args = [
        '-m', MODELS_LAYOUT,
        '-di', str(indir),
        '-o', str(outdir),
    ]
    if pytestconfig.getoption('verbose') > 0:
        cli_args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)

    def is_eynollah_record(logrec):
        return logrec.name == 'eynollah'

    with caplog.filtering(is_eynollah_record):
        run_result = CliRunner().invoke(layout_cli, cli_args, catch_exceptions=False)
    assert run_result.exit_code == 0, run_result.stdout
    messages = [logrec.message for logrec in caplog.records]
    # one completion message per processed file, plus a final summary line
    assert len([msg for msg in messages if msg.startswith('Job done in')]) == 2
    assert any(msg for msg in messages if msg.startswith('All jobs done in'))
    assert len(list(outdir.iterdir())) == 2
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        ["--no-patches"],
    ], ids=str)
def test_run_eynollah_binarization_filename(tmp_path, pytestconfig, caplog, options):
    """Binarize a single image and verify the output keeps the input dimensions."""
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
    cli_args = [
        '-m', MODELS_BIN,
        '-i', str(infile),
        '-o', str(outfile),
    ]
    if pytestconfig.getoption('verbose') > 0:
        cli_args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)

    def is_binarizer_record(logrec):
        # the binarizer logs under its own logger name, not 'eynollah'
        return logrec.name == 'SbbBinarizer'

    with caplog.filtering(is_binarizer_record):
        run_result = CliRunner().invoke(binarization_cli, cli_args + options, catch_exceptions=False)
    assert run_result.exit_code == 0, run_result.stdout
    messages = [logrec.message for logrec in caplog.records]
    assert any(True for msg in messages if msg.startswith('Predicting'))
    assert outfile.exists()
    with Image.open(infile) as original_img, Image.open(outfile) as binarized_img:
        # binarization must not change the pixel dimensions
        assert original_img.size == binarized_img.size
def test_run_eynollah_binarization_directory(tmp_path, pytestconfig, caplog):
    """Binarize every image in a directory and expect one output per input."""
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    cli_args = [
        '-m', MODELS_BIN,
        '-di', str(indir),
        '-o', str(outdir),
    ]
    if pytestconfig.getoption('verbose') > 0:
        cli_args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)

    def is_binarizer_record(logrec):
        return logrec.name == 'SbbBinarizer'

    with caplog.filtering(is_binarizer_record):
        run_result = CliRunner().invoke(binarization_cli, cli_args, catch_exceptions=False)
    assert run_result.exit_code == 0, run_result.stdout
    messages = [logrec.message for logrec in caplog.records]
    # one prediction message per processed image
    assert len([msg for msg in messages if msg.startswith('Predicting')]) == 2
    assert len(list(outdir.iterdir())) == 2
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        ["-sos"],
    ], ids=str)
def test_run_eynollah_enhancement_filename(tmp_path, pytestconfig, caplog, options):
    """Enhance a single image; with "-sos" the original size must be preserved."""
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
    cli_args = [
        '-m', MODELS_LAYOUT,
        '-i', str(infile),
        '-o', str(outfile.parent),
    ]
    if pytestconfig.getoption('verbose') > 0:
        cli_args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)

    def is_enhancement_record(logrec):
        return logrec.name == 'enhancement'

    with caplog.filtering(is_enhancement_record):
        run_result = CliRunner().invoke(enhancement_cli, cli_args + options, catch_exceptions=False)
    assert run_result.exit_code == 0, run_result.stdout
    messages = [logrec.message for logrec in caplog.records]
    assert any(True for msg in messages if msg.startswith('Image was enhanced')), messages
    assert outfile.exists()
    with Image.open(infile) as original_img, Image.open(outfile) as enhanced_img:
        # size is kept exactly iff "-sos" (save original size) was requested
        assert (original_img.size == enhanced_img.size) == ("-sos" in options)
def test_run_eynollah_enhancement_directory(tmp_path, pytestconfig, caplog):
    """Enhance every image in a directory and expect one output per input."""
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    cli_args = [
        '-m', MODELS_LAYOUT,
        '-di', str(indir),
        '-o', str(outdir),
    ]
    if pytestconfig.getoption('verbose') > 0:
        cli_args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)

    def is_enhancement_record(logrec):
        return logrec.name == 'enhancement'

    with caplog.filtering(is_enhancement_record):
        run_result = CliRunner().invoke(enhancement_cli, cli_args, catch_exceptions=False)
    assert run_result.exit_code == 0, run_result.stdout
    messages = [logrec.message for logrec in caplog.records]
    # one enhancement message per processed image
    assert len([msg for msg in messages if msg.startswith('Image was enhanced')]) == 2
    assert len(list(outdir.iterdir())) == 2
def test_run_eynollah_mbreorder_filename(tmp_path, pytestconfig, caplog):
    """Apply machine-based reading order to a PAGE file and verify the order."""
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml')
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    cli_args = [
        '-m', MODELS_LAYOUT,
        '-i', str(infile),
        '-o', str(outfile.parent),
    ]
    if pytestconfig.getoption('verbose') > 0:
        cli_args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)

    def is_mbreorder_record(logrec):
        return logrec.name == 'mbreorder'

    with caplog.filtering(is_mbreorder_record):
        run_result = CliRunner().invoke(mbreorder_cli, cli_args, catch_exceptions=False)
    assert run_result.exit_code == 0, run_result.stdout
    messages = [logrec.message for logrec in caplog.records]
    # FIXME: mbreorder has no logging, so nothing to assert on `messages` yet
    assert outfile.exists()
    # comparing input vs. output order would be preferable once the input
    # carries an OrderedGroup; for now pin the expected output order exactly
    out_tree = page_from_file(str(outfile)).etree
    out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
def test_run_eynollah_mbreorder_directory(tmp_path, pytestconfig, caplog):
    """Reorder every PAGE file in a directory and expect one output per input."""
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    cli_args = [
        '-m', MODELS_LAYOUT,
        '-di', str(indir),
        '-o', str(outdir),
    ]
    if pytestconfig.getoption('verbose') > 0:
        cli_args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)

    def is_mbreorder_record(logrec):
        return logrec.name == 'mbreorder'

    with caplog.filtering(is_mbreorder_record):
        run_result = CliRunner().invoke(mbreorder_cli, cli_args, catch_exceptions=False)
    assert run_result.exit_code == 0, run_result.stdout
    messages = [logrec.message for logrec in caplog.records]
    # FIXME: mbreorder has no logging, so nothing to assert on `messages` yet
    assert len(list(outdir.iterdir())) == 2
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        ["-doit", #str(outrenderfile.parent)],
        ],
        ["-trocr"],
    ], ids=str)
def test_run_eynollah_ocr_filename(tmp_path, pytestconfig, caplog, options):
    """Run the OCR CLI on a single image and sanity-check the recognized text.

    The "-doit" option needs a render output directory that is only known at
    run time (under tmp_path), so it is inserted into a *copy* of the options.
    """
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png')
    outrenderfile.parent.mkdir()
    args = [
        '-m', MODELS_OCR,
        '-i', str(infile),
        '-dx', str(infile.parent),
        '-o', str(outfile.parent),
    ]
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.DEBUG)
    def only_eynollah(logrec):
        return logrec.name == 'eynollah'
    runner = CliRunner()
    # BUGFIX: do not mutate `options` in place -- pytest hands the very same
    # list object to every execution of this parametrized case, so an
    # in-place insert would accumulate duplicate path arguments on re-runs.
    options = list(options)
    if "-doit" in options:
        options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
    with caplog.filtering(only_eynollah):
        result = runner.invoke(ocr_cli, args + options, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    # FIXME: ocr has no logging, so nothing to assert on `logmsgs` yet
    assert outfile.exists()
    if "-doit" in options:
        assert outrenderfile.exists()
    out_tree = page_from_file(str(outfile)).etree
    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
def test_run_eynollah_ocr_directory(tmp_path, pytestconfig, caplog):
    """Run the OCR CLI over a directory and expect one output per input."""
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    cli_args = [
        '-m', MODELS_OCR,
        '-di', str(indir),
        '-dx', str(indir),
        '-o', str(outdir),
    ]
    if pytestconfig.getoption('verbose') > 0:
        cli_args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)

    def is_eynollah_record(logrec):
        return logrec.name == 'eynollah'

    with caplog.filtering(is_eynollah_record):
        run_result = CliRunner().invoke(ocr_cli, cli_args, catch_exceptions=False)
    assert run_result.exit_code == 0, run_result.stdout
    messages = [logrec.message for logrec in caplog.records]
    # FIXME: ocr has no logging, so nothing to assert on `messages` yet
    assert len(list(outdir.iterdir())) == 2

View file

@ -1,59 +0,0 @@
# Training eynollah
This README explains the technical details of how to set up and run training; for detailed information on parameterization, see [`docs/train.md`](../docs/train.md)
## Introduction
This folder contains the source code for training an encoder model for document image segmentation.
## Installation
Clone the repository and install eynollah along with the dependencies necessary for training:
```sh
git clone https://github.com/qurator-spk/eynollah
cd eynollah
pip install '.[training]'
```
### Pretrained encoder
Download our pretrained weights and add them to a `train/pretrained_model` folder:
```sh
cd train
wget -O pretrained_model.tar.gz https://zenodo.org/records/17243320/files/pretrained_model_v0_5_1.tar.gz?download=1
tar xf pretrained_model.tar.gz
```
### Binarization training data
A small sample of training data for the binarization experiment can be found [on
zenodo](https://zenodo.org/records/17243320/files/training_data_sample_binarization_v0_5_1.tar.gz?download=1),
which contains `images` and `labels` folders.
### Helpful tools
* [`pagexml2img`](https://github.com/qurator-spk/page2img)
> Tool to extract 2-D or 3-D RGB images from PAGE-XML data. In the former case, the output will be a single 2-D image array in which each class is filled with a distinct pixel value. In the case of a 3-D RGB image,
each class will be defined by an RGB value, and besides the images, a text file listing the classes will also be produced.
* [`cocoSegmentationToPng`](https://github.com/nightrome/cocostuffapi/blob/17acf33aef3c6cc2d6aca46dcf084266c2778cf0/PythonAPI/pycocotools/cocostuffhelper.py#L130)
> Convert COCO GT or results for a single image to a segmentation map and write it to disk.
* [`ocrd-segment-extract-pages`](https://github.com/OCR-D/ocrd_segment/blob/master/ocrd_segment/extract_pages.py)
> Extract region classes and their colours in mask (pseg) images. Allows the color map as free dict parameter, and comes with a default that mimics PageViewer's coloring for quick debugging; it also warns when regions do overlap.
### Train using Docker
Build the Docker image:
```bash
cd train
docker build -t model-training .
```
Run the Docker image:
```bash
cd train
docker run --gpus all -v $PWD:/entry_point_dir model-training
```