Merge pull request #130 from qurator-spk/v3-api

port processor to core v3
Konstantin Baierer committed 2 weeks ago (via GitHub)
commit e0d38517d3

@@ -0,0 +1,6 @@
tests
dist
build
env*
*.egg-info
models_eynollah*

@ -0,0 +1,44 @@
name: CD
on:
push:
branches: [ "master" ]
workflow_dispatch: # run manually
jobs:
build:
runs-on: ubuntu-latest
permissions:
packages: write
contents: read
steps:
- name: Checkout
uses: actions/checkout@v4
with:
# we need tags for docker version tagging
fetch-tags: true
fetch-depth: 0
- # Activate cache export feature to reduce build time of images
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERIO_USERNAME }}
password: ${{ secrets.DOCKERIO_PASSWORD }}
- name: Build the Docker image
# build both tags at the same time
run: make docker DOCKER_TAG="docker.io/ocrd/eynollah -t ghcr.io/qurator-spk/eynollah"
- name: Test the Docker image
run: docker run --rm ocrd/eynollah ocrd-eynollah-segment -h
- name: Push to Dockerhub
run: docker push docker.io/ocrd/eynollah
- name: Push to Github Container Registry
run: docker push ghcr.io/qurator-spk/eynollah
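Once the workflow has run, the image is available from either registry; a quick local check that mirrors the workflow's own test step:

```
docker pull ghcr.io/qurator-spk/eynollah   # or: docker pull docker.io/ocrd/eynollah
docker run --rm ghcr.io/qurator-spk/eynollah ocrd-eynollah-segment -h
```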

@@ -16,18 +16,26 @@ jobs:
     steps:
       - name: clean up
        run: |
+          df -h
           sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
           sudo rm -rf /opt/ghc
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          df -h
       - uses: actions/checkout@v4
       - uses: actions/cache@v4
-        id: model_cache
+        id: seg_model_cache
         with:
           path: models_eynollah
           key: ${{ runner.os }}-models
+      - uses: actions/cache@v4
+        id: bin_model_cache
+        with:
+          path: default-2021-03-09
+          key: ${{ runner.os }}-modelbin
       - name: Download models
-        if: steps.model_cache.outputs.cache-hit != 'true'
+        if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true'
         run: make models
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
@@ -36,9 +44,11 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install .[OCR,plotting]
-          pip install -r requirements-test.txt
+          make install EXTRAS=OCR,plotting
+          make deps-test
       - name: Test with pytest
         run: make test
-      - name: Test docker build
-        run: make docker
+      - name: Test standalone CLI
+        run: make smoke-test
+      - name: Test OCR-D CLI
+        run: make ocrd-test
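The same checks can be reproduced locally through the Makefile targets this workflow delegates to; a sketch (the model archives are several GB):

```
make models                        # segmentation + binarization models
make install EXTRAS=OCR,plotting   # as in the install step above
make deps-test
make test smoke-test ocrd-test
```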

@@ -4,23 +4,42 @@ FROM $DOCKER_BASE_IMAGE
 ARG VCS_REF
 ARG BUILD_DATE
 LABEL \
-    maintainer="https://ocr-d.de/kontakt" \
+    maintainer="https://ocr-d.de/en/contact" \
     org.label-schema.vcs-ref=$VCS_REF \
     org.label-schema.vcs-url="https://github.com/qurator-spk/eynollah" \
-    org.label-schema.build-date=$BUILD_DATE
+    org.label-schema.build-date=$BUILD_DATE \
+    org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
+    org.opencontainers.image.title="Eynollah" \
+    org.opencontainers.image.description="" \
+    org.opencontainers.image.source="https://github.com/qurator-spk/eynollah" \
+    org.opencontainers.image.documentation="https://github.com/qurator-spk/eynollah/blob/${VCS_REF}/README.md" \
+    org.opencontainers.image.revision=$VCS_REF \
+    org.opencontainers.image.created=$BUILD_DATE \
+    org.opencontainers.image.base.name=ocrd/core-cuda-tf2

 ENV DEBIAN_FRONTEND=noninteractive
+# set proper locales
 ENV PYTHONIOENCODING=utf8
-ENV XDG_DATA_HOME=/usr/local/share
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8

-WORKDIR /build-eynollah
-COPY src/ ./src
-COPY pyproject.toml .
-COPY requirements.txt .
-COPY README.md .
-COPY Makefile .
-RUN apt-get install -y --no-install-recommends g++
-RUN make install
+# avoid HOME/.local/share (hard to predict USER here)
+# so let XDG_DATA_HOME coincide with fixed system location
+# (can still be overridden by derived stages)
+ENV XDG_DATA_HOME /usr/local/share
+# avoid the need for an extra volume for persistent resource user db
+# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
+ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources

+WORKDIR /build/eynollah
+COPY . .
+COPY ocrd-tool.json .
+# prepackage ocrd-tool.json as ocrd-all-tool.json
+RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
+# install everything and reduce image size
+RUN make install EXTRAS=OCR && rm -rf /build/eynollah
+# smoke test
+RUN eynollah --help

 WORKDIR /data
 VOLUME /data
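A usage sketch for the resulting image: `/data` is the declared work volume, and the model mount point below is illustrative (any path works as long as `-m` agrees with it):

```
docker run --rm \
  -v "$PWD:/data" \
  -v "$PWD/models_eynollah:/usr/local/share/models_eynollah" \
  ocrd/eynollah \
  eynollah layout -i /data/page.tif -o /data -m /usr/local/share/models_eynollah
```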

@ -1,10 +1,20 @@
EYNOLLAH_MODELS ?= $(PWD)/models_eynollah PYTHON ?= python3
export EYNOLLAH_MODELS PIP ?= pip3
EXTRAS ?=
# DOCKER_BASE_IMAGE = artefakt.dev.sbb.berlin:5000/sbb/ocrd_core:v2.68.0 # DOCKER_BASE_IMAGE = artefakt.dev.sbb.berlin:5000/sbb/ocrd_core:v2.68.0
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.68.0 DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda-tf2:v3.3.0
DOCKER_TAG = ocrd/eynollah DOCKER_TAG = ocrd/eynollah
#SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz
#SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz
SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz
BIN_MODEL := https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip
PYTEST_ARGS ?=
# BEGIN-EVAL makefile-parser --make-help Makefile # BEGIN-EVAL makefile-parser --make-help Makefile
@ -12,44 +22,90 @@ help:
@echo "" @echo ""
@echo " Targets" @echo " Targets"
@echo "" @echo ""
@echo " models Download and extract models to $(PWD)/models_eynollah" @echo " docker Build Docker image"
@echo " install Install with pip" @echo " build Build Python source and binary distribution"
@echo " install Install package with pip"
@echo " install-dev Install editable with pip" @echo " install-dev Install editable with pip"
@echo " deps-test Install test dependencies with pip"
@echo " models Download and extract models to $(CURDIR)/models_eynollah"
@echo " smoke-test Run simple CLI check"
@echo " ocrd-test Run OCR-D CLI check"
@echo " test Run unit tests" @echo " test Run unit tests"
@echo "" @echo ""
@echo " Variables" @echo " Variables"
@echo " EXTRAS comma-separated list of features (like 'OCR,plotting') for 'install' [$(EXTRAS)]"
@echo " DOCKER_TAG Docker image tag for 'docker' [$(DOCKER_TAG)]"
@echo " PYTEST_ARGS pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. [$(PYTEST_ARGS)]"
@echo " SEG_MODEL URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]"
@echo " BIN_MODEL URL of 'models' archive to download for binarization 'test' [$(BIN_MODEL)]"
@echo "" @echo ""
# END-EVAL # END-EVAL
# Download and extract models to $(PWD)/models_eynollah # Download and extract models to $(PWD)/models_eynollah
models: models_eynollah models: models_eynollah default-2021-03-09
models_eynollah: models_eynollah.tar.gz models_eynollah: models_eynollah.tar.gz
tar xf models_eynollah.tar.gz tar zxf models_eynollah.tar.gz
models_eynollah.tar.gz: models_eynollah.tar.gz:
# wget 'https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz' wget $(SEG_MODEL)
# wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz'
wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz' default-2021-03-09: $(notdir $(BIN_MODEL))
# wget 'https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz' unzip $(notdir $(BIN_MODEL))
# wget 'https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz' mkdir $@
mv $(basename $(notdir $(BIN_MODEL))) $@
$(notdir $(BIN_MODEL)):
wget $(BIN_MODEL)
build:
$(PIP) install build
$(PYTHON) -m build .
# Install with pip # Install with pip
install: install:
pip install . $(PIP) install .$(and $(EXTRAS),[$(EXTRAS)])
# Install editable with pip # Install editable with pip
install-dev: install-dev:
pip install -e . $(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)])
deps-test: models_eynollah
$(PIP) install -r requirements-test.txt
smoke-test: TMPDIR != mktemp -d
smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif
# layout analysis:
eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/models_eynollah
fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(<F)).xml
fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$(basename $(<F)).xml
# directory mode (skip one, add one):
eynollah layout -di $(<D) -o $(TMPDIR) -m $(CURDIR)/models_eynollah
test -s $(TMPDIR)/euler_rechenkunst01_1738_0025.xml
# binarize:
eynollah binarization -m $(CURDIR)/default-2021-03-09 $< $(TMPDIR)/$(<F)
test -s $(TMPDIR)/$(<F)
@set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))"
$(RM) -r $(TMPDIR)
smoke-test: ocrd-test: TMPDIR != mktemp -d
eynollah layout -i tests/resources/kant_aufklaerung_1784_0020.tif -o . -m $(PWD)/models_eynollah ocrd-test: tests/resources/kant_aufklaerung_1784_0020.tif
cp $< $(TMPDIR)
ocrd workspace -d $(TMPDIR) init
ocrd workspace -d $(TMPDIR) add -G OCR-D-IMG -g PHYS_0020 -i OCR-D-IMG_0020 $(<F)
ocrd-eynollah-segment -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-SEG -P models $(CURDIR)/models_eynollah
result=$$(ocrd workspace -d $(TMPDIR) find -G OCR-D-SEG); \
fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$$result && \
fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$$result
ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-BIN -P model $(CURDIR)/default-2021-03-09
ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-SEG -O OCR-D-SEG-BIN -P model $(CURDIR)/default-2021-03-09 -P operation_level region
$(RM) -r $(TMPDIR)
# Run unit tests # Run unit tests
test: test:
pytest tests EYNOLLAH_MODELS=$(CURDIR)/models_eynollah $(PYTHON) -m pytest tests --durations=0 --continue-on-collection-errors $(PYTEST_ARGS)
# Build docker image # Build docker image
docker: docker:
@ -59,3 +115,4 @@ docker:
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
-t $(DOCKER_TAG) . -t $(DOCKER_TAG) .
.PHONY: models build install install-dev test smoke-test ocrd-test docker help
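The variables above can be overridden per invocation; a few sketches (the alternative SEG_MODEL URL is taken from the commented list, the Docker tag is illustrative):

```
make models SEG_MODEL=https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz
make test PYTEST_ARGS="-vv"
make docker DOCKER_TAG=my-registry/eynollah
```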

@@ -21,7 +21,7 @@
 :warning: Development is currently focused on achieving the best possible quality of results for a wide variety of historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome.

 ## Installation
-Python `3.8-3.11` with Tensorflow `2.12-2.15` on Linux are currently supported.
+Python `3.8-3.11` with Tensorflow `<2.13` on Linux are currently supported.

 For (limited) GPU support the CUDA toolkit needs to be installed.
@@ -83,23 +83,28 @@ If no option is set, the tool performs layout detection of main regions (backgro
 The best output quality is produced when RGB images are used as input rather than greyscale or binarized images.

 #### Use as OCR-D processor
+🚧 **Work in progress**

-Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) processor.
+Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) [processor](https://ocr-d.de/en/spec/cli).

 In this case, the source image file group with (preferably) RGB images should be used as input like this:
-```
-ocrd-eynollah-segment -I OCR-D-IMG -O SEG-LINE -P models
-```
-Any image referenced by `@imageFilename` in PAGE-XML is passed on directly to Eynollah as a processor, so that e.g.
-```
-ocrd-eynollah-segment -I OCR-D-IMG-BIN -O SEG-LINE -P models
-```
-uses the original (RGB) image despite any binarization that may have occured in previous OCR-D processing steps
+
+    ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models 2022-04-05
+
+If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynollah behaves as follows:
+- existing regions are kept and ignored (i.e. in effect they might overlap segments from Eynollah results)
+- existing annotation (and respective `AlternativeImage`s) are partially _ignored_:
+  - previous page frame detection (`cropped` images)
+  - previous derotation (`deskewed` images)
+  - previous thresholding (`binarized` images)
+- if the page-level image nevertheless deviates from the original (`@imageFilename`)
+  (because some other preprocessing step was in effect like `denoised`), then
+  the output PAGE-XML will be based on that as new top-level (`@imageFilename`)
+
+    ocrd-eynollah-segment -I OCR-D-XYZ -O OCR-D-SEG -P models 2022-04-05
+
+Still, in general, it makes more sense to add other workflow steps **after** Eynollah.

 #### Additional documentation
 Please check the [wiki](https://github.com/qurator-spk/eynollah/wiki).
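Following the advice to add further steps after Eynollah, a workflow sketch chaining both processors shipped here via OCR-D's `ocrd process` wrapper (file group names are illustrative; both models must be available):

```
ocrd process \
  "eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models 2022-04-05" \
  "sbb-binarize -I OCR-D-SEG -O OCR-D-SEG-BIN -P model default-2021-03-09 -P operation_level region"
```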

@@ -32,7 +32,7 @@ plotting = ["matplotlib"]
 [project.scripts]
 eynollah = "eynollah.cli:main"
 ocrd-eynollah-segment = "eynollah.ocrd_cli:main"
-ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:cli"
+ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:main"

 [project.urls]
 Homepage = "https://github.com/qurator-spk/eynollah"
@@ -40,6 +40,7 @@ Repository = "https://github.com/qurator-spk/eynollah.git"
 [tool.setuptools.dynamic]
 dependencies = {file = ["requirements.txt"]}
+optional-dependencies.test = {file = ["requirements-test.txt"]}

 [tool.setuptools.packages.find]
 where = ["src"]
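With the new dynamic extra, the test requirements also become installable as a pip extra; a sketch:

```
pip install -e ".[test]"            # editable install plus requirements-test.txt
pip install ".[OCR,plotting,test]"  # or combined with the other extras
```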

@@ -1,5 +1,5 @@
 # ocrd includes opencv, numpy, shapely, click
-ocrd >= 2.23.3
+ocrd >= 3.3.0
 numpy <1.24.0
 scikit-learn >= 0.23.2
 tensorflow < 2.13

@@ -1,6 +1,6 @@
 import sys
 import click
-from ocrd_utils import initLogging, setOverrideLogLevel
+from ocrd_utils import initLogging, getLevelName, getLogger
 from eynollah.eynollah import Eynollah, Eynollah_ocr
 from eynollah.sbb_binarize import SbbBinarizer
@@ -15,21 +15,18 @@ def main():
     help="directory of GT page-xml files",
     type=click.Path(exists=True, file_okay=False),
 )
 @click.option(
     "--dir_out_modal_image",
     "-domi",
     help="directory where ground truth images would be written",
     type=click.Path(exists=True, file_okay=False),
 )
 @click.option(
     "--dir_out_classes",
     "-docl",
     help="directory where ground truth classes would be written",
     type=click.Path(exists=True, file_okay=False),
 )
 @click.option(
     "--input_height",
     "-ih",
@@ -45,17 +42,13 @@ def main():
     "-min",
     help="min area size of regions considered for reading order training.",
 )
 def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width, min_area_size):
     xml_files_ind = os.listdir(dir_xml)

 @main.command()
 @click.option('--patches/--no-patches', default=True, help='by enabling this parameter you let the model to see the image in patches.')
 @click.option('--model_dir', '-m', type=click.Path(exists=True, file_okay=False), required=True, help='directory containing models for prediction')
 @click.argument('input_image')
 @click.argument('output_image')
 @click.option(
     "--dir_in",
@@ -69,7 +62,6 @@ def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, i
     help="directory where the binarized images will be written",
     type=click.Path(exists=True, file_okay=False),
 )
 def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out):
     if not dir_out and (dir_in):
         print("Error: You used -di but did not set -do")
@@ -264,25 +256,37 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     if log_level:
         getLogger('eynollah').setLevel(getLevelName(log_level))
     if not enable_plotting and (save_layout or save_deskewed or save_all or save_page or save_images or allow_enhancement):
-        print("Error: You used one of -sl, -sd, -sa, -sp, -si or -ae but did not enable plotting with -ep")
-        sys.exit(1)
+        raise ValueError("Plotting with -sl, -sd, -sa, -sp, -si or -ae also requires -ep")
     elif enable_plotting and not (save_layout or save_deskewed or save_all or save_page or save_images or allow_enhancement):
-        print("Error: You used -ep to enable plotting but set none of -sl, -sd, -sa, -sp, -si or -ae")
-        sys.exit(1)
+        raise ValueError("Plotting with -ep also requires -sl, -sd, -sa, -sp, -si or -ae")
     if textline_light and not light_version:
-        print('Error: You used -tll to enable light textline detection but -light is not enabled')
-        sys.exit(1)
+        raise ValueError("Light textline detection with -tll also requires -light")
     if light_version and not textline_light:
-        print('Error: You used -light without -tll. Light version need light textline to be enabled.')
-    if extract_only_images and (allow_enhancement or allow_scaling or light_version or curved_line or textline_light or full_layout or tables or right2left or headers_off) :
-        print('Error: You used -eoi which can not be enabled alongside light_version -light or allow_scaling -as or allow_enhancement -ae or curved_line -cl or textline_light -tll or full_layout -fl or tables -tab or right2left -r2l or headers_off -ho')
-        sys.exit(1)
+        raise ValueError("Light version with -light also requires light textline detection -tll")
+    if extract_only_images and allow_enhancement:
+        raise ValueError("Image extraction with -eoi can not be enabled alongside allow_enhancement -ae")
+    if extract_only_images and allow_scaling:
+        raise ValueError("Image extraction with -eoi can not be enabled alongside allow_scaling -as")
+    if extract_only_images and light_version:
+        raise ValueError("Image extraction with -eoi can not be enabled alongside light_version -light")
+    if extract_only_images and curved_line:
+        raise ValueError("Image extraction with -eoi can not be enabled alongside curved_line -cl")
+    if extract_only_images and textline_light:
+        raise ValueError("Image extraction with -eoi can not be enabled alongside textline_light -tll")
+    if extract_only_images and full_layout:
+        raise ValueError("Image extraction with -eoi can not be enabled alongside full_layout -fl")
+    if extract_only_images and tables:
+        raise ValueError("Image extraction with -eoi can not be enabled alongside tables -tab")
+    if extract_only_images and right2left:
+        raise ValueError("Image extraction with -eoi can not be enabled alongside right2left -r2l")
+    if extract_only_images and headers_off:
+        raise ValueError("Image extraction with -eoi can not be enabled alongside headers_off -ho")
+    if image is None and dir_in is None:
+        raise ValueError("Either a single image -i or a dir_in -di is required")
     eynollah = Eynollah(
-        image_filename=image,
-        overwrite=overwrite,
+        model,
+        logger=getLogger('eynollah'),
         dir_out=out,
-        dir_in=dir_in,
-        dir_models=model,
         dir_of_cropped_images=save_images,
         extract_only_images=extract_only_images,
         dir_of_layout=save_layout,
@@ -308,10 +312,9 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
         skip_layout_and_reading_order=skip_layout_and_reading_order,
     )
     if dir_in:
-        eynollah.run()
+        eynollah.run(dir_in=dir_in, overwrite=overwrite)
     else:
-        pcgts = eynollah.run()
-        eynollah.writer.write_pagexml(pcgts)
+        eynollah.run(image_filename=image, overwrite=overwrite)

 @main.command()
@@ -367,9 +370,9 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
 )
 def ocr(dir_in, out, dir_xmls, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, log_level):
-    if log_level:
-        setOverrideLogLevel(log_level)
     initLogging()
+    if log_level:
+        getLogger('eynollah').setLevel(getLevelName(log_level))
     eynollah_ocr = Eynollah_ocr(
         dir_xmls=dir_xmls,
         dir_in=dir_in,
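Given the stricter validation above, a sketch of flag combinations that pass it (paths illustrative):

```
# single image; -light requires -tll, and -ep requires at least one of -sl/-sd/-sa/-sp/-si/-ae
eynollah layout -i page.tif -o out/ -m models_eynollah -light -tll -ep -sl out/
# directory mode via -di instead of a single -i image
eynollah layout -di images/ -o out/ -m models_eynollah -light -tll
```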

@@ -6,47 +6,57 @@
 document layout analysis (segmentation) with output in PAGE-XML
 """
-import tracemalloc
+from logging import Logger
+from difflib import SequenceMatcher as sq
 import math
 import os
 import sys
 import time
+from typing import Optional
 import atexit
 import warnings
 from functools import partial
 from pathlib import Path
 from multiprocessing import cpu_count
-from loky import ProcessPoolExecutor
 import gc
-from ocrd_utils import getLogger
+import copy
+import json
+from loky import ProcessPoolExecutor
+from PIL.Image import Image
+import xml.etree.ElementTree as ET
 import cv2
 import numpy as np
-from transformers import TrOCRProcessor
-from PIL import Image
-import torch
-from difflib import SequenceMatcher as sq
-from transformers import VisionEncoderDecoderModel
-from numba import cuda
-import copy
 from scipy.signal import find_peaks
 from scipy.ndimage import gaussian_filter1d
+from numba import cuda
+
+from ocrd import OcrdPage
+from ocrd_utils import getLogger, tf_disable_interactive_logs
+
+try:
+    import torch
+except ImportError:
+    torch = None
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    plt = None
+try:
+    from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+except ImportError:
+    TrOCRProcessor = VisionEncoderDecoderModel = None

-os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
-stderr = sys.stderr
-sys.stderr = open(os.devnull, "w")
+tf_disable_interactive_logs()
 import tensorflow as tf
 from tensorflow.python.keras import backend as K
 from tensorflow.keras.models import load_model
-sys.stderr = stderr
 tf.get_logger().setLevel("ERROR")
 warnings.filterwarnings("ignore")
-import matplotlib.pyplot as plt
 # use tf1 compatibility for keras backend
 from tensorflow.compat.v1.keras.backend import set_session
 from tensorflow.keras import layers
-import json
-import xml.etree.ElementTree as ET
 from tensorflow.keras.layers import StringLookup

 from .utils.contour import (
@@ -166,54 +176,37 @@ class Eynollah:
 class Eynollah:
     def __init__(
         self,
-        dir_models,
-        image_filename=None,
-        image_pil=None,
-        image_filename_stem=None,
-        overwrite=False,
-        dir_out=None,
-        dir_in=None,
-        dir_of_cropped_images=None,
-        extract_only_images=False,
-        dir_of_layout=None,
-        dir_of_deskewed=None,
-        dir_of_all=None,
-        dir_save_page=None,
-        enable_plotting=False,
-        allow_enhancement=False,
-        curved_line=False,
-        textline_light=False,
-        full_layout=False,
-        tables=False,
-        right2left=False,
-        input_binary=False,
-        allow_scaling=False,
-        headers_off=False,
-        light_version=False,
-        ignore_page_extraction=False,
-        reading_order_machine_based=False,
-        do_ocr=False,
-        num_col_upper=None,
-        num_col_lower=None,
-        skip_layout_and_reading_order = False,
-        override_dpi=None,
-        logger=None,
-        pcgts=None,
+        dir_models : str,
+        dir_out : Optional[str] = None,
+        dir_of_cropped_images : Optional[str] = None,
+        extract_only_images : bool = False,
+        dir_of_layout : Optional[str] = None,
+        dir_of_deskewed : Optional[str] = None,
+        dir_of_all : Optional[str] = None,
+        dir_save_page : Optional[str] = None,
+        enable_plotting : bool = False,
+        allow_enhancement : bool = False,
+        curved_line : bool = False,
+        textline_light : bool = False,
+        full_layout : bool = False,
+        tables : bool = False,
+        right2left : bool = False,
+        input_binary : bool = False,
+        allow_scaling : bool = False,
+        headers_off : bool = False,
+        light_version : bool = False,
+        ignore_page_extraction : bool = False,
+        reading_order_machine_based : bool = False,
+        do_ocr : bool = False,
+        num_col_upper : Optional[int] = None,
+        num_col_lower : Optional[int] = None,
+        skip_layout_and_reading_order : bool = False,
+        logger : Logger = None,
     ):
         if skip_layout_and_reading_order:
             textline_light = True
         self.light_version = light_version
-        if not dir_in:
-            if image_pil:
-                self._imgs = self._cache_images(image_pil=image_pil)
-            else:
-                self._imgs = self._cache_images(image_filename=image_filename)
-            if override_dpi:
-                self.dpi = override_dpi
-        self.image_filename = image_filename
-        self.overwrite = overwrite
         self.dir_out = dir_out
-        self.dir_in = dir_in
         self.dir_of_all = dir_of_all
         self.dir_save_page = dir_save_page
         self.reading_order_machine_based = reading_order_machine_based
@@ -244,22 +237,6 @@ class Eynollah:
             self.num_col_lower = int(num_col_lower)
         else:
             self.num_col_lower = num_col_lower
-        self.pcgts = pcgts
-        if not dir_in:
-            self.plotter = None if not enable_plotting else EynollahPlotter(
-                dir_out=self.dir_out,
-                dir_of_all=dir_of_all,
-                dir_save_page=dir_save_page,
-                dir_of_deskewed=dir_of_deskewed,
-                dir_of_cropped_images=dir_of_cropped_images,
-                dir_of_layout=dir_of_layout,
-                image_filename_stem=Path(Path(image_filename).name).stem)
-            self.writer = EynollahXmlWriter(
-                dir_out=self.dir_out,
-                image_filename=self.image_filename,
-                curved_line=self.curved_line,
-                textline_light = self.textline_light,
-                pcgts=pcgts)
         self.logger = logger if logger else getLogger('eynollah')
         # for parallelization of CPU-intensive tasks:
         self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200)
@@ -311,21 +288,25 @@ class Eynollah:
         self.model_textline_dir = dir_models + "/modelens_textline_0_1__2_4_16092024"
         if self.ocr:
             self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124"
         if self.tables:
             if self.light_version:
                 self.model_table_dir = dir_models + "/modelens_table_0t4_201124"
             else:
                 self.model_table_dir = dir_models + "/eynollah-tables_20210319"

-        self.models = {}
-
-        if dir_in:
-            # as in start_new_session:
-            config = tf.compat.v1.ConfigProto()
-            config.gpu_options.allow_growth = True
-            session = tf.compat.v1.Session(config=config)
-            set_session(session)
+        # #gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
+        # #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True)
+        # #session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
+        # config = tf.compat.v1.ConfigProto()
+        # config.gpu_options.allow_growth = True
+        # #session = tf.InteractiveSession()
+        # session = tf.compat.v1.Session(config=config)
+        # set_session(session)
+        try:
+            for device in tf.config.list_physical_devices('GPU'):
+                tf.config.experimental.set_memory_growth(device, True)
+        except:
+            self.logger.warning("no GPU device available")

         self.model_page = self.our_load_model(self.model_page_dir)
         self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier)
@@ -354,9 +335,7 @@ class Eynollah:
         if self.tables:
             self.model_table = self.our_load_model(self.model_table_dir)

-        self.ls_imgs = os.listdir(self.dir_in)
-
-    def _cache_images(self, image_filename=None, image_pil=None):
+    def cache_images(self, image_filename=None, image_pil=None, dpi=None):
         ret = {}
         t_c0 = time.time()
         if image_filename:
@@ -374,12 +353,13 @@ class Eynollah:
         ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY)
         for prefix in ('', '_grayscale'):
             ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8)
-        return ret
+        self._imgs = ret
+        if dpi is not None:
+            self.dpi = dpi

     def reset_file_name_dir(self, image_filename):
         t_c = time.time()
-        self._imgs = self._cache_images(image_filename=image_filename)
+        self.cache_images(image_filename=image_filename)
+        self.image_filename = image_filename

         self.plotter = None if not self.enable_plotting else EynollahPlotter(
             dir_out=self.dir_out,
@@ -392,10 +372,9 @@ class Eynollah:
         self.writer = EynollahXmlWriter(
             dir_out=self.dir_out,
-            image_filename=self.image_filename,
+            image_filename=image_filename,
             curved_line=self.curved_line,
-            textline_light = self.textline_light,
-            pcgts=self.pcgts)
+            textline_light = self.textline_light)

     def imread(self, grayscale=False, uint8=True):
         key = 'img'
@@ -410,8 +389,6 @@ class Eynollah:

     def predict_enhancement(self, img):
         self.logger.debug("enter predict_enhancement")
-        if not self.dir_in:
-            self.model_enhancement, _ = self.start_new_session_and_model(self.model_dir_of_enhancement)

         img_height_model = self.model_enhancement.layers[-1].output_shape[1]
         img_width_model = self.model_enhancement.layers[-1].output_shape[2]
@@ -609,9 +586,6 @@ class Eynollah:
         _, page_coord = self.early_page_for_num_of_column_classification(img)

-        if not self.dir_in:
-            self.model_classifier, _ = self.start_new_session_and_model(self.model_dir_of_col_classifier)
-
         if self.input_binary:
             img_in = np.copy(img)
             img_in = img_in / 255.0
@@ -651,9 +625,6 @@ class Eynollah:
         self.logger.info("Detected %s DPI", dpi)
         if self.input_binary:
             img = self.imread()
-            if not self.dir_in:
-                self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization)
             prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5)
             prediction_bin = 255 * (prediction_bin[:,:,0]==0)
             prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8)
@@ -670,9 +641,6 @@ class Eynollah:
         self.image_page_org_size = img[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :]
         self.page_coord = page_coord

-        if not self.dir_in:
-            self.model_classifier, _ = self.start_new_session_and_model(self.model_dir_of_col_classifier)
-
         if self.num_col_upper and not self.num_col_lower:
             num_col = self.num_col_upper
             label_p_pred = [np.ones(6)]
@@ -812,43 +780,6 @@ class Eynollah:
         self.writer.height_org = self.height_org
         self.writer.width_org = self.width_org

-    def start_new_session_and_model_old(self, model_dir):
-        self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir)
-        config = tf.ConfigProto()
-        config.gpu_options.allow_growth = True
-        session = tf.InteractiveSession()
-        model = load_model(model_dir, compile=False)
-        return model, session
-
-    def start_new_session_and_model(self, model_dir):
-        self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir)
-        #gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
-        #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True)
-        #session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
-        physical_devices = tf.config.list_physical_devices('GPU')
-        try:
-            for device in physical_devices:
-                tf.config.experimental.set_memory_growth(device, True)
-        except:
-            self.logger.warning("no GPU device available")
-        if model_dir.endswith('.h5') and Path(model_dir[:-3]).exists():
-            # prefer SavedModel over HDF5 format if it exists
-            model_dir = model_dir[:-3]
-        if model_dir in self.models:
-            model = self.models[model_dir]
-        else:
-            try:
-                model = load_model(model_dir, compile=False)
-            except:
-                model = load_model(model_dir, compile=False, custom_objects={
-                    "PatchEncoder": PatchEncoder, "Patches": Patches})
-            self.models[model_dir] = model
-        return model, None
-
     def do_prediction(
             self, patches, img, model,
             n_batch_inference=1, marginal_of_patch_percent=0.1,
@@ -1386,9 +1317,6 @@ class Eynollah:
         self.logger.debug("enter extract_page")
         cont_page = []
         if not self.ignore_page_extraction:
-            if not self.dir_in:
-                self.model_page, _ = self.start_new_session_and_model(self.model_page_dir)
-
             img = cv2.GaussianBlur(self.image, (5, 5), 0)
             img_page_prediction = self.do_prediction(False, img, self.model_page)
             imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
@@ -1436,8 +1364,6 @@ class Eynollah:
             img = np.copy(img_bin).astype(np.uint8)
         else:
             img = self.imread()
-        if not self.dir_in:
-            self.model_page, _ = self.start_new_session_and_model(self.model_page_dir)
         img = cv2.GaussianBlur(img, (5, 5), 0)
         img_page_prediction = self.do_prediction(False, img, self.model_page)
@@ -1465,11 +1391,6 @@ class Eynollah:
         self.logger.debug("enter extract_text_regions")
         img_height_h = img.shape[0]
         img_width_h = img.shape[1]
-        if not self.dir_in:
-            if patches:
-                self.model_region_fl, _ = self.start_new_session_and_model(self.model_region_dir_fully)
-            else:
-                self.model_region_fl_np, _ = self.start_new_session_and_model(self.model_region_dir_fully_np)
         model_region = self.model_region_fl if patches else self.model_region_fl_np

         if self.light_version:
@@ -1501,11 +1422,6 @@ class Eynollah:
         self.logger.debug("enter extract_text_regions")
         img_height_h = img.shape[0]
         img_width_h = img.shape[1]
-        if not self.dir_in:
-            if patches:
-                self.model_region_fl, _ = self.start_new_session_and_model(self.model_region_dir_fully)
-            else:
-                self.model_region_fl_np, _ = self.start_new_session_and_model(self.model_region_dir_fully_np)
         model_region = self.model_region_fl if patches else self.model_region_fl_np

         if not patches:
@@ -1636,8 +1552,6 @@ class Eynollah:
     def textline_contours(self, img, use_patches, scaler_h, scaler_w, num_col_classifier=None):
         self.logger.debug('enter textline_contours')
-        if not self.dir_in:
-            self.model_textline, _ = self.start_new_session_and_model(self.model_textline_dir)

         #img = img.astype(np.uint8)
         img_org = np.copy(img)
@@ -1739,9 +1653,6 @@ class Eynollah:
             img_h_new = int(img.shape[0] / float(img.shape[1]) * img_w_new)
             img_resized = resize_image(img,img_h_new, img_w_new )

-        if not self.dir_in:
-            self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens_light_only_images_extraction)
-
         prediction_regions_org = self.do_prediction_new_concept(True, img_resized, self.model_region)
         prediction_regions_org = resize_image(prediction_regions_org,img_height_h, img_width_h )
@@ -1830,7 +1741,6 @@ class Eynollah:
         img_height_h = img_org.shape[0]
         img_width_h = img_org.shape[1]
-        #model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens)

         #print(num_col_classifier,'num_col_classifier')
         if num_col_classifier == 1:
@@ -1853,8 +1763,6 @@ class Eynollah:
         #if self.input_binary:
             #img_bin = np.copy(img_resized)
         ###if (not self.input_binary and self.full_layout) or (not self.input_binary and num_col_classifier >= 30):
-            ###if not self.dir_in:
-                ###self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization)
             ###prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5)
             ####print("inside bin ", time.time()-t_bin)
@@ -1870,8 +1778,6 @@ class Eynollah:
         ###else:
             ###img_bin = np.copy(img_resized)
         if self.ocr and not self.input_binary:
-            if not self.dir_in:
-                self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization)
             prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5)
             prediction_bin = 255 * (prediction_bin[:,:,0] == 0)
             prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2)
@@ -1894,12 +1800,7 @@ class Eynollah:
             #plt.show()
         if not skip_layout_and_reading_order:
             #print("inside 2 ", time.time()-t_in)
-            if not self.dir_in:
-                self.model_region_1_2, _ = self.start_new_session_and_model(self.model_region_dir_p_1_2_sp_np)
-                ##self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens_light)
-
             if num_col_classifier == 1 or num_col_classifier == 2:
-                model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_1_2_sp_np)
                 if self.image_org.shape[0]/self.image_org.shape[1] > 2.5:
                     self.logger.debug("resized to %dx%d for %d cols",
                                       img_resized.shape[1], img_resized.shape[0], num_col_classifier)
@@ -1998,9 +1899,6 @@ class Eynollah:
         img_height_h = img_org.shape[0]
         img_width_h = img_org.shape[1]

-        if not self.dir_in:
-            self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens)
-
         ratio_y=1.3
         ratio_x=1
@@ -2026,9 +1924,6 @@ class Eynollah:
             prediction_regions_org=prediction_regions_org[:,:,0]
             prediction_regions_org[(prediction_regions_org[:,:]==1) & (mask_zeros_y[:,:]==1)]=0

-            if not self.dir_in:
-                self.model_region_p2, _ = self.start_new_session_and_model(self.model_region_dir_p2)
-
             img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1]))

             prediction_regions_org2 = self.do_prediction(True, img, self.model_region_p2, marginal_of_patch_percent=0.2)
@@ -2055,15 +1950,11 @@ class Eynollah:
                 if self.input_binary:
                     prediction_bin = np.copy(img_org)
                 else:
-                    if not self.dir_in:
-                        self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization)
                     prediction_bin = self.do_prediction(True, img_org, self.model_bin, n_batch_inference=5)
                     prediction_bin = resize_image(prediction_bin, img_height_h, img_width_h )
                     prediction_bin = 255 * (prediction_bin[:,:,0]==0)
                     prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2)

-                if not self.dir_in:
-                    self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens)
-
                 ratio_y=1
                 ratio_x=1
@@ -2096,17 +1987,10 @@ class Eynollah:
         except:
             if self.input_binary:
                 prediction_bin = np.copy(img_org)
-
-                if not self.dir_in:
-                    self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization)
                 prediction_bin = self.do_prediction(True, img_org, self.model_bin, n_batch_inference=5)
                 prediction_bin = resize_image(prediction_bin, img_height_h, img_width_h )
                 prediction_bin = 255 * (prediction_bin[:,:,0]==0)
                 prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2)
-
-                if not self.dir_in:
-                    self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens)
             else:
                 prediction_bin = np.copy(img_org)
             ratio_y=1
@@ -2736,10 +2620,6 @@ class Eynollah:
         img_org = np.copy(img)
         img_height_h = img_org.shape[0]
         img_width_h = img_org.shape[1]
-
-        if not self.dir_in:
-            self.model_table, _ = self.start_new_session_and_model(self.model_table_dir)
-
         patches = False
         if self.light_version:
             prediction_table = self.do_prediction_new_concept(patches, img, self.model_table)
@@ -3376,7 +3256,11 @@ class Eynollah:
             regions_without_separators_d, regions_fully, regions_without_separators,
             polygons_of_marginals, contours_tables)

-    def our_load_model(self, model_file):
+    @staticmethod
+    def our_load_model(model_file):
+        if model_file.endswith('.h5') and Path(model_file[:-3]).exists():
+            # prefer SavedModel over HDF5 format if it exists
+            model_file = model_file[:-3]
         try:
             model = load_model(model_file, compile=False)
         except:
@@ -3427,9 +3311,6 @@ class Eynollah:
         img_header_and_sep = resize_image(img_header_and_sep, height1, width1)
         img_poly = resize_image(img_poly, height3, width3)

-        if not self.dir_in:
-            self.model_reading_order, _ = self.start_new_session_and_model(self.model_reading_order_dir)
-
         inference_bs = 3
         input_1 = np.zeros((inference_bs, height1, width1, 3))
         ordered = [list(range(len(co_text_all)))]
@@ -3730,7 +3611,7 @@ class Eynollah:
                 for ij in range(len(all_found_textline_polygons[j])):
                     con_ind = all_found_textline_polygons[j][ij]
                     area = cv2.contourArea(con_ind)
-                    con_ind = con_ind.astype(np.float)
+                    con_ind = con_ind.astype(float)

                     x_differential = np.diff( con_ind[:,0,0])
                     y_differential = np.diff( con_ind[:,0,1])
@@ -3834,7 +3715,7 @@ class Eynollah:
             con_ind = all_found_textline_polygons[j]
             #print(len(con_ind[:,0,0]),'con_ind[:,0,0]')
             area = cv2.contourArea(con_ind)
-            con_ind = con_ind.astype(np.float)
+            con_ind = con_ind.astype(float)

             x_differential = np.diff( con_ind[:,0,0])
             y_differential = np.diff( con_ind[:,0,1])
@@ -3937,7 +3818,7 @@ class Eynollah:
                 con_ind = all_found_textline_polygons[j][ij]
                 area = cv2.contourArea(con_ind)
-                con_ind = con_ind.astype(np.float)
+                con_ind = con_ind.astype(float)

                 x_differential = np.diff( con_ind[:,0,0])
                 y_differential = np.diff( con_ind[:,0,1])
@@ -4080,10 +3961,8 @@ class Eynollah:
                 ind_textline_inside_tr = list(range(len(contours[jj])))
                 index_textline_inside_textregion = index_textline_inside_textregion + ind_textline_inside_tr
-                #ind_ins = [0] * len(contours[jj]) + jj
-                ind_ins = np.zeros( len(contours[jj]) ) + jj
-                list_ind_ins = list(ind_ins)
-                indexes_of_textline_tot = indexes_of_textline_tot + list_ind_ins
+                ind_ins = [jj] * len(contours[jj])
+                indexes_of_textline_tot = indexes_of_textline_tot + ind_ins

         M_main_tot = [cv2.moments(contours_txtline_of_all_textregions[j])
                       for j in range(len(contours_txtline_of_all_textregions))]
@@ -4171,7 +4050,7 @@ class Eynollah:
         for j in range(len(all_found_textline_polygons)):
             for i in range(len(all_found_textline_polygons[j])):
                 con_ind = all_found_textline_polygons[j][i]
-                con_ind = con_ind.astype(np.float)
+                con_ind = con_ind.astype(float)

                 x_differential = np.diff( con_ind[:,0,0])
                 y_differential = np.diff( con_ind[:,0,1])
@ -4311,31 +4190,44 @@ class Eynollah:
return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem, return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem,
contours_only_text_parent_rem, index_by_text_par_con_rem_sort) contours_only_text_parent_rem, index_by_text_par_con_rem_sort)
def run(self): def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False):
""" """
Get image and scales, then extract the page of scanned image Get image and scales, then extract the page of scanned image
""" """
self.logger.debug("enter run") self.logger.debug("enter run")
t0_tot = time.time() t0_tot = time.time()
if not self.dir_in: if dir_in:
self.ls_imgs = [1] self.ls_imgs = os.listdir(dir_in)
elif image_filename:
self.ls_imgs = [image_filename]
else:
raise ValueError("run requires either a single image filename or a directory")
for img_name in self.ls_imgs: for img_filename in self.ls_imgs:
self.logger.info(img_name) self.logger.info(img_filename)
t0 = time.time() t0 = time.time()
if self.dir_in:
self.reset_file_name_dir(os.path.join(self.dir_in,img_name))
#print("text region early -11 in %.1fs", time.time() - t0)
self.reset_file_name_dir(os.path.join(dir_in or "", img_filename))
#print("text region early -11 in %.1fs", time.time() - t0)
if os.path.exists(self.writer.output_filename): if os.path.exists(self.writer.output_filename):
if self.overwrite: if overwrite:
self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename) self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename)
else: else:
self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename) self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename)
continue continue
pcgts = self.run_single()
self.logger.info("Job done in %.1fs", time.time() - t0)
#print("Job done in %.1fs" % (time.time() - t0))
self.writer.write_pagexml(pcgts)
if dir_in:
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
print("all Job done in %.1fs", time.time() - t0_tot)
def run_single(self):
t0 = time.time()
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
self.logger.info("Enhancing took %.1fs ", time.time() - t0) self.logger.info("Enhancing took %.1fs ", time.time() - t0)
if self.extract_only_images: if self.extract_only_images:
@ -4348,11 +4240,6 @@ class Eynollah:
cont_page, [], [], ocr_all_textlines) cont_page, [], [], ocr_all_textlines)
if self.plotter: if self.plotter:
self.plotter.write_images_into_directory(polygons_of_images, image_page) self.plotter.write_images_into_directory(polygons_of_images, image_page)
if self.dir_in:
self.writer.write_pagexml(pcgts)
continue
else:
return pcgts return pcgts
if self.skip_layout_and_reading_order: if self.skip_layout_and_reading_order:
@ -4395,10 +4282,6 @@ class Eynollah:
all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals,
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines) cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
if self.dir_in:
self.writer.write_pagexml(pcgts)
continue
else:
return pcgts return pcgts
#print("text region early -1 in %.1fs", time.time() - t0) #print("text region early -1 in %.1fs", time.time() - t0)
@ -4451,11 +4334,6 @@ class Eynollah:
pcgts = self.writer.build_pagexml_no_full_layout( pcgts = self.writer.build_pagexml_no_full_layout(
[], page_coord, [], [], [], [], [], [], [], [], [], [], [], page_coord, [], [], [], [], [], [], [], [], [], [],
cont_page, [], [], ocr_all_textlines) cont_page, [], [], ocr_all_textlines)
self.logger.info("Job done in %.1fs", time.time() - t1)
if self.dir_in:
self.writer.write_pagexml(pcgts)
continue
else:
return pcgts return pcgts
#print("text region early in %.1fs", time.time() - t0) #print("text region early in %.1fs", time.time() - t0)
@ -4641,11 +4519,6 @@ class Eynollah:
polygons_of_images, polygons_of_images,
polygons_of_marginals, empty_marginals, empty_marginals, [], [], polygons_of_marginals, empty_marginals, empty_marginals, [], [],
cont_page, polygons_lines_xml, contours_tables, []) cont_page, polygons_lines_xml, contours_tables, [])
self.logger.info("Job done in %.1fs", time.time() - t0)
if self.dir_in:
self.writer.write_pagexml(pcgts)
continue
else:
return pcgts return pcgts
#print("text region early 3 in %.1fs", time.time() - t0) #print("text region early 3 in %.1fs", time.time() - t0)
@ -4836,15 +4709,8 @@ class Eynollah:
polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals,
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals,
cont_page, polygons_lines_xml, ocr_all_textlines) cont_page, polygons_lines_xml, ocr_all_textlines)
self.logger.info("Job done in %.1fs", time.time() - t0)
#print("Job done in %.1fs", time.time() - t0)
if self.dir_in:
self.writer.write_pagexml(pcgts)
continue
else:
return pcgts return pcgts
else:
contours_only_text_parent_h = None contours_only_text_parent_h = None
if self.reading_order_machine_based: if self.reading_order_machine_based:
order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model(
@ -4922,20 +4788,7 @@ class Eynollah:
all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals,
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines) cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
#print("Job done in %.1fs" % (time.time() - t0))
self.logger.info("Job done in %.1fs", time.time() - t0)
if not self.dir_in:
return pcgts return pcgts
#print("text region early 7 in %.1fs", time.time() - t0)
if self.dir_in:
self.writer.write_pagexml(pcgts)
self.logger.info("Job done in %.1fs", time.time() - t0)
#print("Job done in %.1fs" % (time.time() - t0))
if self.dir_in:
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
print("all Job done in %.1fs", time.time() - t0_tot)
class Eynollah_ocr: class Eynollah_ocr:
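The refactor above splits batch driving from page processing: run() now owns the loop, the timing messages, and all calls to the writer, while run_single() just builds and returns the PAGE-XML object for one page. A minimal sketch of that contract, with hypothetical names, simplified from the diff:

import time

class BatchSketch:
    # Hypothetical reduction of the run()/run_single() split: the loop does
    # I/O and timing, the worker stays free of output side effects.
    def run(self, inputs):
        t0_tot = time.time()
        for item in inputs:
            t0 = time.time()
            pcgts = self.run_single(item)
            self.write(pcgts)  # stand-in for self.writer.write_pagexml(pcgts)
            print("Job done in %.1fs" % (time.time() - t0))
        print("All jobs done in %.1fs" % (time.time() - t0_tot))

    def run_single(self, item):
        return item  # real code: enhancement, layout analysis, OCR

    def write(self, pcgts):
        pass  # real code: serialise pcgts next to the input image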

@ -1,47 +0,0 @@
{
"version": "0.1.0",
"git_url": "https://github.com/qurator-spk/sbb_binarization",
"tools": {
"ocrd-sbb-binarize": {
"executable": "ocrd-sbb-binarize",
"description": "Pixelwise binarization with selectional auto-encoders in Keras",
"categories": ["Image preprocessing"],
"steps": ["preprocessing/optimization/binarization"],
"input_file_grp": [],
"output_file_grp": [],
"parameters": {
"operation_level": {
"type": "string",
"enum": ["page", "region"],
"default": "page",
"description": "PAGE XML hierarchy level to operate on"
},
"model": {
"description": "Directory containing HDF5 or SavedModel/ProtoBuf models. Can be an absolute path or a path relative to the OCR-D resource location, the current working directory or the $SBB_BINARIZE_DATA environment variable (if set)",
"type": "string",
"format": "uri",
"content-type": "text/directory",
"required": true
}
},
"resources": [
{
"url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip",
"name": "default",
"type": "archive",
"path_in_archive": "saved_model_2020_01_16",
"size": 563147331,
"description": "default models provided by github.com/qurator-spk (SavedModel format)"
},
{
"url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip",
"name": "default-2021-03-09",
"type": "archive",
"path_in_archive": ".",
"size": 133230419,
"description": "updated default models provided by github.com/qurator-spk (SavedModel format)"
}
]
}
}
}

@ -1,21 +1,22 @@
{ {
"version": "0.3.1", "version": "0.3.1",
"git_url": "https://github.com/qurator-spk/eynollah", "git_url": "https://github.com/qurator-spk/eynollah",
"dockerhub": "ocrd/eynollah",
"tools": { "tools": {
"ocrd-eynollah-segment": { "ocrd-eynollah-segment": {
"executable": "ocrd-eynollah-segment", "executable": "ocrd-eynollah-segment",
"categories": ["Layout analysis"], "categories": ["Layout analysis"],
"description": "Segment page into regions and lines and do reading order detection with eynollah", "description": "Segment page into regions and lines and do reading order detection with eynollah",
"input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"], "input_file_grp_cardinality": 1,
"output_file_grp": ["OCR-D-SEG-LINE"], "output_file_grp_cardinality": 1,
"steps": ["layout/segmentation/region", "layout/segmentation/line"], "steps": ["layout/segmentation/region", "layout/segmentation/line"],
"parameters": { "parameters": {
"models": { "models": {
"type": "string", "type": "string",
"format": "file", "format": "uri",
"content-type": "text/directory", "content-type": "text/directory",
"cacheable": true, "cacheable": true,
"description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)", "description": "Directory containing models to be used (See https://qurator-data.de/eynollah)",
"required": true "required": true
}, },
"dpi": { "dpi": {
@ -32,7 +33,7 @@
"light_version": { "light_version": {
"type": "boolean", "type": "boolean",
"default": true, "default": true,
"description": "Try to detect all element subtypes in light version" "description": "Try to detect all element subtypes in light version (faster+simpler method for main region detection and deskewing)"
}, },
"textline_light": { "textline_light": {
"type": "boolean", "type": "boolean",
@ -49,11 +50,31 @@
"default": false, "default": false,
"description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time" "description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time"
}, },
"ignore_page_extraction": {
"type": "boolean",
"default": false,
"description": "if this parameter set to true, this tool would ignore page extraction"
},
"allow_scaling": { "allow_scaling": {
"type": "boolean", "type": "boolean",
"default": false, "default": false,
"description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)" "description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)"
}, },
"allow_enhancement": {
"type": "boolean",
"default": false,
"description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not."
},
"textline_light": {
"type": "boolean",
"default": false,
"description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method."
},
"right_to_left": {
"type": "boolean",
"default": false,
"description": "if this parameter set to true, this tool will extract right-to-left reading order."
},
"headers_off": { "headers_off": {
"type": "boolean", "type": "boolean",
"default": false, "default": false,
@ -70,6 +91,47 @@
"path_in_archive": "models_eynollah" "path_in_archive": "models_eynollah"
} }
] ]
},
"ocrd-sbb-binarize": {
"executable": "ocrd-sbb-binarize",
"description": "Pixelwise binarization with selectional auto-encoders in Keras",
"categories": ["Image preprocessing"],
"steps": ["preprocessing/optimization/binarization"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"parameters": {
"operation_level": {
"type": "string",
"enum": ["page", "region"],
"default": "page",
"description": "PAGE XML hierarchy level to operate on"
},
"model": {
"description": "Directory containing HDF5 or SavedModel/ProtoBuf models. Can be an absolute path or a path relative to the OCR-D resource location, the current working directory or the $SBB_BINARIZE_DATA environment variable (if set)",
"type": "string",
"format": "uri",
"content-type": "text/directory",
"required": true
}
},
"resources": [
{
"url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip",
"name": "default",
"type": "archive",
"path_in_archive": "saved_model_2020_01_16",
"size": 563147331,
"description": "default models provided by github.com/qurator-spk (SavedModel format)"
},
{
"url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip",
"name": "default-2021-03-09",
"type": "archive",
"path_in_archive": ".",
"size": 133230419,
"description": "updated default models provided by github.com/qurator-spk (SavedModel format)"
}
]
} }
} }
} }
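With the binarizer tool merged into this ocrd-tool.json, both processors ship from one distribution. For completeness, a hedged sketch of driving the segmenter programmatically; run_processor and the import path are assumptions based on OCR-D core conventions, and the fileGrp names and models path are placeholders:

from ocrd import Resolver
from ocrd.processor.helpers import run_processor  # assumed core helper
from eynollah.processor import EynollahProcessor  # assumed import path

workspace = Resolver().workspace_from_url('mets.xml')
run_processor(EynollahProcessor,
              workspace=workspace,
              input_file_grp='OCR-D-IMG',
              output_file_grp='OCR-D-SEG',
              parameter={'models': 'models_eynollah', 'light_version': True})
workspace.save_mets()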

@ -1,29 +1,16 @@
from os import environ from typing import Optional
from os.path import join
from pathlib import Path
from pkg_resources import resource_string
from json import loads
from PIL import Image from PIL import Image
import numpy as np import numpy as np
import cv2 import cv2
from click import command from click import command
from ocrd_utils import ( from ocrd import Processor, OcrdPageResult, OcrdPageResultImage
getLogger, from ocrd_models.ocrd_page import OcrdPage, AlternativeImageType
assert_file_grp_cardinality,
make_file_id,
MIMETYPE_PAGE
)
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import AlternativeImageType, to_xml
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from .sbb_binarize import SbbBinarizer from .sbb_binarize import SbbBinarizer
OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool-binarization.json').decode('utf8'))
TOOL = 'ocrd-sbb-binarize'
def cv2pil(img): def cv2pil(img):
return Image.fromarray(img.astype('uint8')) return Image.fromarray(img.astype('uint8'))
@ -35,39 +22,22 @@ def pil2cv(img):
return cv2.cvtColor(pil_as_np_array, color_conversion) return cv2.cvtColor(pil_as_np_array, color_conversion)
class SbbBinarizeProcessor(Processor): class SbbBinarizeProcessor(Processor):
# already employs GPU (without singleton process atm)
max_workers = 1
def __init__(self, *args, **kwargs): @property
kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] def executable(self):
kwargs['version'] = OCRD_TOOL['version'] return 'ocrd-sbb-binarize'
super().__init__(*args, **kwargs)
if hasattr(self, 'output_file_grp'):
# processing context
self.setup()
def setup(self): def setup(self):
""" """
Set up the model prior to processing. Set up the model prior to processing.
""" """
LOG = getLogger('processor.SbbBinarize.__init__')
if not 'model' in self.parameter:
raise ValueError("'model' parameter is required")
# resolve relative path via environment variable
model_path = Path(self.parameter['model'])
if not model_path.is_absolute():
if 'SBB_BINARIZE_DATA' in environ and environ['SBB_BINARIZE_DATA']:
LOG.info("Environment variable SBB_BINARIZE_DATA is set to '%s'" \
" - prepending to model value '%s'. If you don't want this mechanism," \
" unset the SBB_BINARIZE_DATA environment variable.",
environ['SBB_BINARIZE_DATA'], model_path)
model_path = Path(environ['SBB_BINARIZE_DATA']).joinpath(model_path)
model_path = model_path.resolve()
if not model_path.is_dir():
raise FileNotFoundError("Does not exist or is not a directory: %s" % model_path)
# resolve relative path via OCR-D ResourceManager # resolve relative path via OCR-D ResourceManager
model_path = self.resolve_resource(str(model_path)) model_path = self.resolve_resource(self.parameter['model'])
self.binarizer = SbbBinarizer(model_dir=model_path, logger=LOG) self.binarizer = SbbBinarizer(model_dir=model_path, logger=self.logger)
def process(self): def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
""" """
Binarize images with sbb_binarization (based on selectional auto-encoders). Binarize images with sbb_binarization (based on selectional auto-encoders).
@ -88,71 +58,52 @@ class SbbBinarizeProcessor(Processor):
Produce a new PAGE output file by serialising the resulting hierarchy. Produce a new PAGE output file by serialising the resulting hierarchy.
""" """
LOG = getLogger('processor.SbbBinarize') assert input_pcgts
assert_file_grp_cardinality(self.input_file_grp, 1) assert input_pcgts[0]
assert_file_grp_cardinality(self.output_file_grp, 1) assert self.parameter
oplevel = self.parameter['operation_level'] oplevel = self.parameter['operation_level']
pcgts = input_pcgts[0]
for n, input_file in enumerate(self.input_files): result = OcrdPageResult(pcgts)
file_id = make_file_id(input_file, self.output_file_grp)
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
self.add_metadata(pcgts)
pcgts.set_pcGtsId(file_id)
page = pcgts.get_Page() page = pcgts.get_Page()
page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') page_image, page_xywh, _ = self.workspace.image_from_page(
page, page_id, feature_filter='binarized')
if oplevel == 'page': if oplevel == 'page':
LOG.info("Binarizing on 'page' level in page '%s'", page_id) self.logger.info("Binarizing on 'page' level in page '%s'", page_id)
bin_image = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True)) page_image_bin = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True))
# update METS (add the image file): # update PAGE (reference the image file):
bin_image_path = self.workspace.save_image_file(bin_image, page_image_ref = AlternativeImageType(comments=page_xywh['features'] + ',binarized,clipped')
file_id + '.IMG-BIN', page.add_AlternativeImage(page_image_ref)
page_id=input_file.pageId, result.images.append(OcrdPageResultImage(page_image_bin, '.IMG-BIN', page_image_ref))
file_grp=self.output_file_grp)
page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments='%s,binarized' % page_xywh['features']))
elif oplevel == 'region': elif oplevel == 'region':
regions = page.get_AllRegions(['Text', 'Table'], depth=1) regions = page.get_AllRegions(['Text', 'Table'], depth=1)
if not regions: if not regions:
LOG.warning("Page '%s' contains no text/table regions", page_id) self.logger.warning("Page '%s' contains no text/table regions", page_id)
for region in regions: for region in regions:
region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh, feature_filter='binarized') region_image, region_xywh = self.workspace.image_from_segment(
region_image_bin = cv2pil(binarizer.run(image=pil2cv(region_image), use_patches=True)) region, page_image, page_xywh, feature_filter='binarized')
region_image_bin_path = self.workspace.save_image_file( region_image_bin = cv2pil(self.binarizer.run(image=pil2cv(region_image), use_patches=True))
region_image_bin, # update PAGE (reference the image file):
"%s_%s.IMG-BIN" % (file_id, region.id), region_image_ref = AlternativeImageType(comments=region_xywh['features'] + ',binarized')
page_id=input_file.pageId, region.add_AlternativeImage(region_image_ref)
file_grp=self.output_file_grp) result.images.append(OcrdPageResultImage(region_image_bin, region.id + '.IMG-BIN', region_image_ref))
region.add_AlternativeImage(
AlternativeImageType(filename=region_image_bin_path, comments='%s,binarized' % region_xywh['features']))
elif oplevel == 'line': elif oplevel == 'line':
region_line_tuples = [(r.id, r.get_TextLine()) for r in page.get_AllRegions(['Text'], depth=0)] lines = page.get_AllTextLines()
if not region_line_tuples: if not lines:
LOG.warning("Page '%s' contains no text lines", page_id) self.logger.warning("Page '%s' contains no text lines", page_id)
for region_id, line in region_line_tuples: for line in lines:
line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized')
line_image_bin = cv2pil(binarizer.run(image=pil2cv(line_image), use_patches=True)) line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image), use_patches=True))
line_image_bin_path = self.workspace.save_image_file( # update PAGE (reference the image file):
line_image_bin, line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized')
"%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id), line.add_AlternativeImage(region_image_ref)
page_id=input_file.pageId, result.images.append(OcrdPageResultImage(line_image_bin, line.id + '.IMG-BIN', line_image_ref))
file_grp=self.output_file_grp)
line.add_AlternativeImage( return result
AlternativeImageType(filename=line_image_bin_path, comments='%s,binarized' % line_xywh['features']))
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
mimetype=MIMETYPE_PAGE,
local_filename=join(self.output_file_grp, file_id + '.xml'),
content=to_xml(pcgts))
@command() @command()
@ocrd_cli_options @ocrd_cli_options
def cli(*args, **kwargs): def main(*args, **kwargs):
return ocrd_cli_wrap_processor(SbbBinarizeProcessor, *args, **kwargs) return ocrd_cli_wrap_processor(SbbBinarizeProcessor, *args, **kwargs)
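The rewrite above is an instance of the core v3 page-processor contract: the framework deserialises each input file, calls process_page_pcgts() once per page, serialises the returned hierarchy, and saves every image appended to result.images into the output fileGrp (deriving file IDs from the suffixes given in OcrdPageResultImage). Reduced to a no-op sketch with a hypothetical executable name:

from typing import Optional
from ocrd import Processor, OcrdPageResult
from ocrd_models.ocrd_page import OcrdPage

class NoopProcessor(Processor):
    @property
    def executable(self):
        return 'ocrd-noop'  # hypothetical

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage],
                           page_id: Optional[str] = None) -> OcrdPageResult:
        # Return the PAGE hierarchy unchanged; images appended to
        # result.images would be saved alongside it by the framework.
        return OcrdPageResult(input_pcgts[0])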

@ -1,5 +1,8 @@
try:
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.patches as mpatches import matplotlib.patches as mpatches
except ImportError:
plt = mpatches = None
import numpy as np import numpy as np
import os.path import os.path
import cv2 import cv2
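Wrapping the matplotlib imports in try/except makes plotting an optional extra: the module still imports on systems without matplotlib, and only the plotting entry points fail. The standard pattern, as a sketch:

try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None

def save_histogram(values, path):
    if plt is None:
        raise ImportError("matplotlib is required; install the 'plotting' extra")
    plt.hist(values)
    plt.savefig(path)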

@ -1,71 +1,91 @@
from json import loads from typing import Optional
from pkg_resources import resource_string from ocrd_models import OcrdPage
from tempfile import NamedTemporaryFile from ocrd import Processor, OcrdPageResult
from pathlib import Path
from os.path import join
from PIL import Image from .eynollah import Eynollah, EynollahXmlWriter
from ocrd import Processor class EynollahProcessor(Processor):
from ocrd_modelfactory import page_from_file, exif_from_filename # already employs background CPU multiprocessing per page
from ocrd_models import OcrdFile, OcrdExif # already employs GPU (without singleton process atm)
from ocrd_models.ocrd_page import to_xml max_workers = 1
from ocrd_utils import (
getLogger, @property
MIMETYPE_PAGE, def executable(self):
assert_file_grp_cardinality, return 'ocrd-eynollah-segment'
make_file_id
def setup(self) -> None:
if self.parameter['textline_light'] and not self.parameter['light_version']:
raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection, "
"but parameter 'light_version' is not enabled")
self.eynollah = Eynollah(
self.resolve_resource(self.parameter['models']),
logger=self.logger,
allow_enhancement=self.parameter['allow_enhancement'],
curved_line=self.parameter['curved_line'],
right2left=self.parameter['right_to_left'],
ignore_page_extraction=self.parameter['ignore_page_extraction'],
light_version=self.parameter['light_version'],
textline_light=self.parameter['textline_light'],
full_layout=self.parameter['full_layout'],
allow_scaling=self.parameter['allow_scaling'],
headers_off=self.parameter['headers_off'],
tables=self.parameter['tables'],
) )
self.eynollah.plotter = None
from .eynollah import Eynollah def shutdown(self):
from .utils.pil_cv2 import pil2cv if hasattr(self, 'eynollah'):
del self.eynollah
OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
"""
Performs cropping, region and line segmentation with Eynollah.
class EynollahProcessor(Processor): For each page, open and deserialize PAGE input file (from existing
PAGE file in the input fileGrp, or generated from image file).
Retrieve its respective page-level image (ignoring annotation that
already added `binarized`, `cropped` or `deskewed` features).
Set up Eynollah to detect regions and lines, and add each one to the
page, respectively.
def __init__(self, *args, **kwargs): \b
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] - If ``tables``, try to detect table blocks and add them as TableRegion.
kwargs['version'] = OCRD_TOOL['version'] - If ``full_layout``, then in addition to paragraphs and marginals, also
super().__init__(*args, **kwargs) try to detect drop capitals and headings.
- If ``ignore_page_extraction``, then attempt no cropping of the page.
- If ``curved_line``, then compute contour polygons for text lines
instead of simple bounding boxes.
def process(self): Produce a new output file by serialising the resulting hierarchy.
LOG = getLogger('eynollah') """
assert_file_grp_cardinality(self.input_file_grp, 1) assert input_pcgts
assert_file_grp_cardinality(self.output_file_grp, 1) assert input_pcgts[0]
for n, input_file in enumerate(self.input_files): assert self.parameter
page_id = input_file.pageId or input_file.ID pcgts = input_pcgts[0]
LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) result = OcrdPageResult(pcgts)
pcgts = page_from_file(self.workspace.download_file(input_file))
LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight)
self.add_metadata(pcgts)
page = pcgts.get_Page() page = pcgts.get_Page()
# XXX loses DPI information page_image, _, _ = self.workspace.image_from_page(
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') page, page_id,
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename # avoid any features that would change the coordinate system: cropped,deskewed
eynollah_kwargs = { # (the PAGE builder merely adds regions, so afterwards we would not know which to transform)
'dir_models': self.resolve_resource(self.parameter['models']), # also avoid binarization as models usually fare better on grayscale/RGB
'dir_out': self.output_file_grp, feature_filter='cropped,deskewed,binarized')
'allow_enhancement': False, if hasattr(page_image, 'filename'):
'curved_line': self.parameter['curved_line'], image_filename = page_image.filename
'full_layout': self.parameter['full_layout'], else:
'allow_scaling': self.parameter['allow_scaling'], image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
'light_version': self.parameter['light_version'], result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
'textline_light': self.parameter['textline_light'], # FIXME: mask out already existing regions (incremental segmentation)
'headers_off': self.parameter['headers_off'], self.eynollah.cache_images(
'tables': self.parameter['tables'], image_pil=page_image,
'override_dpi': self.parameter['dpi'], dpi=self.parameter['dpi'],
'logger': LOG, )
'pcgts': pcgts, self.eynollah.writer = EynollahXmlWriter(
'image_filename': image_filename dir_out=None,
} image_filename=image_filename,
Eynollah(**eynollah_kwargs).run() curved_line=self.eynollah.curved_line,
file_id = make_file_id(input_file, self.output_file_grp) textline_light=self.eynollah.textline_light,
pcgts.set_pcGtsId(file_id) pcgts=pcgts)
self.workspace.add_file( self.eynollah.run_single()
ID=file_id, return result
file_grp=self.output_file_grp,
pageId=page_id,
mimetype=MIMETYPE_PAGE,
local_filename=join(self.output_file_grp, file_id) + '.xml',
content=to_xml(pcgts))
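The feature_filter passed above is the crux: workspace.image_from_page() returns the most derived AlternativeImage whose @comments feature list avoids everything in the filter, so filtering 'cropped,deskewed,binarized' guarantees an image still in the original coordinate system. A simplified, hypothetical model of that selection:

def pick_image(derived, feature_filter):
    # derived: list of (features, image) pairs, least to most derived
    blocked = set(feature_filter.split(','))
    ok = [(feats, img) for feats, img in derived
          if not blocked & set(feats.split(','))]
    return ok[-1][1] if ok else None

derived = [('', 'original.png'),
           ('cropped', 'crop.png'),
           ('cropped,binarized', 'bin.png')]
print(pick_image(derived, 'cropped,deskewed,binarized'))  # -> original.png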

@ -4,24 +4,18 @@ Tool to load model and binarize a given image.
import sys import sys
from glob import glob from glob import glob
from os import environ, devnull
from os.path import join
from warnings import catch_warnings, simplefilter
import os import os
import logging
import numpy as np import numpy as np
from PIL import Image from PIL import Image
import cv2 import cv2
environ['TF_CPP_MIN_LOG_LEVEL'] = '3' from ocrd_utils import tf_disable_interactive_logs
stderr = sys.stderr tf_disable_interactive_logs()
sys.stderr = open(devnull, 'w')
import tensorflow as tf import tensorflow as tf
from tensorflow.keras.models import load_model from tensorflow.keras.models import load_model
from tensorflow.python.keras import backend as tensorflow_backend from tensorflow.python.keras import backend as tensorflow_backend
sys.stderr = stderr
import logging
def resize_image(img_in, input_height, input_width): def resize_image(img_in, input_height, input_width):
return cv2.resize(img_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST) return cv2.resize(img_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST)
@ -53,7 +47,7 @@ class SbbBinarizer:
del self.session del self.session
def load_model(self, model_name): def load_model(self, model_name):
model = load_model(join(self.model_dir, model_name), compile=False) model = load_model(os.path.join(self.model_dir, model_name), compile=False)
model_height = model.layers[len(model.layers)-1].output_shape[1] model_height = model.layers[len(model.layers)-1].output_shape[1]
model_width = model.layers[len(model.layers)-1].output_shape[2] model_width = model.layers[len(model.layers)-1].output_shape[2]
n_classes = model.layers[len(model.layers)-1].output_shape[3] n_classes = model.layers[len(model.layers)-1].output_shape[3]
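load_model() above reads the patch geometry off the network itself instead of a config file; for these single-output segmentation models the last layer's output_shape is (batch, height, width, classes). A hedged sketch with a placeholder path:

from tensorflow.keras.models import load_model

model = load_model('/path/to/model', compile=False)  # placeholder path
# (None, height, width, classes) for a single-output segmentation model:
_, model_height, model_width, n_classes = model.layers[-1].output_shape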

@ -1,13 +1,17 @@
import time
import math import math
try:
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
except ImportError:
plt = None
import numpy as np import numpy as np
from shapely import geometry from shapely import geometry
import cv2 import cv2
import imutils import imutils
from scipy.signal import find_peaks from scipy.signal import find_peaks
from scipy.ndimage import gaussian_filter1d from scipy.ndimage import gaussian_filter1d
import time
from .is_nan import isNaN from .is_nan import isNaN
from .contour import (contours_in_same_horizon, from .contour import (contours_in_same_horizon,
find_new_features_of_contours, find_new_features_of_contours,
@ -237,10 +241,8 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order(
if len(remained_sep_indexes)>1: if len(remained_sep_indexes)>1:
#print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)') #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)')
#print(np.array(mother),'mother') #print(np.array(mother),'mother')
##remained_sep_indexes_without_mother = remained_sep_indexes[mother==0] remained_sep_indexes_without_mother = remained_sep_indexes[mother==0]
##remained_sep_indexes_with_child_without_mother = remained_sep_indexes[mother==0 & child==1] remained_sep_indexes_with_child_without_mother = remained_sep_indexes[(mother==0) & (child==1)]
remained_sep_indexes_without_mother=np.array(list(remained_sep_indexes))[np.array(mother)==0]
remained_sep_indexes_with_child_without_mother=np.array(list(remained_sep_indexes))[(np.array(mother)==0) & (np.array(child)==1)]
#print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother')
#print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother')
@ -980,7 +982,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light(
(regions_model_full[:,:,0]==2)).sum() (regions_model_full[:,:,0]==2)).sum()
pixels_main = all_pixels - pixels_header pixels_main = all_pixels - pixels_header
if (pixels_header>=pixels_main) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ): if (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ):
regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2 regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2
contours_only_text_parent_head.append(con) contours_only_text_parent_head.append(con)
if contours_only_text_parent_d_ordered is not None: if contours_only_text_parent_d_ordered is not None:
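The rewrite in return_x_start_end_mothers_childs_and_type_of_reading_order drops the list/array round-trips in favour of direct boolean-mask indexing, which requires the operands to already be equal-length numpy arrays. A small illustration:

import numpy as np

remained = np.array([3, 5, 7, 9])
mother   = np.array([0, 1, 0, 0])
child    = np.array([1, 0, 0, 1])

print(remained[mother == 0])                   # [3 7 9]
print(remained[(mother == 0) & (child == 1)])  # [3 9]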

@ -247,7 +247,7 @@ def get_textregion_contours_in_org_image_light(cnts, img, slope_first, map=map):
img = cv2.resize(img, (int(img.shape[1]/6), int(img.shape[0]/6)), interpolation=cv2.INTER_NEAREST) img = cv2.resize(img, (int(img.shape[1]/6), int(img.shape[0]/6)), interpolation=cv2.INTER_NEAREST)
##cnts = list( (np.array(cnts)/2).astype(np.int16) ) ##cnts = list( (np.array(cnts)/2).astype(np.int16) )
#cnts = cnts/2 #cnts = cnts/2
cnts = [(i/6).astype(np.int) for i in cnts] cnts = [(i/6).astype(int) for i in cnts]
results = map(partial(do_back_rotation_and_get_cnt_back, results = map(partial(do_back_rotation_and_get_cnt_back,
img=img, img=img,
slope_first=slope_first, slope_first=slope_first,

@ -1,3 +1,4 @@
from contextlib import nullcontext
from PIL import Image from PIL import Image
import numpy as np import numpy as np
from ocrd_models import OcrdExif from ocrd_models import OcrdExif
@ -17,11 +18,12 @@ def pil2cv(img):
def check_dpi(img): def check_dpi(img):
try: try:
if isinstance(img, Image.Image): if isinstance(img, Image.Image):
pil_image = img pil_image = nullcontext(img)
elif isinstance(img, str): elif isinstance(img, str):
pil_image = Image.open(img) pil_image = Image.open(img)
else: else:
pil_image = cv2pil(img) pil_image = nullcontext(cv2pil(img))
with pil_image:
exif = OcrdExif(pil_image) exif = OcrdExif(pil_image)
resolution = exif.resolution resolution = exif.resolution
if resolution == 1: if resolution == 1:
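The nullcontext change fixes an ownership asymmetry: Image.open() returns an image holding a file handle that check_dpi must close, while an image object passed in by the caller must stay open. Wrapping the caller-owned cases in contextlib.nullcontext lets one with-statement cover both. As a sketch, with a hypothetical helper name:

from contextlib import nullcontext
from PIL import Image

def as_context(img_or_path):
    if isinstance(img_or_path, str):
        return Image.open(img_or_path)  # we opened it, so `with` closes it
    return nullcontext(img_or_path)     # caller owns it, so `with` is a no-op

# with as_context(x) as pil_image:
#     resolution = pil_image.info.get('dpi')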

@ -1616,7 +1616,7 @@ def do_work_of_slopes_new(
textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con,
hierarchy, hierarchy,
max_area=1, min_area=0.00008) max_area=1, min_area=0.00008)
y_diff_mean = find_contours_mean_y_diff(textline_con_fil) y_diff_mean = find_contours_mean_y_diff(textline_con_fil) if len(textline_con_fil) > 1 else np.NaN
if np.isnan(y_diff_mean): if np.isnan(y_diff_mean):
slope_for_all = MAX_SLOPE slope_for_all = MAX_SLOPE
else: else:
@ -1681,7 +1681,7 @@ def do_work_of_slopes_new_curved(
textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con,
hierarchy, hierarchy,
max_area=1, min_area=0.0008) max_area=1, min_area=0.0008)
y_diff_mean = find_contours_mean_y_diff(textline_con_fil) y_diff_mean = find_contours_mean_y_diff(textline_con_fil) if len(textline_con_fil) > 1 else np.NaN
if np.isnan(y_diff_mean): if np.isnan(y_diff_mean):
slope_for_all = MAX_SLOPE slope_for_all = MAX_SLOPE
else: else:

@ -28,7 +28,7 @@ class EynollahXmlWriter():
self.counter = EynollahIdCounter() self.counter = EynollahIdCounter()
self.dir_out = dir_out self.dir_out = dir_out
self.image_filename = image_filename self.image_filename = image_filename
self.output_filename = os.path.join(self.dir_out, self.image_filename_stem) + ".xml" self.output_filename = os.path.join(self.dir_out or "", self.image_filename_stem) + ".xml"
self.curved_line = curved_line self.curved_line = curved_line
self.textline_light = textline_light self.textline_light = textline_light
self.pcgts = pcgts self.pcgts = pcgts
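The `self.dir_out or ""` guard lets the writer be constructed with dir_out=None, as the OCR-D processor above now does (core places the output file itself). os.path.join degrades cleanly to a bare relative filename:

import os

print(os.path.join('', 'page0001') + '.xml')     # page0001.xml
print(os.path.join('out', 'page0001') + '.xml')  # out/page0001.xml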
