mirror of https://github.com/qurator-spk/eynollah.git
synced 2025-06-14 14:49:54 +02:00

Merge pull request #130 from qurator-spk/v3-api: port processor to core v3

commit e0d38517d3
22 changed files with 1119 additions and 1134 deletions
6 .dockerignore (new file)

@@ -0,0 +1,6 @@
tests
dist
build
env*
*.egg-info
models_eynollah*
44 .github/workflows/build-docker.yml (vendored, new file)

@@ -0,0 +1,44 @@
name: CD

on:
  push:
    branches: [ "master" ]
  workflow_dispatch: # run manually

jobs:

  build:
    runs-on: ubuntu-latest
    permissions:
      packages: write
      contents: read
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # we need tags for docker version tagging
          fetch-tags: true
          fetch-depth: 0
      - # Activate cache export feature to reduce build time of images
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERIO_USERNAME }}
          password: ${{ secrets.DOCKERIO_PASSWORD }}
      - name: Build the Docker image
        # build both tags at the same time
        run: make docker DOCKER_TAG="docker.io/ocrd/eynollah -t ghcr.io/qurator-spk/eynollah"
      - name: Test the Docker image
        run: docker run --rm ocrd/eynollah ocrd-eynollah-segment -h
      - name: Push to Dockerhub
        run: docker push docker.io/ocrd/eynollah
      - name: Push to Github Container Registry
        run: docker push ghcr.io/qurator-spk/eynollah
22 .github/workflows/test-eynollah.yml (vendored)

@@ -16,18 +16,26 @@ jobs:
    steps:
      - name: clean up
        run: |
          df -h
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          df -h
      - uses: actions/checkout@v4
      - uses: actions/cache@v4
        id: model_cache
        id: seg_model_cache
        with:
          path: models_eynollah
          key: ${{ runner.os }}-models
      - uses: actions/cache@v4
        id: bin_model_cache
        with:
          path: default-2021-03-09
          key: ${{ runner.os }}-modelbin
      - name: Download models
        if: steps.model_cache.outputs.cache-hit != 'true'
        if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true'
        run: make models
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5

@@ -36,9 +44,11 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[OCR,plotting]
          pip install -r requirements-test.txt
          make install EXTRAS=OCR,plotting
          make deps-test
      - name: Test with pytest
        run: make test
      - name: Test docker build
        run: make docker
      - name: Test standalone CLI
        run: make smoke-test
      - name: Test OCR-D CLI
        run: make ocrd-test
41 Dockerfile

@@ -4,23 +4,42 @@ FROM $DOCKER_BASE_IMAGE
ARG VCS_REF
ARG BUILD_DATE
LABEL \
    maintainer="https://ocr-d.de/kontakt" \
    maintainer="https://ocr-d.de/en/contact" \
    org.label-schema.vcs-ref=$VCS_REF \
    org.label-schema.vcs-url="https://github.com/qurator-spk/eynollah" \
    org.label-schema.build-date=$BUILD_DATE
    org.label-schema.build-date=$BUILD_DATE \
    org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
    org.opencontainers.image.title="Eynollah" \
    org.opencontainers.image.description="" \
    org.opencontainers.image.source="https://github.com/qurator-spk/eynollah" \
    org.opencontainers.image.documentation="https://github.com/qurator-spk/eynollah/blob/${VCS_REF}/README.md" \
    org.opencontainers.image.revision=$VCS_REF \
    org.opencontainers.image.created=$BUILD_DATE \
    org.opencontainers.image.base.name=ocrd/core-cuda-tf2

ENV DEBIAN_FRONTEND=noninteractive
# set proper locales
ENV PYTHONIOENCODING=utf8
ENV XDG_DATA_HOME=/usr/local/share
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8

WORKDIR /build-eynollah
COPY src/ ./src
COPY pyproject.toml .
COPY requirements.txt .
COPY README.md .
COPY Makefile .
RUN apt-get install -y --no-install-recommends g++
RUN make install
# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share
# avoid the need for an extra volume for persistent resource user db
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources

WORKDIR /build/eynollah
COPY . .
COPY ocrd-tool.json .
# prepackage ocrd-tool.json as ocrd-all-tool.json
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
# install everything and reduce image size
RUN make install EXTRAS=OCR && rm -rf /build/eynollah
# smoke test
RUN eynollah --help

WORKDIR /data
VOLUME /data
91 Makefile

@@ -1,10 +1,20 @@
EYNOLLAH_MODELS ?= $(PWD)/models_eynollah
export EYNOLLAH_MODELS
PYTHON ?= python3
PIP ?= pip3
EXTRAS ?=

# DOCKER_BASE_IMAGE = artefakt.dev.sbb.berlin:5000/sbb/ocrd_core:v2.68.0
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.68.0
DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda-tf2:v3.3.0
DOCKER_TAG = ocrd/eynollah

#SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz
#SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz
SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz

BIN_MODEL := https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip

PYTEST_ARGS ?=

# BEGIN-EVAL makefile-parser --make-help Makefile

@@ -12,44 +22,90 @@ help:
	@echo ""
	@echo "  Targets"
	@echo ""
	@echo "  models       Download and extract models to $(PWD)/models_eynollah"
	@echo "  install      Install with pip"
	@echo "  docker       Build Docker image"
	@echo "  build        Build Python source and binary distribution"
	@echo "  install      Install package with pip"
	@echo "  install-dev  Install editable with pip"
	@echo "  deps-test    Install test dependencies with pip"
	@echo "  models       Download and extract models to $(CURDIR)/models_eynollah"
	@echo "  smoke-test   Run simple CLI check"
	@echo "  ocrd-test    Run OCR-D CLI check"
	@echo "  test         Run unit tests"
	@echo ""
	@echo "  Variables"
	@echo "  EXTRAS       comma-separated list of features (like 'OCR,plotting') for 'install' [$(EXTRAS)]"
	@echo "  DOCKER_TAG   Docker image tag for 'docker' [$(DOCKER_TAG)]"
	@echo "  PYTEST_ARGS  pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. [$(PYTEST_ARGS)]"
	@echo "  SEG_MODEL    URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]"
	@echo "  BIN_MODEL    URL of 'models' archive to download for binarization 'test' [$(BIN_MODEL)]"
	@echo ""

# END-EVAL


# Download and extract models to $(PWD)/models_eynollah
models: models_eynollah
models: models_eynollah default-2021-03-09

models_eynollah: models_eynollah.tar.gz
	tar xf models_eynollah.tar.gz
	tar zxf models_eynollah.tar.gz

models_eynollah.tar.gz:
	# wget 'https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz'
	# wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz'
	wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz'
	# wget 'https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz'
	# wget 'https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz'
	wget $(SEG_MODEL)

default-2021-03-09: $(notdir $(BIN_MODEL))
	unzip $(notdir $(BIN_MODEL))
	mkdir $@
	mv $(basename $(notdir $(BIN_MODEL))) $@

$(notdir $(BIN_MODEL)):
	wget $(BIN_MODEL)

build:
	$(PIP) install build
	$(PYTHON) -m build .

# Install with pip
install:
	pip install .
	$(PIP) install .$(and $(EXTRAS),[$(EXTRAS)])

# Install editable with pip
install-dev:
	pip install -e .
	$(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)])

smoke-test:
	eynollah layout -i tests/resources/kant_aufklaerung_1784_0020.tif -o . -m $(PWD)/models_eynollah
deps-test: models_eynollah
	$(PIP) install -r requirements-test.txt

smoke-test: TMPDIR != mktemp -d
smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif
	# layout analysis:
	eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/models_eynollah
	fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(<F)).xml
	fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$(basename $(<F)).xml
	# directory mode (skip one, add one):
	eynollah layout -di $(<D) -o $(TMPDIR) -m $(CURDIR)/models_eynollah
	test -s $(TMPDIR)/euler_rechenkunst01_1738_0025.xml
	# binarize:
	eynollah binarization -m $(CURDIR)/default-2021-03-09 $< $(TMPDIR)/$(<F)
	test -s $(TMPDIR)/$(<F)
	@set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))"
	$(RM) -r $(TMPDIR)

ocrd-test: TMPDIR != mktemp -d
ocrd-test: tests/resources/kant_aufklaerung_1784_0020.tif
	cp $< $(TMPDIR)
	ocrd workspace -d $(TMPDIR) init
	ocrd workspace -d $(TMPDIR) add -G OCR-D-IMG -g PHYS_0020 -i OCR-D-IMG_0020 $(<F)
	ocrd-eynollah-segment -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-SEG -P models $(CURDIR)/models_eynollah
	result=$$(ocrd workspace -d $(TMPDIR) find -G OCR-D-SEG); \
	fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$$result && \
	fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$$result
	ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-BIN -P model $(CURDIR)/default-2021-03-09
	ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-SEG -O OCR-D-SEG-BIN -P model $(CURDIR)/default-2021-03-09 -P operation_level region
	$(RM) -r $(TMPDIR)

# Run unit tests
test:
	pytest tests
	EYNOLLAH_MODELS=$(CURDIR)/models_eynollah $(PYTHON) -m pytest tests --durations=0 --continue-on-collection-errors $(PYTEST_ARGS)

# Build docker image
docker:

@@ -59,3 +115,4 @@ docker:
	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
	-t $(DOCKER_TAG) .

.PHONY: models build install install-dev test smoke-test ocrd-test docker help
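The `$(and $(EXTRAS),[$(EXTRAS)])` idiom in the new `install` and `install-dev` targets appends a pip extras suffix only when `EXTRAS` is non-empty: `make install` runs `pip3 install .`, while `make install EXTRAS=OCR,plotting` runs `pip3 install .[OCR,plotting]`. A minimal Python sketch of the same expansion (the function name is illustrative, not from the Makefile):

```python
def pip_install_command(extras: str = "") -> str:
    """Mirror of the Makefile's $(and $(EXTRAS),[$(EXTRAS)]) expansion."""
    return "pip3 install ." + (f"[{extras}]" if extras else "")

assert pip_install_command() == "pip3 install ."
assert pip_install_command("OCR,plotting") == "pip3 install .[OCR,plotting]"
```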
27 README.md

@@ -21,7 +21,7 @@
:warning: Development is currently focused on achieving the best possible quality of results for a wide variety of historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome.

## Installation
Python `3.8-3.11` with Tensorflow `2.12-2.15` on Linux are currently supported.
Python `3.8-3.11` with Tensorflow `<2.13` on Linux are currently supported.

For (limited) GPU support the CUDA toolkit needs to be installed.

@@ -83,23 +83,28 @@ If no option is set, the tool performs layout detection of main regions (backgro
The best output quality is produced when RGB images are used as input rather than greyscale or binarized images.

#### Use as OCR-D processor
🚧 **Work in progress**

Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) processor.
Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) [processor](https://ocr-d.de/en/spec/cli).

In this case, the source image file group with (preferably) RGB images should be used as input like this:

```
ocrd-eynollah-segment -I OCR-D-IMG -O SEG-LINE -P models
```
```
ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models 2022-04-05
```

Any image referenced by `@imageFilename` in PAGE-XML is passed on directly to Eynollah as a processor, so that e.g.

```
ocrd-eynollah-segment -I OCR-D-IMG-BIN -O SEG-LINE -P models
```
If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynollah behaves as follows:
- existing regions are kept and ignored (i.e. in effect they might overlap segments from Eynollah results)
- existing annotation (and respective `AlternativeImage`s) are partially _ignored_:
  - previous page frame detection (`cropped` images)
  - previous derotation (`deskewed` images)
  - previous thresholding (`binarized` images)
- if the page-level image nevertheless deviates from the original (`@imageFilename`)
  (because some other preprocessing step was in effect like `denoised`), then
  the output PAGE-XML will be based on that as new top-level (`@imageFilename`)

uses the original (RGB) image despite any binarization that may have occured in previous OCR-D processing steps

```
ocrd-eynollah-segment -I OCR-D-XYZ -O OCR-D-SEG -P models 2022-04-05
```

Still, in general, it makes more sense to add other workflow steps **after** Eynollah.

#### Additional documentation
Please check the [wiki](https://github.com/qurator-spk/eynollah/wiki).
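For scripted pipelines, the same processor can also be driven from Python through OCR-D's API instead of the shell. A hedged sketch, assuming `run_processor` from `ocrd.processor.helpers` and the processor class living at `eynollah.processor` (both assumptions, not shown in this diff):

```python
from ocrd import Resolver
from ocrd.processor.helpers import run_processor   # assumed helper location
from eynollah.processor import EynollahProcessor   # assumed module path

# resolve an existing OCR-D workspace from its METS file
workspace = Resolver().workspace_from_url("mets.xml")
run_processor(EynollahProcessor,
              workspace=workspace,
              input_file_grp="OCR-D-IMG",
              output_file_grp="OCR-D-SEG",
              parameter={"models": "2022-04-05"})
workspace.save_mets()
```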
pyproject.toml

@@ -32,7 +32,7 @@ plotting = ["matplotlib"]
[project.scripts]
eynollah = "eynollah.cli:main"
ocrd-eynollah-segment = "eynollah.ocrd_cli:main"
ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:cli"
ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:main"

[project.urls]
Homepage = "https://github.com/qurator-spk/eynollah"

@@ -40,6 +40,7 @@ Repository = "https://github.com/qurator-spk/eynollah.git"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}
optional-dependencies.test = {file = ["requirements-test.txt"]}

[tool.setuptools.packages.find]
where = ["src"]
requirements.txt

@@ -1,5 +1,5 @@
# ocrd includes opencv, numpy, shapely, click
ocrd >= 2.23.3
ocrd >= 3.3.0
numpy <1.24.0
scikit-learn >= 0.23.2
tensorflow < 2.13
src/eynollah/cli.py

@@ -1,6 +1,6 @@
import sys
import click
from ocrd_utils import initLogging, setOverrideLogLevel
from ocrd_utils import initLogging, getLevelName, getLogger
from eynollah.eynollah import Eynollah, Eynollah_ocr
from eynollah.sbb_binarize import SbbBinarizer

@@ -15,21 +15,18 @@ def main():
    help="directory of GT page-xml files",
    type=click.Path(exists=True, file_okay=False),
)
@click.option(
    "--dir_out_modal_image",
    "-domi",
    help="directory where ground truth images would be written",
    type=click.Path(exists=True, file_okay=False),
)
@click.option(
    "--dir_out_classes",
    "-docl",
    help="directory where ground truth classes would be written",
    type=click.Path(exists=True, file_okay=False),
)
@click.option(
    "--input_height",
    "-ih",

@@ -45,17 +42,13 @@ def main():
    "-min",
    help="min area size of regions considered for reading order training.",
)
def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width, min_area_size):
    xml_files_ind = os.listdir(dir_xml)

@main.command()
@click.option('--patches/--no-patches', default=True, help='by enabling this parameter you let the model to see the image in patches.')
@click.option('--model_dir', '-m', type=click.Path(exists=True, file_okay=False), required=True, help='directory containing models for prediction')
@click.argument('input_image')
@click.argument('output_image')
@click.option(
    "--dir_in",

@@ -69,7 +62,6 @@ def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, i
    help="directory where the binarized images will be written",
    type=click.Path(exists=True, file_okay=False),
)
def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out):
    if not dir_out and (dir_in):
        print("Error: You used -di but did not set -do")

@@ -264,25 +256,37 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
    if log_level:
        getLogger('eynollah').setLevel(getLevelName(log_level))
    if not enable_plotting and (save_layout or save_deskewed or save_all or save_page or save_images or allow_enhancement):
        print("Error: You used one of -sl, -sd, -sa, -sp, -si or -ae but did not enable plotting with -ep")
        sys.exit(1)
        raise ValueError("Plotting with -sl, -sd, -sa, -sp, -si or -ae also requires -ep")
    elif enable_plotting and not (save_layout or save_deskewed or save_all or save_page or save_images or allow_enhancement):
        print("Error: You used -ep to enable plotting but set none of -sl, -sd, -sa, -sp, -si or -ae")
        sys.exit(1)
        raise ValueError("Plotting with -ep also requires -sl, -sd, -sa, -sp, -si or -ae")
    if textline_light and not light_version:
        print('Error: You used -tll to enable light textline detection but -light is not enabled')
        sys.exit(1)
        raise ValueError("Light textline detection with -tll also requires -light")
    if light_version and not textline_light:
        print('Error: You used -light without -tll. Light version need light textline to be enabled.')
    if extract_only_images and (allow_enhancement or allow_scaling or light_version or curved_line or textline_light or full_layout or tables or right2left or headers_off) :
        print('Error: You used -eoi which can not be enabled alongside light_version -light or allow_scaling -as or allow_enhancement -ae or curved_line -cl or textline_light -tll or full_layout -fl or tables -tab or right2left -r2l or headers_off -ho')
        sys.exit(1)
        raise ValueError("Light version with -light also requires light textline detection -tll")
    if extract_only_images and allow_enhancement:
        raise ValueError("Image extraction with -eoi can not be enabled alongside allow_enhancement -ae")
    if extract_only_images and allow_scaling:
        raise ValueError("Image extraction with -eoi can not be enabled alongside allow_scaling -as")
    if extract_only_images and light_version:
        raise ValueError("Image extraction with -eoi can not be enabled alongside light_version -light")
    if extract_only_images and curved_line:
        raise ValueError("Image extraction with -eoi can not be enabled alongside curved_line -cl")
    if extract_only_images and textline_light:
        raise ValueError("Image extraction with -eoi can not be enabled alongside textline_light -tll")
    if extract_only_images and full_layout:
        raise ValueError("Image extraction with -eoi can not be enabled alongside full_layout -fl")
    if extract_only_images and tables:
        raise ValueError("Image extraction with -eoi can not be enabled alongside tables -tab")
    if extract_only_images and right2left:
        raise ValueError("Image extraction with -eoi can not be enabled alongside right2left -r2l")
    if extract_only_images and headers_off:
        raise ValueError("Image extraction with -eoi can not be enabled alongside headers_off -ho")
    if image is None and dir_in is None:
        raise ValueError("Either a single image -i or a dir_in -di is required")
    eynollah = Eynollah(
        image_filename=image,
        overwrite=overwrite,
        model,
        logger=getLogger('eynollah'),
        dir_out=out,
        dir_in=dir_in,
        dir_models=model,
        dir_of_cropped_images=save_images,
        extract_only_images=extract_only_images,
        dir_of_layout=save_layout,

@@ -308,10 +312,9 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
        skip_layout_and_reading_order=skip_layout_and_reading_order,
    )
    if dir_in:
        eynollah.run()
        eynollah.run(dir_in=dir_in, overwrite=overwrite)
    else:
        pcgts = eynollah.run()
        eynollah.writer.write_pagexml(pcgts)
        eynollah.run(image_filename=image, overwrite=overwrite)


@main.command()

@@ -367,9 +370,9 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
)

def ocr(dir_in, out, dir_xmls, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, log_level):
    if log_level:
        setOverrideLogLevel(log_level)
    initLogging()
    if log_level:
        getLogger('eynollah').setLevel(getLevelName(log_level))
    eynollah_ocr = Eynollah_ocr(
        dir_xmls=dir_xmls,
        dir_in=dir_in,
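The refactored CLI above now constructs a single `Eynollah` instance and delegates the batch-versus-single dispatch to `run()`. A short usage sketch of that API as it appears in this diff (paths are placeholders):

```python
from eynollah.eynollah import Eynollah

# dir_models is now the first positional argument
eynollah = Eynollah("models_eynollah", dir_out="output")
eynollah.run(dir_in="pages", overwrite=False)      # directory mode
eynollah.run(image_filename="pages/p0001.tif")     # single-image mode
```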
src/eynollah/eynollah.py

@@ -6,47 +6,57 @@
document layout analysis (segmentation) with output in PAGE-XML
"""

import tracemalloc
from logging import Logger
from difflib import SequenceMatcher as sq
import math
import os
import sys
import time
from typing import Optional
import atexit
import warnings
from functools import partial
from pathlib import Path
from multiprocessing import cpu_count
from loky import ProcessPoolExecutor
import gc
from ocrd_utils import getLogger
import copy
import json

from loky import ProcessPoolExecutor
from PIL.Image import Image
import xml.etree.ElementTree as ET
import cv2
import numpy as np
from transformers import TrOCRProcessor
from PIL import Image
import torch
from difflib import SequenceMatcher as sq
from transformers import VisionEncoderDecoderModel
from numba import cuda
import copy
from scipy.signal import find_peaks
from scipy.ndimage import gaussian_filter1d
from numba import cuda

from ocrd import OcrdPage
from ocrd_utils import getLogger, tf_disable_interactive_logs

try:
    import torch
except ImportError:
    torch = None
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None
try:
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel
except ImportError:
    TrOCRProcessor = VisionEncoderDecoderModel = None

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
stderr = sys.stderr
sys.stderr = open(os.devnull, "w")
tf_disable_interactive_logs()
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.keras.models import load_model
sys.stderr = stderr
tf.get_logger().setLevel("ERROR")
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
# use tf1 compatibility for keras backend
from tensorflow.compat.v1.keras.backend import set_session
from tensorflow.keras import layers
import json
import xml.etree.ElementTree as ET
from tensorflow.keras.layers import StringLookup

from .utils.contour import (
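The new `try`/`except ImportError` guards make `torch`, `matplotlib` and `transformers` optional at import time, with the module attribute set to `None` when the dependency is missing. A short sketch of the consumption pattern this implies (the helper itself is an assumption, not code from this diff; the error message references the `OCR` extra defined in pyproject.toml):

```python
def require_trocr():
    # TrOCRProcessor and torch are None when the optional imports failed
    if TrOCRProcessor is None or torch is None:
        raise RuntimeError("TrOCR support needs the optional dependencies: "
                           "pip install eynollah[OCR]")
```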
@@ -166,54 +176,37 @@ class PatchEncoder(layers.Layer):
class Eynollah:
    def __init__(
        self,
        dir_models,
        image_filename=None,
        image_pil=None,
        image_filename_stem=None,
        overwrite=False,
        dir_out=None,
        dir_in=None,
        dir_of_cropped_images=None,
        extract_only_images=False,
        dir_of_layout=None,
        dir_of_deskewed=None,
        dir_of_all=None,
        dir_save_page=None,
        enable_plotting=False,
        allow_enhancement=False,
        curved_line=False,
        textline_light=False,
        full_layout=False,
        tables=False,
        right2left=False,
        input_binary=False,
        allow_scaling=False,
        headers_off=False,
        light_version=False,
        ignore_page_extraction=False,
        reading_order_machine_based=False,
        do_ocr=False,
        num_col_upper=None,
        num_col_lower=None,
        skip_layout_and_reading_order = False,
        override_dpi=None,
        logger=None,
        pcgts=None,
        dir_models : str,
        dir_out : Optional[str] = None,
        dir_of_cropped_images : Optional[str] = None,
        extract_only_images : bool =False,
        dir_of_layout : Optional[str] = None,
        dir_of_deskewed : Optional[str] = None,
        dir_of_all : Optional[str] = None,
        dir_save_page : Optional[str] = None,
        enable_plotting : bool = False,
        allow_enhancement : bool = False,
        curved_line : bool = False,
        textline_light : bool = False,
        full_layout : bool = False,
        tables : bool = False,
        right2left : bool = False,
        input_binary : bool = False,
        allow_scaling : bool = False,
        headers_off : bool = False,
        light_version : bool = False,
        ignore_page_extraction : bool = False,
        reading_order_machine_based : bool = False,
        do_ocr : bool = False,
        num_col_upper : Optional[int] = None,
        num_col_lower : Optional[int] = None,
        skip_layout_and_reading_order : bool = False,
        logger : Logger = None,
    ):
        if skip_layout_and_reading_order:
            textline_light = True
        self.light_version = light_version
        if not dir_in:
            if image_pil:
                self._imgs = self._cache_images(image_pil=image_pil)
            else:
                self._imgs = self._cache_images(image_filename=image_filename)
            if override_dpi:
                self.dpi = override_dpi
            self.image_filename = image_filename
        self.overwrite = overwrite
        self.dir_out = dir_out
        self.dir_in = dir_in
        self.dir_of_all = dir_of_all
        self.dir_save_page = dir_save_page
        self.reading_order_machine_based = reading_order_machine_based

@@ -244,22 +237,6 @@ class Eynollah:
            self.num_col_lower = int(num_col_lower)
        else:
            self.num_col_lower = num_col_lower
        self.pcgts = pcgts
        if not dir_in:
            self.plotter = None if not enable_plotting else EynollahPlotter(
                dir_out=self.dir_out,
                dir_of_all=dir_of_all,
                dir_save_page=dir_save_page,
                dir_of_deskewed=dir_of_deskewed,
                dir_of_cropped_images=dir_of_cropped_images,
                dir_of_layout=dir_of_layout,
                image_filename_stem=Path(Path(image_filename).name).stem)
            self.writer = EynollahXmlWriter(
                dir_out=self.dir_out,
                image_filename=self.image_filename,
                curved_line=self.curved_line,
                textline_light = self.textline_light,
                pcgts=pcgts)
        self.logger = logger if logger else getLogger('eynollah')
        # for parallelization of CPU-intensive tasks:
        self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200)

@@ -311,21 +288,25 @@ class Eynollah:
        self.model_textline_dir = dir_models + "/modelens_textline_0_1__2_4_16092024"
        if self.ocr:
            self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124"

        if self.tables:
            if self.light_version:
                self.model_table_dir = dir_models + "/modelens_table_0t4_201124"
            else:
                self.model_table_dir = dir_models + "/eynollah-tables_20210319"

        self.models = {}

        if dir_in:
            # as in start_new_session:
            config = tf.compat.v1.ConfigProto()
            config.gpu_options.allow_growth = True
            session = tf.compat.v1.Session(config=config)
            set_session(session)
        # #gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
        # #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True)
        # #session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
        # config = tf.compat.v1.ConfigProto()
        # config.gpu_options.allow_growth = True
        # #session = tf.InteractiveSession()
        # session = tf.compat.v1.Session(config=config)
        # set_session(session)
        try:
            for device in tf.config.list_physical_devices('GPU'):
                tf.config.experimental.set_memory_growth(device, True)
        except:
            self.logger.warning("no GPU device available")
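The new memory-growth loop replaces the TF1-style `set_session` plumbing, but the bare `except:` silences every error. A standalone sketch of the same TF 2.x setup with the exception types that `set_memory_growth` can actually raise (an editorial suggestion, not code from this diff):

```python
import tensorflow as tf

for device in tf.config.list_physical_devices('GPU'):
    try:
        # must run before the first op initializes the GPU
        tf.config.experimental.set_memory_growth(device, True)
    except (ValueError, RuntimeError) as why:
        print(f"could not enable memory growth on {device.name}: {why}")
```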
        self.model_page = self.our_load_model(self.model_page_dir)
        self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier)

@@ -354,9 +335,7 @@ class Eynollah:
        if self.tables:
            self.model_table = self.our_load_model(self.model_table_dir)

        self.ls_imgs = os.listdir(self.dir_in)

    def _cache_images(self, image_filename=None, image_pil=None):
    def cache_images(self, image_filename=None, image_pil=None, dpi=None):
        ret = {}
        t_c0 = time.time()
        if image_filename:

@@ -374,12 +353,13 @@ class Eynollah:
            ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY)
        for prefix in ('', '_grayscale'):
            ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8)
        return ret
        self._imgs = ret
        if dpi is not None:
            self.dpi = dpi

    def reset_file_name_dir(self, image_filename):
        t_c = time.time()
        self._imgs = self._cache_images(image_filename=image_filename)
        self.image_filename = image_filename
        self.cache_images(image_filename=image_filename)

        self.plotter = None if not self.enable_plotting else EynollahPlotter(
            dir_out=self.dir_out,

@@ -392,10 +372,9 @@ class Eynollah:

        self.writer = EynollahXmlWriter(
            dir_out=self.dir_out,
            image_filename=self.image_filename,
            image_filename=image_filename,
            curved_line=self.curved_line,
            textline_light = self.textline_light,
            pcgts=self.pcgts)
            textline_light = self.textline_light)

    def imread(self, grayscale=False, uint8=True):
        key = 'img'

@@ -410,8 +389,6 @@ class Eynollah:

    def predict_enhancement(self, img):
        self.logger.debug("enter predict_enhancement")
        if not self.dir_in:
            self.model_enhancement, _ = self.start_new_session_and_model(self.model_dir_of_enhancement)

        img_height_model = self.model_enhancement.layers[-1].output_shape[1]
        img_width_model = self.model_enhancement.layers[-1].output_shape[2]

@@ -609,9 +586,6 @@ class Eynollah:

        _, page_coord = self.early_page_for_num_of_column_classification(img)

        if not self.dir_in:
            self.model_classifier, _ = self.start_new_session_and_model(self.model_dir_of_col_classifier)

        if self.input_binary:
            img_in = np.copy(img)
            img_in = img_in / 255.0

@@ -651,9 +625,6 @@ class Eynollah:
        self.logger.info("Detected %s DPI", dpi)
        if self.input_binary:
            img = self.imread()
            if not self.dir_in:
                self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization)
            prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5)
            prediction_bin = 255 * (prediction_bin[:,:,0]==0)
            prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8)

@@ -670,9 +641,6 @@ class Eynollah:
        self.image_page_org_size = img[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :]
        self.page_coord = page_coord

        if not self.dir_in:
            self.model_classifier, _ = self.start_new_session_and_model(self.model_dir_of_col_classifier)

        if self.num_col_upper and not self.num_col_lower:
            num_col = self.num_col_upper
            label_p_pred = [np.ones(6)]

@@ -812,43 +780,6 @@ class Eynollah:
        self.writer.height_org = self.height_org
        self.writer.width_org = self.width_org

    def start_new_session_and_model_old(self, model_dir):
        self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        session = tf.InteractiveSession()
        model = load_model(model_dir, compile=False)

        return model, session

    def start_new_session_and_model(self, model_dir):
        self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir)
        #gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
        #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True)
        #session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
        physical_devices = tf.config.list_physical_devices('GPU')
        try:
            for device in physical_devices:
                tf.config.experimental.set_memory_growth(device, True)
        except:
            self.logger.warning("no GPU device available")

        if model_dir.endswith('.h5') and Path(model_dir[:-3]).exists():
            # prefer SavedModel over HDF5 format if it exists
            model_dir = model_dir[:-3]
        if model_dir in self.models:
            model = self.models[model_dir]
        else:
            try:
                model = load_model(model_dir, compile=False)
            except:
                model = load_model(model_dir , compile=False, custom_objects={
                    "PatchEncoder": PatchEncoder, "Patches": Patches})
            self.models[model_dir] = model

        return model, None

    def do_prediction(
            self, patches, img, model,
            n_batch_inference=1, marginal_of_patch_percent=0.1,

@@ -1386,9 +1317,6 @@ class Eynollah:
        self.logger.debug("enter extract_page")
        cont_page = []
        if not self.ignore_page_extraction:
            if not self.dir_in:
                self.model_page, _ = self.start_new_session_and_model(self.model_page_dir)

            img = cv2.GaussianBlur(self.image, (5, 5), 0)
            img_page_prediction = self.do_prediction(False, img, self.model_page)
            imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)

@@ -1436,8 +1364,6 @@ class Eynollah:
            img = np.copy(img_bin).astype(np.uint8)
        else:
            img = self.imread()
        if not self.dir_in:
            self.model_page, _ = self.start_new_session_and_model(self.model_page_dir)
        img = cv2.GaussianBlur(img, (5, 5), 0)
        img_page_prediction = self.do_prediction(False, img, self.model_page)

@@ -1465,11 +1391,6 @@ class Eynollah:
        self.logger.debug("enter extract_text_regions")
        img_height_h = img.shape[0]
        img_width_h = img.shape[1]
        if not self.dir_in:
            if patches:
                self.model_region_fl, _ = self.start_new_session_and_model(self.model_region_dir_fully)
            else:
                self.model_region_fl_np, _ = self.start_new_session_and_model(self.model_region_dir_fully_np)
        model_region = self.model_region_fl if patches else self.model_region_fl_np

        if self.light_version:

@@ -1501,11 +1422,6 @@ class Eynollah:
        self.logger.debug("enter extract_text_regions")
        img_height_h = img.shape[0]
        img_width_h = img.shape[1]
        if not self.dir_in:
            if patches:
                self.model_region_fl, _ = self.start_new_session_and_model(self.model_region_dir_fully)
            else:
                self.model_region_fl_np, _ = self.start_new_session_and_model(self.model_region_dir_fully_np)
        model_region = self.model_region_fl if patches else self.model_region_fl_np

        if not patches:

@@ -1636,8 +1552,6 @@ class Eynollah:

    def textline_contours(self, img, use_patches, scaler_h, scaler_w, num_col_classifier=None):
        self.logger.debug('enter textline_contours')
        if not self.dir_in:
            self.model_textline, _ = self.start_new_session_and_model(self.model_textline_dir)

        #img = img.astype(np.uint8)
        img_org = np.copy(img)

@@ -1739,9 +1653,6 @@ class Eynollah:
            img_h_new = int(img.shape[0] / float(img.shape[1]) * img_w_new)
            img_resized = resize_image(img,img_h_new, img_w_new )

            if not self.dir_in:
                self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens_light_only_images_extraction)

            prediction_regions_org = self.do_prediction_new_concept(True, img_resized, self.model_region)

            prediction_regions_org = resize_image(prediction_regions_org,img_height_h, img_width_h )

@@ -1830,7 +1741,6 @@ class Eynollah:
        img_height_h = img_org.shape[0]
        img_width_h = img_org.shape[1]

        #model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens)
        #print(num_col_classifier,'num_col_classifier')

        if num_col_classifier == 1:

@@ -1853,8 +1763,6 @@ class Eynollah:
            #if self.input_binary:
            #img_bin = np.copy(img_resized)
            ###if (not self.input_binary and self.full_layout) or (not self.input_binary and num_col_classifier >= 30):
            ###if not self.dir_in:
            ###self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization)
            ###prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5)

            ####print("inside bin ", time.time()-t_bin)

@@ -1870,8 +1778,6 @@ class Eynollah:
            ###else:
            ###img_bin = np.copy(img_resized)
            if self.ocr and not self.input_binary:
                if not self.dir_in:
                    self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization)
                prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5)
                prediction_bin = 255 * (prediction_bin[:,:,0] == 0)
                prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2)

@@ -1894,12 +1800,7 @@ class Eynollah:
            #plt.show()
            if not skip_layout_and_reading_order:
                #print("inside 2 ", time.time()-t_in)
                if not self.dir_in:
                    self.model_region_1_2, _ = self.start_new_session_and_model(self.model_region_dir_p_1_2_sp_np)
                    ##self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens_light)

                if num_col_classifier == 1 or num_col_classifier == 2:
                    model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_1_2_sp_np)
                    if self.image_org.shape[0]/self.image_org.shape[1] > 2.5:
                        self.logger.debug("resized to %dx%d for %d cols",
                                          img_resized.shape[1], img_resized.shape[0], num_col_classifier)

@@ -1998,9 +1899,6 @@ class Eynollah:
        img_height_h = img_org.shape[0]
        img_width_h = img_org.shape[1]

        if not self.dir_in:
            self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens)

        ratio_y=1.3
        ratio_x=1

@@ -2026,9 +1924,6 @@ class Eynollah:
        prediction_regions_org=prediction_regions_org[:,:,0]
        prediction_regions_org[(prediction_regions_org[:,:]==1) & (mask_zeros_y[:,:]==1)]=0

        if not self.dir_in:
            self.model_region_p2, _ = self.start_new_session_and_model(self.model_region_dir_p2)

        img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1]))

        prediction_regions_org2 = self.do_prediction(True, img, self.model_region_p2, marginal_of_patch_percent=0.2)

@@ -2055,15 +1950,11 @@ class Eynollah:
            if self.input_binary:
                prediction_bin = np.copy(img_org)
            else:
                if not self.dir_in:
                    self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization)
                prediction_bin = self.do_prediction(True, img_org, self.model_bin, n_batch_inference=5)
                prediction_bin = resize_image(prediction_bin, img_height_h, img_width_h )
                prediction_bin = 255 * (prediction_bin[:,:,0]==0)
                prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2)

            if not self.dir_in:
                self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens)
            ratio_y=1
            ratio_x=1

@@ -2096,17 +1987,10 @@ class Eynollah:
        except:
            if self.input_binary:
                prediction_bin = np.copy(img_org)

                if not self.dir_in:
                    self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization)
                prediction_bin = self.do_prediction(True, img_org, self.model_bin, n_batch_inference=5)
                prediction_bin = resize_image(prediction_bin, img_height_h, img_width_h )
                prediction_bin = 255 * (prediction_bin[:,:,0]==0)
                prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2)

                if not self.dir_in:
                    self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens)

            else:
                prediction_bin = np.copy(img_org)
            ratio_y=1

@@ -2736,10 +2620,6 @@ class Eynollah:
        img_org = np.copy(img)
        img_height_h = img_org.shape[0]
        img_width_h = img_org.shape[1]

        if not self.dir_in:
            self.model_table, _ = self.start_new_session_and_model(self.model_table_dir)

        patches = False
        if self.light_version:
            prediction_table = self.do_prediction_new_concept(patches, img, self.model_table)

@@ -3376,7 +3256,11 @@ class Eynollah:
            regions_without_separators_d, regions_fully, regions_without_separators,
            polygons_of_marginals, contours_tables)

    def our_load_model(self, model_file):
    @staticmethod
    def our_load_model(model_file):
        if model_file.endswith('.h5') and Path(model_file[:-3]).exists():
            # prefer SavedModel over HDF5 format if it exists
            model_file = model_file[:-3]
        try:
            model = load_model(model_file, compile=False)
        except:
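`our_load_model` combines two conventions visible in this diff: prefer a SavedModel directory over a sibling `.h5` file, and retry with `custom_objects` for the custom ViT layers. A condensed sketch of the same strategy, with the module-level cache from `self.models` folded in (the dict and function name are illustrative):

```python
from pathlib import Path
from tensorflow.keras.models import load_model
from eynollah.eynollah import PatchEncoder, Patches  # custom layers defined in this module

_model_cache = {}

def load_eynollah_model(model_path: str):
    if model_path.endswith('.h5') and Path(model_path[:-3]).exists():
        model_path = model_path[:-3]          # prefer the SavedModel directory
    if model_path not in _model_cache:
        try:
            model = load_model(model_path, compile=False)
        except Exception:
            # models with custom ViT layers need the custom_objects mapping
            model = load_model(model_path, compile=False,
                               custom_objects={"PatchEncoder": PatchEncoder,
                                               "Patches": Patches})
        _model_cache[model_path] = model
    return _model_cache[model_path]
```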
@@ -3427,9 +3311,6 @@ class Eynollah:
        img_header_and_sep = resize_image(img_header_and_sep, height1, width1)
        img_poly = resize_image(img_poly, height3, width3)

        if not self.dir_in:
            self.model_reading_order, _ = self.start_new_session_and_model(self.model_reading_order_dir)

        inference_bs = 3
        input_1 = np.zeros((inference_bs, height1, width1, 3))
        ordered = [list(range(len(co_text_all)))]

@@ -3730,7 +3611,7 @@ class Eynollah:
            for ij in range(len(all_found_textline_polygons[j])):
                con_ind = all_found_textline_polygons[j][ij]
                area = cv2.contourArea(con_ind)
                con_ind = con_ind.astype(np.float)
                con_ind = con_ind.astype(float)

                x_differential = np.diff( con_ind[:,0,0])
                y_differential = np.diff( con_ind[:,0,1])

@@ -3834,7 +3715,7 @@ class Eynollah:
            con_ind = all_found_textline_polygons[j]
            #print(len(con_ind[:,0,0]),'con_ind[:,0,0]')
            area = cv2.contourArea(con_ind)
            con_ind = con_ind.astype(np.float)
            con_ind = con_ind.astype(float)

            x_differential = np.diff( con_ind[:,0,0])
            y_differential = np.diff( con_ind[:,0,1])

@@ -3937,7 +3818,7 @@ class Eynollah:
                con_ind = all_found_textline_polygons[j][ij]
                area = cv2.contourArea(con_ind)
                con_ind = con_ind.astype(np.float)
                con_ind = con_ind.astype(float)

                x_differential = np.diff( con_ind[:,0,0])
                y_differential = np.diff( con_ind[:,0,1])

@@ -4080,10 +3961,8 @@ class Eynollah:

            ind_textline_inside_tr = list(range(len(contours[jj])))
            index_textline_inside_textregion = index_textline_inside_textregion + ind_textline_inside_tr
            #ind_ins = [0] * len(contours[jj]) + jj
            ind_ins = np.zeros( len(contours[jj]) ) + jj
            list_ind_ins = list(ind_ins)
            indexes_of_textline_tot = indexes_of_textline_tot + list_ind_ins
            ind_ins = [jj] * len(contours[jj])
            indexes_of_textline_tot = indexes_of_textline_tot + ind_ins

        M_main_tot = [cv2.moments(contours_txtline_of_all_textregions[j])
                      for j in range(len(contours_txtline_of_all_textregions))]

@@ -4171,7 +4050,7 @@ class Eynollah:
        for j in range(len(all_found_textline_polygons)):
            for i in range(len(all_found_textline_polygons[j])):
                con_ind = all_found_textline_polygons[j][i]
                con_ind = con_ind.astype(np.float)
                con_ind = con_ind.astype(float)

                x_differential = np.diff( con_ind[:,0,0])
                y_differential = np.diff( con_ind[:,0,1])

@@ -4311,31 +4190,44 @@ class Eynollah:
        return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem,
                contours_only_text_parent_rem, index_by_text_par_con_rem_sort)

    def run(self):
    def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False):
        """
        Get image and scales, then extract the page of scanned image
        """
        self.logger.debug("enter run")

        t0_tot = time.time()

        if not self.dir_in:
            self.ls_imgs = [1]
        if dir_in:
            self.ls_imgs = os.listdir(dir_in)
        elif image_filename:
            self.ls_imgs = [image_filename]
        else:
            raise ValueError("run requires either a single image filename or a directory")

        for img_name in self.ls_imgs:
            self.logger.info(img_name)
        for img_filename in self.ls_imgs:
            self.logger.info(img_filename)
            t0 = time.time()
            if self.dir_in:
                self.reset_file_name_dir(os.path.join(self.dir_in,img_name))
                #print("text region early -11 in %.1fs", time.time() - t0)

            self.reset_file_name_dir(os.path.join(dir_in or "", img_filename))
            #print("text region early -11 in %.1fs", time.time() - t0)
            if os.path.exists(self.writer.output_filename):
                if self.overwrite:
                if overwrite:
                    self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename)
                else:
                    self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename)
                    continue

            pcgts = self.run_single()
            self.logger.info("Job done in %.1fs", time.time() - t0)
            #print("Job done in %.1fs" % (time.time() - t0))
            self.writer.write_pagexml(pcgts)

        if dir_in:
            self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
            print("all Job done in %.1fs", time.time() - t0_tot)
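After this refactoring, `run()` always writes the PAGE-XML itself, while per-page processing lives in `run_single()`. Library callers that want the `pcgts` result object can therefore call `run_single()` directly after pointing the instance at an image. A hedged sketch, assuming the `reset_file_name_dir` and writer behavior shown in this diff:

```python
from eynollah.eynollah import Eynollah

eynollah = Eynollah("models_eynollah", dir_out="output")
eynollah.reset_file_name_dir("pages/p0001.tif")  # caches the image, prepares writer/plotter
pcgts = eynollah.run_single()                    # returns the PAGE-XML result object
eynollah.writer.write_pagexml(pcgts)
```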
    def run_single(self):
        t0 = time.time()
        img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
        self.logger.info("Enhancing took %.1fs ", time.time() - t0)
        if self.extract_only_images:

@@ -4348,11 +4240,6 @@ class Eynollah:
                cont_page, [], [], ocr_all_textlines)
            if self.plotter:
                self.plotter.write_images_into_directory(polygons_of_images, image_page)

            if self.dir_in:
                self.writer.write_pagexml(pcgts)
                continue
            else:
                return pcgts

        if self.skip_layout_and_reading_order:

@@ -4395,10 +4282,6 @@ class Eynollah:
                all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals,
                all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
                cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
            if self.dir_in:
                self.writer.write_pagexml(pcgts)
                continue
            else:
                return pcgts

        #print("text region early -1 in %.1fs", time.time() - t0)

@@ -4451,11 +4334,6 @@ class Eynollah:
            pcgts = self.writer.build_pagexml_no_full_layout(
                [], page_coord, [], [], [], [], [], [], [], [], [], [],
                cont_page, [], [], ocr_all_textlines)
            self.logger.info("Job done in %.1fs", time.time() - t1)
            if self.dir_in:
                self.writer.write_pagexml(pcgts)
                continue
            else:
                return pcgts

        #print("text region early in %.1fs", time.time() - t0)

@@ -4641,11 +4519,6 @@ class Eynollah:
                polygons_of_images,
                polygons_of_marginals, empty_marginals, empty_marginals, [], [],
                cont_page, polygons_lines_xml, contours_tables, [])
            self.logger.info("Job done in %.1fs", time.time() - t0)
            if self.dir_in:
                self.writer.write_pagexml(pcgts)
                continue
            else:
                return pcgts

        #print("text region early 3 in %.1fs", time.time() - t0)

@@ -4836,15 +4709,8 @@ class Eynollah:
                polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals,
                all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals,
                cont_page, polygons_lines_xml, ocr_all_textlines)
            self.logger.info("Job done in %.1fs", time.time() - t0)
            #print("Job done in %.1fs", time.time() - t0)
            if self.dir_in:
                self.writer.write_pagexml(pcgts)
                continue
            else:
                return pcgts

        else:
            contours_only_text_parent_h = None
            if self.reading_order_machine_based:
                order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model(

@@ -4922,20 +4788,7 @@ class Eynollah:
            all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals,
            all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
            cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
        #print("Job done in %.1fs" % (time.time() - t0))
        self.logger.info("Job done in %.1fs", time.time() - t0)
        if not self.dir_in:
            return pcgts
        #print("text region early 7 in %.1fs", time.time() - t0)

        if self.dir_in:
            self.writer.write_pagexml(pcgts)
            self.logger.info("Job done in %.1fs", time.time() - t0)
            #print("Job done in %.1fs" % (time.time() - t0))

        if self.dir_in:
            self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
            print("all Job done in %.1fs", time.time() - t0_tot)


class Eynollah_ocr:
src/eynollah/ocrd-tool-binarization.json (deleted)

@@ -1,47 +0,0 @@
{
  "version": "0.1.0",
  "git_url": "https://github.com/qurator-spk/sbb_binarization",
  "tools": {
    "ocrd-sbb-binarize": {
      "executable": "ocrd-sbb-binarize",
      "description": "Pixelwise binarization with selectional auto-encoders in Keras",
      "categories": ["Image preprocessing"],
      "steps": ["preprocessing/optimization/binarization"],
      "input_file_grp": [],
      "output_file_grp": [],
      "parameters": {
        "operation_level": {
          "type": "string",
          "enum": ["page", "region"],
          "default": "page",
          "description": "PAGE XML hierarchy level to operate on"
        },
        "model": {
          "description": "Directory containing HDF5 or SavedModel/ProtoBuf models. Can be an absolute path or a path relative to the OCR-D resource location, the current working directory or the $SBB_BINARIZE_DATA environment variable (if set)",
          "type": "string",
          "format": "uri",
          "content-type": "text/directory",
          "required": true
        }
      },
      "resources": [
        {
          "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip",
          "name": "default",
          "type": "archive",
          "path_in_archive": "saved_model_2020_01_16",
          "size": 563147331,
          "description": "default models provided by github.com/qurator-spk (SavedModel format)"
        },
        {
          "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip",
          "name": "default-2021-03-09",
          "type": "archive",
          "path_in_archive": ".",
          "size": 133230419,
          "description": "updated default models provided by github.com/qurator-spk (SavedModel format)"
        }
      ]
    }
  }
}
@ -1,21 +1,22 @@
|
|||
{
|
||||
"version": "0.3.1",
|
||||
"git_url": "https://github.com/qurator-spk/eynollah",
|
||||
"dockerhub": "ocrd/eynollah",
|
||||
"tools": {
|
||||
"ocrd-eynollah-segment": {
|
||||
"executable": "ocrd-eynollah-segment",
|
||||
"categories": ["Layout analysis"],
|
||||
"description": "Segment page into regions and lines and do reading order detection with eynollah",
|
||||
"input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"],
|
||||
"output_file_grp": ["OCR-D-SEG-LINE"],
|
||||
"input_file_grp_cardinality": 1,
|
||||
"output_file_grp_cardinality": 1,
|
||||
"steps": ["layout/segmentation/region", "layout/segmentation/line"],
|
||||
"parameters": {
|
||||
"models": {
|
||||
"type": "string",
|
||||
"format": "file",
|
||||
"format": "uri",
|
||||
"content-type": "text/directory",
|
||||
"cacheable": true,
|
||||
"description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)",
|
||||
"description": "Directory containing models to be used (See https://qurator-data.de/eynollah)",
|
||||
"required": true
|
||||
},
|
||||
"dpi": {
|
||||
|
@ -32,7 +33,7 @@
|
|||
"light_version": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Try to detect all element subtypes in light version"
|
||||
"description": "Try to detect all element subtypes in light version (faster+simpler method for main region detection and deskewing)"
|
||||
},
|
||||
"textline_light": {
|
||||
"type": "boolean",
|
||||
|
@ -49,11 +50,31 @@
|
|||
"default": false,
|
||||
"description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time"
|
||||
},
|
||||
"ignore_page_extraction": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "if this parameter set to true, this tool would ignore page extraction"
|
||||
},
|
||||
"allow_scaling": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)"
|
||||
},
|
||||
"allow_enhancement": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not."
|
||||
},
|
||||
"textline_light": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method."
|
||||
},
|
||||
"right_to_left": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "if this parameter set to true, this tool will extract right-to-left reading order."
|
||||
},
|
||||
"headers_off": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
|
@ -70,6 +91,47 @@
|
|||
"path_in_archive": "models_eynollah"
|
||||
}
|
||||
]
|
||||
},
|
||||
"ocrd-sbb-binarize": {
|
||||
"executable": "ocrd-sbb-binarize",
|
||||
"description": "Pixelwise binarization with selectional auto-encoders in Keras",
|
||||
"categories": ["Image preprocessing"],
|
||||
"steps": ["preprocessing/optimization/binarization"],
|
||||
"input_file_grp_cardinality": 1,
|
||||
"output_file_grp_cardinality": 1,
|
||||
"parameters": {
|
||||
"operation_level": {
|
||||
"type": "string",
|
||||
"enum": ["page", "region"],
|
||||
"default": "page",
|
||||
"description": "PAGE XML hierarchy level to operate on"
|
||||
},
|
||||
"model": {
|
||||
"description": "Directory containing HDF5 or SavedModel/ProtoBuf models. Can be an absolute path or a path relative to the OCR-D resource location, the current working directory or the $SBB_BINARIZE_DATA environment variable (if set)",
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"content-type": "text/directory",
|
||||
"required": true
|
||||
}
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip",
|
||||
"name": "default",
|
||||
"type": "archive",
|
||||
"path_in_archive": "saved_model_2020_01_16",
|
||||
"size": 563147331,
|
||||
"description": "default models provided by github.com/qurator-spk (SavedModel format)"
|
||||
},
|
||||
{
|
||||
"url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip",
|
||||
"name": "default-2021-03-09",
|
||||
"type": "archive",
|
||||
"path_in_archive": ".",
|
||||
"size": 133230419,
|
||||
"description": "updated default models provided by github.com/qurator-spk (SavedModel format)"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
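With both tools merged into a single ocrd-tool.json, the spec can be sanity-checked by loading it and listing the declared tools. A minimal sketch, not part of this commit; the file path is assumed from the package layout:

# hypothetical check (path assumed), not part of this commit
import json

with open('qurator/eynollah/ocrd-tool.json', encoding='utf-8') as spec_file:
    spec = json.load(spec_file)
print(sorted(spec['tools']))  # ['ocrd-eynollah-segment', 'ocrd-sbb-binarize']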

@@ -1,29 +1,16 @@
-from os import environ
-from os.path import join
-from pathlib import Path
-from pkg_resources import resource_string
-from json import loads
+from typing import Optional
 
 from PIL import Image
 import numpy as np
 import cv2
 from click import command
 
-from ocrd_utils import (
-    getLogger,
-    assert_file_grp_cardinality,
-    make_file_id,
-    MIMETYPE_PAGE
-)
-from ocrd import Processor
-from ocrd_modelfactory import page_from_file
-from ocrd_models.ocrd_page import AlternativeImageType, to_xml
+from ocrd import Processor, OcrdPageResult, OcrdPageResultImage
+from ocrd_models.ocrd_page import OcrdPage, AlternativeImageType
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 
 from .sbb_binarize import SbbBinarizer
 
-OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool-binarization.json').decode('utf8'))
-TOOL = 'ocrd-sbb-binarize'
 
 def cv2pil(img):
     return Image.fromarray(img.astype('uint8'))
@@ -35,39 +22,22 @@ def pil2cv(img):
     return cv2.cvtColor(pil_as_np_array, color_conversion)
 
 class SbbBinarizeProcessor(Processor):
+    # already employs GPU (without singleton process atm)
+    max_workers = 1
 
-    def __init__(self, *args, **kwargs):
-        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
-        kwargs['version'] = OCRD_TOOL['version']
-        super().__init__(*args, **kwargs)
-        if hasattr(self, 'output_file_grp'):
-            # processing context
-            self.setup()
+    @property
+    def executable(self):
+        return 'ocrd-sbb-binarize'
 
     def setup(self):
         """
         Set up the model prior to processing.
         """
-        LOG = getLogger('processor.SbbBinarize.__init__')
-        if not 'model' in self.parameter:
-            raise ValueError("'model' parameter is required")
-        # resolve relative path via environment variable
-        model_path = Path(self.parameter['model'])
-        if not model_path.is_absolute():
-            if 'SBB_BINARIZE_DATA' in environ and environ['SBB_BINARIZE_DATA']:
-                LOG.info("Environment variable SBB_BINARIZE_DATA is set to '%s'" \
-                    " - prepending to model value '%s'. If you don't want this mechanism," \
-                    " unset the SBB_BINARIZE_DATA environment variable.",
-                    environ['SBB_BINARIZE_DATA'], model_path)
-                model_path = Path(environ['SBB_BINARIZE_DATA']).joinpath(model_path)
-                model_path = model_path.resolve()
-                if not model_path.is_dir():
-                    raise FileNotFoundError("Does not exist or is not a directory: %s" % model_path)
         # resolve relative path via OCR-D ResourceManager
-        model_path = self.resolve_resource(str(model_path))
-        self.binarizer = SbbBinarizer(model_dir=model_path, logger=LOG)
+        model_path = self.resolve_resource(self.parameter['model'])
+        self.binarizer = SbbBinarizer(model_dir=model_path, logger=self.logger)
 
-    def process(self):
+    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
         """
         Binarize images with sbb_binarization (based on selectional auto-encoders).
 
@@ -88,71 +58,52 @@ class SbbBinarizeProcessor(Processor):
 
         Produce a new PAGE output file by serialising the resulting hierarchy.
         """
-        LOG = getLogger('processor.SbbBinarize')
-        assert_file_grp_cardinality(self.input_file_grp, 1)
-        assert_file_grp_cardinality(self.output_file_grp, 1)
-
+        assert input_pcgts
+        assert input_pcgts[0]
+        assert self.parameter
         oplevel = self.parameter['operation_level']
 
-        for n, input_file in enumerate(self.input_files):
-            file_id = make_file_id(input_file, self.output_file_grp)
-            page_id = input_file.pageId or input_file.ID
-            LOG.info("INPUT FILE %i / %s", n, page_id)
-            pcgts = page_from_file(self.workspace.download_file(input_file))
-            self.add_metadata(pcgts)
-            pcgts.set_pcGtsId(file_id)
+        pcgts = input_pcgts[0]
+        result = OcrdPageResult(pcgts)
         page = pcgts.get_Page()
-            page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
+        page_image, page_xywh, _ = self.workspace.image_from_page(
+            page, page_id, feature_filter='binarized')
 
         if oplevel == 'page':
-            LOG.info("Binarizing on 'page' level in page '%s'", page_id)
-            bin_image = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True))
-            # update METS (add the image file):
-            bin_image_path = self.workspace.save_image_file(bin_image,
-                file_id + '.IMG-BIN',
-                page_id=input_file.pageId,
-                file_grp=self.output_file_grp)
-            page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments='%s,binarized' % page_xywh['features']))
+            self.logger.info("Binarizing on 'page' level in page '%s'", page_id)
+            page_image_bin = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True))
+            # update PAGE (reference the image file):
+            page_image_ref = AlternativeImageType(comments=page_xywh['features'] + ',binarized,clipped')
+            page.add_AlternativeImage(page_image_ref)
+            result.images.append(OcrdPageResultImage(page_image_bin, '.IMG-BIN', page_image_ref))
 
         elif oplevel == 'region':
             regions = page.get_AllRegions(['Text', 'Table'], depth=1)
             if not regions:
-                LOG.warning("Page '%s' contains no text/table regions", page_id)
+                self.logger.warning("Page '%s' contains no text/table regions", page_id)
             for region in regions:
-                region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh, feature_filter='binarized')
-                region_image_bin = cv2pil(binarizer.run(image=pil2cv(region_image), use_patches=True))
-                region_image_bin_path = self.workspace.save_image_file(
-                    region_image_bin,
-                    "%s_%s.IMG-BIN" % (file_id, region.id),
-                    page_id=input_file.pageId,
-                    file_grp=self.output_file_grp)
-                region.add_AlternativeImage(
-                    AlternativeImageType(filename=region_image_bin_path, comments='%s,binarized' % region_xywh['features']))
+                region_image, region_xywh = self.workspace.image_from_segment(
+                    region, page_image, page_xywh, feature_filter='binarized')
+                region_image_bin = cv2pil(self.binarizer.run(image=pil2cv(region_image), use_patches=True))
+                # update PAGE (reference the image file):
+                region_image_ref = AlternativeImageType(comments=region_xywh['features'] + ',binarized')
+                region.add_AlternativeImage(region_image_ref)
+                result.images.append(OcrdPageResultImage(region_image_bin, region.id + '.IMG-BIN', region_image_ref))
 
         elif oplevel == 'line':
-            region_line_tuples = [(r.id, r.get_TextLine()) for r in page.get_AllRegions(['Text'], depth=0)]
-            if not region_line_tuples:
-                LOG.warning("Page '%s' contains no text lines", page_id)
-            for region_id, line in region_line_tuples:
+            lines = page.get_AllTextLines()
+            if not lines:
+                self.logger.warning("Page '%s' contains no text lines", page_id)
+            for line in lines:
                 line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized')
-                line_image_bin = cv2pil(binarizer.run(image=pil2cv(line_image), use_patches=True))
-                line_image_bin_path = self.workspace.save_image_file(
-                    line_image_bin,
-                    "%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id),
-                    page_id=input_file.pageId,
-                    file_grp=self.output_file_grp)
-                line.add_AlternativeImage(
-                    AlternativeImageType(filename=line_image_bin_path, comments='%s,binarized' % line_xywh['features']))
+                line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image), use_patches=True))
+                # update PAGE (reference the image file):
+                line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized')
+                line.add_AlternativeImage(line_image_ref)
+                result.images.append(OcrdPageResultImage(line_image_bin, line.id + '.IMG-BIN', line_image_ref))
 
-            self.workspace.add_file(
-                ID=file_id,
-                file_grp=self.output_file_grp,
-                pageId=input_file.pageId,
-                mimetype=MIMETYPE_PAGE,
-                local_filename=join(self.output_file_grp, file_id + '.xml'),
-                content=to_xml(pcgts))
+        return result
 
 @command()
 @ocrd_cli_options
-def cli(*args, **kwargs):
+def main(*args, **kwargs):
     return ocrd_cli_wrap_processor(SbbBinarizeProcessor, *args, **kwargs)
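The hunk above is the gist of the v3 port: the framework now owns the page loop, METS bookkeeping and PAGE serialization, and a processor reduces to setup() plus a per-page process_page_pcgts() returning an OcrdPageResult. A minimal sketch of that contract, with a hypothetical class and executable name:

from typing import Optional

from ocrd import Processor, OcrdPageResult
from ocrd_models.ocrd_page import OcrdPage

class MinimalProcessor(Processor):  # hypothetical example, not in the codebase
    @property
    def executable(self):
        return 'ocrd-minimal'  # assumed executable name

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        pcgts = input_pcgts[0]
        assert pcgts
        # mutate the PAGE hierarchy in place; derived images go into result.images
        return OcrdPageResult(pcgts)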

@@ -1,5 +1,8 @@
-import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
+try:
+    import matplotlib.pyplot as plt
+    import matplotlib.patches as mpatches
+except ImportError:
+    plt = mpatches = None
 import numpy as np
 import os.path
 import cv2
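With the import guarded, matplotlib (the plotting extra) becomes optional, so call sites need a cheap availability check before plotting. A sketch of that pattern, with a hypothetical helper name:

def save_layout_plot(path, image):  # hypothetical helper, not in plot.py
    if plt is None:
        # matplotlib missing: package was installed without the 'plotting' extra
        raise ImportError("plotting requires matplotlib (install the 'plotting' extra)")
    plt.imsave(path, image)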

@@ -1,71 +1,91 @@
-from json import loads
-from pkg_resources import resource_string
-from tempfile import NamedTemporaryFile
-from pathlib import Path
-from os.path import join
+from typing import Optional
+from ocrd_models import OcrdPage
+from ocrd import Processor, OcrdPageResult
 
-from PIL import Image
-
-from ocrd import Processor
-from ocrd_modelfactory import page_from_file, exif_from_filename
-from ocrd_models import OcrdFile, OcrdExif
-from ocrd_models.ocrd_page import to_xml
-from ocrd_utils import (
-    getLogger,
-    MIMETYPE_PAGE,
-    assert_file_grp_cardinality,
-    make_file_id
-)
-
-from .eynollah import Eynollah
-from .utils.pil_cv2 import pil2cv
-
-OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
+from .eynollah import Eynollah, EynollahXmlWriter
 
 class EynollahProcessor(Processor):
+    # already employs background CPU multiprocessing per page
+    # already employs GPU (without singleton process atm)
+    max_workers = 1
 
-    def __init__(self, *args, **kwargs):
-        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment']
-        kwargs['version'] = OCRD_TOOL['version']
-        super().__init__(*args, **kwargs)
+    @property
+    def executable(self):
+        return 'ocrd-eynollah-segment'
 
-    def process(self):
-        LOG = getLogger('eynollah')
-        assert_file_grp_cardinality(self.input_file_grp, 1)
-        assert_file_grp_cardinality(self.output_file_grp, 1)
-        for n, input_file in enumerate(self.input_files):
-            page_id = input_file.pageId or input_file.ID
-            LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files))
-            pcgts = page_from_file(self.workspace.download_file(input_file))
-            LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight)
-            self.add_metadata(pcgts)
+    def setup(self) -> None:
+        if self.parameter['textline_light'] and not self.parameter['light_version']:
+            raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection, "
+                             "but parameter 'light_version' is not enabled")
+        self.eynollah = Eynollah(
+            self.resolve_resource(self.parameter['models']),
+            logger=self.logger,
+            allow_enhancement=self.parameter['allow_enhancement'],
+            curved_line=self.parameter['curved_line'],
+            right2left=self.parameter['right_to_left'],
+            ignore_page_extraction=self.parameter['ignore_page_extraction'],
+            light_version=self.parameter['light_version'],
+            textline_light=self.parameter['textline_light'],
+            full_layout=self.parameter['full_layout'],
+            allow_scaling=self.parameter['allow_scaling'],
+            headers_off=self.parameter['headers_off'],
+            tables=self.parameter['tables'],
+        )
+        self.eynollah.plotter = None
+
+    def shutdown(self):
+        if hasattr(self, 'eynollah'):
+            del self.eynollah
+
+    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
+        """
+        Performs cropping, region and line segmentation with Eynollah.
+
+        For each page, open and deserialize PAGE input file (from existing
+        PAGE file in the input fileGrp, or generated from image file).
+        Retrieve its respective page-level image (ignoring annotation that
+        already added `binarized`, `cropped` or `deskewed` features).
+
+        Set up Eynollah to detect regions and lines, and add each one to the
+        page, respectively.
+
+        \b
+        - If ``tables``, try to detect table blocks and add them as TableRegion.
+        - If ``full_layout``, then in addition to paragraphs and marginals, also
+          try to detect drop capitals and headings.
+        - If ``ignore_page_extraction``, then attempt no cropping of the page.
+        - If ``curved_line``, then compute contour polygons for text lines
+          instead of simple bounding boxes.
+
+        Produce a new output file by serialising the resulting hierarchy.
+        """
+        assert input_pcgts
+        assert input_pcgts[0]
+        assert self.parameter
+        pcgts = input_pcgts[0]
+        result = OcrdPageResult(pcgts)
         page = pcgts.get_Page()
-            # XXX loses DPI information
-            # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
-            image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename
-            eynollah_kwargs = {
-                'dir_models': self.resolve_resource(self.parameter['models']),
-                'dir_out': self.output_file_grp,
-                'allow_enhancement': False,
-                'curved_line': self.parameter['curved_line'],
-                'full_layout': self.parameter['full_layout'],
-                'allow_scaling': self.parameter['allow_scaling'],
-                'light_version': self.parameter['light_version'],
-                'textline_light': self.parameter['textline_light'],
-                'headers_off': self.parameter['headers_off'],
-                'tables': self.parameter['tables'],
-                'override_dpi': self.parameter['dpi'],
-                'logger': LOG,
-                'pcgts': pcgts,
-                'image_filename': image_filename
-            }
-            Eynollah(**eynollah_kwargs).run()
-            file_id = make_file_id(input_file, self.output_file_grp)
-            pcgts.set_pcGtsId(file_id)
-            self.workspace.add_file(
-                ID=file_id,
-                file_grp=self.output_file_grp,
-                pageId=page_id,
-                mimetype=MIMETYPE_PAGE,
-                local_filename=join(self.output_file_grp, file_id) + '.xml',
-                content=to_xml(pcgts))
+        page_image, _, _ = self.workspace.image_from_page(
+            page, page_id,
+            # avoid any features that would change the coordinate system: cropped,deskewed
+            # (the PAGE builder merely adds regions, so afterwards we would not know which to transform)
+            # also avoid binarization as models usually fare better on grayscale/RGB
+            feature_filter='cropped,deskewed,binarized')
+        if hasattr(page_image, 'filename'):
+            image_filename = page_image.filename
+        else:
+            image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
+        result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
+        # FIXME: mask out already existing regions (incremental segmentation)
+        self.eynollah.cache_images(
+            image_pil=page_image,
+            dpi=self.parameter['dpi'],
+        )
+        self.eynollah.writer = EynollahXmlWriter(
+            dir_out=None,
+            image_filename=image_filename,
+            curved_line=self.eynollah.curved_line,
+            textline_light=self.eynollah.textline_light,
+            pcgts=pcgts)
+        self.eynollah.run_single()
+        return result
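The split into setup(), process_page_pcgts() and shutdown() means the expensive model loading happens once per worker, while page processing runs repeatedly in between. A schematic of the call order; the real driver lives inside ocrd core, so this is only an illustration:

# schematic only, not the actual framework code
def run_processor_schematically(processor, pages):
    processor.setup()                      # load models once (GPU held by a single worker)
    try:
        for pcgts, page_id in pages:       # framework-owned page loop
            result = processor.process_page_pcgts(pcgts, page_id=page_id)
            # the framework serializes result.pcgts and saves result.images here
    finally:
        processor.shutdown()               # release the models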

@@ -4,25 +4,19 @@ Tool to load model and binarize a given image.
 
 import sys
-from glob import glob
 from os import environ, devnull
 from os.path import join
 from warnings import catch_warnings, simplefilter
+import os
+import logging
 
 import numpy as np
 from PIL import Image
 import cv2
-environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-stderr = sys.stderr
-sys.stderr = open(devnull, 'w')
+from ocrd_utils import tf_disable_interactive_logs
+tf_disable_interactive_logs()
 import tensorflow as tf
 from tensorflow.keras.models import load_model
 from tensorflow.python.keras import backend as tensorflow_backend
-sys.stderr = stderr
 
 
-import logging
-
 def resize_image(img_in, input_height, input_width):
     return cv2.resize(img_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST)
@@ -53,7 +47,7 @@ class SbbBinarizer:
         del self.session
 
     def load_model(self, model_name):
-        model = load_model(join(self.model_dir, model_name), compile=False)
+        model = load_model(os.path.join(self.model_dir, model_name), compile=False)
         model_height = model.layers[len(model.layers)-1].output_shape[1]
         model_width = model.layers[len(model.layers)-1].output_shape[2]
         n_classes = model.layers[len(model.layers)-1].output_shape[3]

@@ -1,13 +1,17 @@
 import time
-import matplotlib.pyplot as plt
+import math
+
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    plt = None
 import numpy as np
 from shapely import geometry
 import cv2
 import imutils
 from scipy.signal import find_peaks
 from scipy.ndimage import gaussian_filter1d
-import time
 
 from .is_nan import isNaN
 from .contour import (contours_in_same_horizon,
                       find_new_features_of_contours,
@@ -237,10 +241,8 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order(
     if len(remained_sep_indexes)>1:
         #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)')
         #print(np.array(mother),'mother')
-        ##remained_sep_indexes_without_mother = remained_sep_indexes[mother==0]
-        ##remained_sep_indexes_with_child_without_mother = remained_sep_indexes[mother==0 & child==1]
-        remained_sep_indexes_without_mother=np.array(list(remained_sep_indexes))[np.array(mother)==0]
-        remained_sep_indexes_with_child_without_mother=np.array(list(remained_sep_indexes))[(np.array(mother)==0) & (np.array(child)==1)]
+        remained_sep_indexes_without_mother = remained_sep_indexes[mother==0]
+        remained_sep_indexes_with_child_without_mother = remained_sep_indexes[(mother==0) & (child==1)]
         #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother')
         #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother')
 
@@ -980,7 +982,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light(
             (regions_model_full[:,:,0]==2)).sum()
         pixels_main = all_pixels - pixels_header
 
-        if (pixels_header>=pixels_main) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ):
+        if (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ):
             regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2
             contours_only_text_parent_head.append(con)
             if contours_only_text_parent_d_ordered is not None:
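The rewritten condition relaxes the header test: header pixels no longer need to outnumber main-text pixels, a 30% ratio combined with the existing aspect-ratio check now suffices. As a standalone predicate:

def is_header_region(pixels_header, pixels_main, length, height):
    # mirrors the new condition above; the old code required pixels_header >= pixels_main
    return pixels_header / float(pixels_main) >= 0.3 and length / float(height) >= 1.3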

@@ -247,7 +247,7 @@ def get_textregion_contours_in_org_image_light(cnts, img, slope_first, map=map):
     img = cv2.resize(img, (int(img.shape[1]/6), int(img.shape[0]/6)), interpolation=cv2.INTER_NEAREST)
     ##cnts = list( (np.array(cnts)/2).astype(np.int16) )
     #cnts = cnts/2
-    cnts = [(i/6).astype(np.int) for i in cnts]
+    cnts = [(i/6).astype(int) for i in cnts]
     results = map(partial(do_back_rotation_and_get_cnt_back,
                           img=img,
                           slope_first=slope_first,

@@ -1,3 +1,4 @@
+from contextlib import nullcontext
 from PIL import Image
 import numpy as np
 from ocrd_models import OcrdExif
@@ -17,11 +18,12 @@ def pil2cv(img):
 def check_dpi(img):
     try:
         if isinstance(img, Image.Image):
-            pil_image = img
+            pil_image = nullcontext(img)
         elif isinstance(img, str):
             pil_image = Image.open(img)
         else:
-            pil_image = cv2pil(img)
+            pil_image = nullcontext(cv2pil(img))
+        with pil_image:
             exif = OcrdExif(pil_image)
             resolution = exif.resolution
         if resolution == 1:
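nullcontext gives every branch a context manager, so a single with block closes the file handle opened by Image.open() while passing already-open or converted images through untouched. The same pattern in isolation, with a hypothetical helper name:

from contextlib import nullcontext
from PIL import Image

def as_image_cm(img):  # illustrative, not in pil_cv2.py
    if isinstance(img, Image.Image):
        return nullcontext(img)  # nothing to close afterwards
    return Image.open(img)       # real file handle, closed by the with-block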

@@ -1616,7 +1616,7 @@ def do_work_of_slopes_new(
             textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con,
                                                              hierarchy,
                                                              max_area=1, min_area=0.00008)
-            y_diff_mean = find_contours_mean_y_diff(textline_con_fil)
+            y_diff_mean = find_contours_mean_y_diff(textline_con_fil) if len(textline_con_fil) > 1 else np.NaN
             if np.isnan(y_diff_mean):
                 slope_for_all = MAX_SLOPE
             else:
@@ -1681,7 +1681,7 @@ def do_work_of_slopes_new_curved(
             textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con,
                                                              hierarchy,
                                                              max_area=1, min_area=0.0008)
-            y_diff_mean = find_contours_mean_y_diff(textline_con_fil)
+            y_diff_mean = find_contours_mean_y_diff(textline_con_fil) if len(textline_con_fil) > 1 else np.NaN
             if np.isnan(y_diff_mean):
                 slope_for_all = MAX_SLOPE
             else:
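Both hunks add the same guard: find_contours_mean_y_diff() needs at least two contours to form a y-difference at all, and the NaN fallback then routes into the MAX_SLOPE branch. An illustrative stand-in for the guarded computation (not the actual implementation):

import numpy as np

def mean_y_diff_or_nan(contours):  # illustrative stand-in, signature assumed
    if len(contours) < 2:
        return np.nan  # matches the np.NaN fallback above
    centers_y = sorted(float(np.mean(c[:, 0, 1])) for c in contours)  # mean y per contour
    return float(np.mean(np.diff(centers_y)))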

@@ -28,7 +28,7 @@ class EynollahXmlWriter():
         self.counter = EynollahIdCounter()
         self.dir_out = dir_out
         self.image_filename = image_filename
-        self.output_filename = os.path.join(self.dir_out, self.image_filename_stem) + ".xml"
+        self.output_filename = os.path.join(self.dir_out or "", self.image_filename_stem) + ".xml"
         self.curved_line = curved_line
         self.textline_light = textline_light
         self.pcgts = pcgts
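The `or ""` guard keeps the writer constructible when the OCR-D processor passes dir_out=None (see the processor hunk above): os.path.join() then yields a bare relative filename instead of raising TypeError. For example:

import os

dir_out = None  # the OCR-D case, where output goes through the workspace instead
print(os.path.join(dir_out or "", "PAGE_0001") + ".xml")  # -> PAGE_0001.xml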
BIN
tests/resources/euler_rechenkunst01_1738_0025.tif
Normal file
Binary file not shown.