diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..562fb6f
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,6 @@
+tests
+dist
+build
+env*
+*.egg-info
+models_eynollah*
diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-docker.yml
new file mode 100644
index 0000000..d77958b
--- /dev/null
+++ b/.github/workflows/build-docker.yml
@@ -0,0 +1,44 @@
+name: CD
+
+on:
+  push:
+    branches: [ "master" ]
+  workflow_dispatch: # run manually
+
+jobs:
+
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+      packages: write
+      contents: read
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # we need tags for docker version tagging
+          fetch-tags: true
+          fetch-depth: 0
+      - # Activate cache export feature to reduce build time of images
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERIO_USERNAME }}
+          password: ${{ secrets.DOCKERIO_PASSWORD }}
+      - name: Build the Docker image
+        # build both tags at the same time
+        run: make docker DOCKER_TAG="docker.io/ocrd/eynollah -t ghcr.io/qurator-spk/eynollah"
+      - name: Test the Docker image
+        run: docker run --rm ocrd/eynollah ocrd-eynollah-segment -h
+      - name: Push to Docker Hub
+        run: docker push docker.io/ocrd/eynollah
+      - name: Push to GitHub Container Registry
+        run: docker push ghcr.io/qurator-spk/eynollah
diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml
index 479c371..59503aa 100644
--- a/.github/workflows/test-eynollah.yml
+++ b/.github/workflows/test-eynollah.yml
@@ -16,18 +16,26 @@ jobs:
     steps:
     - name: clean up
       run: |
+        df -h
         sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /usr/local/lib/android
         sudo rm -rf /opt/ghc
         sudo rm -rf "/usr/local/share/boost"
         sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+        df -h
     - uses: actions/checkout@v4
     - uses: actions/cache@v4
-      id: model_cache
+      id: seg_model_cache
      with:
        path: models_eynollah
        key: ${{ runner.os }}-models
+    - uses: actions/cache@v4
+      id: bin_model_cache
+      with:
+        path: default-2021-03-09
+        key: ${{ runner.os }}-modelbin
     - name: Download models
-      if: steps.model_cache.outputs.cache-hit != 'true'
+      if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true'
       run: make models
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
@@ -36,9 +44,11 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install .[OCR,plotting]
-        pip install -r requirements-test.txt
+        make install EXTRAS=OCR,plotting
+        make deps-test
     - name: Test with pytest
       run: make test
-    - name: Test docker build
-      run: make docker
+    - name: Test standalone CLI
+      run: make smoke-test
+    - name: Test OCR-D CLI
+      run: make ocrd-test
diff --git a/Dockerfile b/Dockerfile
index 6780bc2..4785fc1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,23 +4,42 @@ FROM $DOCKER_BASE_IMAGE
 ARG VCS_REF
 ARG BUILD_DATE
 LABEL \
-    maintainer="https://ocr-d.de/kontakt" \
+    maintainer="https://ocr-d.de/en/contact" \
     org.label-schema.vcs-ref=$VCS_REF \
     org.label-schema.vcs-url="https://github.com/qurator-spk/eynollah" \
-    org.label-schema.build-date=$BUILD_DATE
+    org.label-schema.build-date=$BUILD_DATE \
+    org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
+    org.opencontainers.image.title="Eynollah" \
+    org.opencontainers.image.description="" \
+    org.opencontainers.image.source="https://github.com/qurator-spk/eynollah" \
+    org.opencontainers.image.documentation="https://github.com/qurator-spk/eynollah/blob/${VCS_REF}/README.md" \
+    org.opencontainers.image.revision=$VCS_REF \
+    org.opencontainers.image.created=$BUILD_DATE \
+    org.opencontainers.image.base.name=ocrd/core-cuda-tf2

 ENV DEBIAN_FRONTEND=noninteractive
+# set proper locales
 ENV PYTHONIOENCODING=utf8
-ENV XDG_DATA_HOME=/usr/local/share
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8

-WORKDIR /build-eynollah
-COPY src/ ./src
-COPY pyproject.toml .
-COPY requirements.txt .
-COPY README.md .
-COPY Makefile .
-RUN apt-get install -y --no-install-recommends g++
-RUN make install
+# avoid HOME/.local/share (hard to predict USER here)
+# so let XDG_DATA_HOME coincide with fixed system location
+# (can still be overridden by derived stages)
+ENV XDG_DATA_HOME /usr/local/share
+# avoid the need for an extra volume for persistent resource user db
+# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
+ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
+
+WORKDIR /build/eynollah
+COPY . .
+COPY ocrd-tool.json .
+# prepackage ocrd-tool.json as ocrd-all-tool.json
+RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
+# install everything and reduce image size
+RUN make install EXTRAS=OCR && rm -rf /build/eynollah
+# smoke test
+RUN eynollah --help

 WORKDIR /data
 VOLUME /data
diff --git a/Makefile b/Makefile
index 506fcf7..27eb872 100644
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,20 @@
-EYNOLLAH_MODELS ?= $(PWD)/models_eynollah
-export EYNOLLAH_MODELS
+PYTHON ?= python3
+PIP ?= pip3
+EXTRAS ?=

 # DOCKER_BASE_IMAGE = artefakt.dev.sbb.berlin:5000/sbb/ocrd_core:v2.68.0
-DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.68.0
+DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda-tf2:v3.3.0
 DOCKER_TAG = ocrd/eynollah

+#SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz
+#SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz
+SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz
+#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
+#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz
+
+BIN_MODEL := https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip
+
+PYTEST_ARGS ?=

 # BEGIN-EVAL makefile-parser --make-help Makefile
@@ -12,44 +22,90 @@ help:
	@echo ""
	@echo "  Targets"
	@echo ""
-	@echo "    models       Download and extract models to $(PWD)/models_eynollah"
-	@echo "    install      Install with pip"
+	@echo "    docker       Build Docker image"
+	@echo "    build        Build Python source and binary distribution"
+	@echo "    install      Install package with pip"
	@echo "    install-dev  Install editable with pip"
+	@echo "    deps-test    Install test dependencies with pip"
+	@echo "    models       Download and extract models to $(CURDIR)/models_eynollah"
+	@echo "    smoke-test   Run simple CLI check"
+	@echo "    ocrd-test    Run OCR-D CLI check"
	@echo "    test         Run unit tests"
	@echo ""
	@echo "  Variables"
+	@echo "    EXTRAS       comma-separated list of features (like 'OCR,plotting') for 'install' [$(EXTRAS)]"
+	@echo "    DOCKER_TAG   Docker image tag for 'docker' [$(DOCKER_TAG)]"
+	@echo "    PYTEST_ARGS  pytest args for 'test' (set to '-s' to see log output during test execution, '-vv' to see individual tests) [$(PYTEST_ARGS)]"
+	@echo "    SEG_MODEL    URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]"
+	@echo "    BIN_MODEL    URL of 'models' archive to download for binarization 'test' [$(BIN_MODEL)]"
	@echo ""

 # END-EVAL

 # Download and extract models to $(PWD)/models_eynollah
-models: models_eynollah
+models: models_eynollah default-2021-03-09

 models_eynollah: models_eynollah.tar.gz
-	tar xf models_eynollah.tar.gz
+	tar zxf models_eynollah.tar.gz

 models_eynollah.tar.gz:
-	# wget 'https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz'
-	# wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz'
-	wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz'
-	# wget 'https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz'
-	# wget 'https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz'
+	wget $(SEG_MODEL)
+
+default-2021-03-09: $(notdir $(BIN_MODEL))
+	unzip $(notdir $(BIN_MODEL))
+	mkdir $@
+	mv $(basename $(notdir $(BIN_MODEL))) $@
+
+$(notdir $(BIN_MODEL)):
+	wget $(BIN_MODEL)
+
+build:
+	$(PIP) install build
+	$(PYTHON) -m build .

 # Install with pip
 install:
-	pip install .
+	$(PIP) install .$(and $(EXTRAS),[$(EXTRAS)])

 # Install editable with pip
 install-dev:
-	pip install -e .
+	$(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)])
+
+deps-test: models_eynollah
+	$(PIP) install -r requirements-test.txt
+
+smoke-test: TMPDIR != mktemp -d
+smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif
+	# layout analysis:
+	eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/models_eynollah
+	fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
-ocrd >= 2.23.3
+ocrd >= 3.3.0
 numpy <1.24.0
 scikit-learn >= 0.23.2
 tensorflow < 2.13
diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py
index c306ac5..ecdfa3a 100644
--- a/src/eynollah/cli.py
+++ b/src/eynollah/cli.py
@@ -1,6 +1,6 @@
 import sys
 import click
-from ocrd_utils import initLogging, setOverrideLogLevel
+from ocrd_utils import initLogging, getLevelName, getLogger
 from eynollah.eynollah import Eynollah, Eynollah_ocr
 from eynollah.sbb_binarize import SbbBinarizer
@@ -15,21 +15,18 @@ def main():
     help="directory of GT page-xml files",
     type=click.Path(exists=True, file_okay=False),
 )
-
 @click.option(
     "--dir_out_modal_image",
     "-domi",
     help="directory where ground truth images would be written",
     type=click.Path(exists=True, file_okay=False),
 )
-
 @click.option(
     "--dir_out_classes",
     "-docl",
     help="directory where ground truth classes would be written",
     type=click.Path(exists=True, file_okay=False),
 )
-
 @click.option(
     "--input_height",
     "-ih",
@@ -45,17 +42,13 @@ def main():
     "-min",
     help="min area size of regions considered for reading order training.",
 )
-
 def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width, min_area_size):
     xml_files_ind = os.listdir(dir_xml)
-
+
 @main.command()
 @click.option('--patches/--no-patches', default=True, help='by enabling this parameter you let the model to see the image in patches.')
-
 @click.option('--model_dir', '-m', type=click.Path(exists=True, file_okay=False), required=True, help='directory containing models for prediction')
-
 @click.argument('input_image')
-
 @click.argument('output_image')
 @click.option(
     "--dir_in",
@@ -69,7 +62,6 @@ def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width, min_area_size):
     help="directory where the binarized images will be written",
     type=click.Path(exists=True,
file_okay=False), ) - def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out): if not dir_out and (dir_in): print("Error: You used -di but did not set -do") @@ -78,10 +70,10 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) print("Error: You used -do to write out binarized images but have not set -di") sys.exit(1) SbbBinarizer(model_dir).run(image_path=input_image, use_patches=patches, save=output_image, dir_in=dir_in, dir_out=dir_out) - - - - + + + + @main.command() @click.option( "--image", @@ -264,25 +256,37 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) if not enable_plotting and (save_layout or save_deskewed or save_all or save_page or save_images or allow_enhancement): - print("Error: You used one of -sl, -sd, -sa, -sp, -si or -ae but did not enable plotting with -ep") - sys.exit(1) + raise ValueError("Plotting with -sl, -sd, -sa, -sp, -si or -ae also requires -ep") elif enable_plotting and not (save_layout or save_deskewed or save_all or save_page or save_images or allow_enhancement): - print("Error: You used -ep to enable plotting but set none of -sl, -sd, -sa, -sp, -si or -ae") - sys.exit(1) + raise ValueError("Plotting with -ep also requires -sl, -sd, -sa, -sp, -si or -ae") if textline_light and not light_version: - print('Error: You used -tll to enable light textline detection but -light is not enabled') - sys.exit(1) + raise ValueError("Light textline detection with -tll also requires -light") if light_version and not textline_light: - print('Error: You used -light without -tll. Light version need light textline to be enabled.') - if extract_only_images and (allow_enhancement or allow_scaling or light_version or curved_line or textline_light or full_layout or tables or right2left or headers_off) : - print('Error: You used -eoi which can not be enabled alongside light_version -light or allow_scaling -as or allow_enhancement -ae or curved_line -cl or textline_light -tll or full_layout -fl or tables -tab or right2left -r2l or headers_off -ho') - sys.exit(1) + raise ValueError("Light version with -light also requires light textline detection -tll") + if extract_only_images and allow_enhancement: + raise ValueError("Image extraction with -eoi can not be enabled alongside allow_enhancement -ae") + if extract_only_images and allow_scaling: + raise ValueError("Image extraction with -eoi can not be enabled alongside allow_scaling -as") + if extract_only_images and light_version: + raise ValueError("Image extraction with -eoi can not be enabled alongside light_version -light") + if extract_only_images and curved_line: + raise ValueError("Image extraction with -eoi can not be enabled alongside curved_line -cl") + if extract_only_images and textline_light: + raise ValueError("Image extraction with -eoi can not be enabled alongside textline_light -tll") + if extract_only_images and full_layout: + raise ValueError("Image extraction with -eoi can not be enabled alongside full_layout -fl") + if extract_only_images and tables: + raise ValueError("Image extraction with -eoi can not be enabled alongside tables -tab") + if extract_only_images and right2left: + raise ValueError("Image extraction with -eoi can not be enabled alongside right2left -r2l") + if extract_only_images and headers_off: + raise ValueError("Image extraction with -eoi can not be enabled alongside headers_off -ho") + if image is None and dir_in is None: + raise 
ValueError("Either a single image -i or a dir_in -di is required") eynollah = Eynollah( - image_filename=image, - overwrite=overwrite, + model, + logger=getLogger('eynollah'), dir_out=out, - dir_in=dir_in, - dir_models=model, dir_of_cropped_images=save_images, extract_only_images=extract_only_images, dir_of_layout=save_layout, @@ -308,12 +312,11 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ skip_layout_and_reading_order=skip_layout_and_reading_order, ) if dir_in: - eynollah.run() + eynollah.run(dir_in=dir_in, overwrite=overwrite) else: - pcgts = eynollah.run() - eynollah.writer.write_pagexml(pcgts) - - + eynollah.run(image_filename=image, overwrite=overwrite) + + @main.command() @click.option( "--dir_in", @@ -367,9 +370,9 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ ) def ocr(dir_in, out, dir_xmls, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, log_level): - if log_level: - setOverrideLogLevel(log_level) initLogging() + if log_level: + getLogger('eynollah').setLevel(getLevelName(log_level)) eynollah_ocr = Eynollah_ocr( dir_xmls=dir_xmls, dir_in=dir_in, diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 7acee39..ef8bcc6 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -6,47 +6,57 @@ document layout analysis (segmentation) with output in PAGE-XML """ -import tracemalloc +from logging import Logger +from difflib import SequenceMatcher as sq import math import os import sys import time +from typing import Optional import atexit import warnings from functools import partial from pathlib import Path from multiprocessing import cpu_count -from loky import ProcessPoolExecutor import gc -from ocrd_utils import getLogger +import copy +import json + +from loky import ProcessPoolExecutor +from PIL.Image import Image +import xml.etree.ElementTree as ET import cv2 import numpy as np -from transformers import TrOCRProcessor -from PIL import Image -import torch -from difflib import SequenceMatcher as sq -from transformers import VisionEncoderDecoderModel -from numba import cuda -import copy from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d +from numba import cuda + +from ocrd import OcrdPage +from ocrd_utils import getLogger, tf_disable_interactive_logs + +try: + import torch +except ImportError: + torch = None +try: + import matplotlib.pyplot as plt +except ImportError: + plt = None +try: + from transformers import TrOCRProcessor, VisionEncoderDecoderModel +except ImportError: + TrOCRProcessor = VisionEncoderDecoderModel = None -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' -stderr = sys.stderr -sys.stderr = open(os.devnull, "w") +tf_disable_interactive_logs() import tensorflow as tf from tensorflow.python.keras import backend as K from tensorflow.keras.models import load_model -sys.stderr = stderr tf.get_logger().setLevel("ERROR") warnings.filterwarnings("ignore") -import matplotlib.pyplot as plt # use tf1 compatibility for keras backend from tensorflow.compat.v1.keras.backend import set_session from tensorflow.keras import layers -import json -import xml.etree.ElementTree as ET from tensorflow.keras.layers import StringLookup from .utils.contour import ( @@ -138,8 +148,8 @@ class Patches(layers.Layer): 'patch_size': self.patch_size, }) return config - - + + class PatchEncoder(layers.Layer): def __init__(self, **kwargs): super(PatchEncoder, self).__init__() @@ -166,54 +176,37 @@ 
class PatchEncoder(layers.Layer): class Eynollah: def __init__( self, - dir_models, - image_filename=None, - image_pil=None, - image_filename_stem=None, - overwrite=False, - dir_out=None, - dir_in=None, - dir_of_cropped_images=None, - extract_only_images=False, - dir_of_layout=None, - dir_of_deskewed=None, - dir_of_all=None, - dir_save_page=None, - enable_plotting=False, - allow_enhancement=False, - curved_line=False, - textline_light=False, - full_layout=False, - tables=False, - right2left=False, - input_binary=False, - allow_scaling=False, - headers_off=False, - light_version=False, - ignore_page_extraction=False, - reading_order_machine_based=False, - do_ocr=False, - num_col_upper=None, - num_col_lower=None, - skip_layout_and_reading_order = False, - override_dpi=None, - logger=None, - pcgts=None, + dir_models : str, + dir_out : Optional[str] = None, + dir_of_cropped_images : Optional[str] = None, + extract_only_images : bool =False, + dir_of_layout : Optional[str] = None, + dir_of_deskewed : Optional[str] = None, + dir_of_all : Optional[str] = None, + dir_save_page : Optional[str] = None, + enable_plotting : bool = False, + allow_enhancement : bool = False, + curved_line : bool = False, + textline_light : bool = False, + full_layout : bool = False, + tables : bool = False, + right2left : bool = False, + input_binary : bool = False, + allow_scaling : bool = False, + headers_off : bool = False, + light_version : bool = False, + ignore_page_extraction : bool = False, + reading_order_machine_based : bool = False, + do_ocr : bool = False, + num_col_upper : Optional[int] = None, + num_col_lower : Optional[int] = None, + skip_layout_and_reading_order : bool = False, + logger : Logger = None, ): if skip_layout_and_reading_order: textline_light = True self.light_version = light_version - if not dir_in: - if image_pil: - self._imgs = self._cache_images(image_pil=image_pil) - else: - self._imgs = self._cache_images(image_filename=image_filename) - if override_dpi: - self.dpi = override_dpi - self.image_filename = image_filename - self.overwrite = overwrite self.dir_out = dir_out - self.dir_in = dir_in self.dir_of_all = dir_of_all self.dir_save_page = dir_save_page self.reading_order_machine_based = reading_order_machine_based @@ -244,22 +237,6 @@ class Eynollah: self.num_col_lower = int(num_col_lower) else: self.num_col_lower = num_col_lower - self.pcgts = pcgts - if not dir_in: - self.plotter = None if not enable_plotting else EynollahPlotter( - dir_out=self.dir_out, - dir_of_all=dir_of_all, - dir_save_page=dir_save_page, - dir_of_deskewed=dir_of_deskewed, - dir_of_cropped_images=dir_of_cropped_images, - dir_of_layout=dir_of_layout, - image_filename_stem=Path(Path(image_filename).name).stem) - self.writer = EynollahXmlWriter( - dir_out=self.dir_out, - image_filename=self.image_filename, - curved_line=self.curved_line, - textline_light = self.textline_light, - pcgts=pcgts) self.logger = logger if logger else getLogger('eynollah') # for parallelization of CPU-intensive tasks: self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) @@ -311,52 +288,54 @@ class Eynollah: self.model_textline_dir = dir_models + "/modelens_textline_0_1__2_4_16092024" if self.ocr: self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" - if self.tables: if self.light_version: self.model_table_dir = dir_models + "/modelens_table_0t4_201124" else: self.model_table_dir = dir_models + "/eynollah-tables_20210319" - - self.models = {} - - if dir_in: - # as in start_new_session: - config 
= tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True - session = tf.compat.v1.Session(config=config) - set_session(session) - - self.model_page = self.our_load_model(self.model_page_dir) - self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier) - self.model_bin = self.our_load_model(self.model_dir_of_binarization) - if self.extract_only_images: - self.model_region = self.our_load_model(self.model_region_dir_p_ens_light_only_images_extraction) + + # #gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) + # #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True) + # #session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + # config = tf.compat.v1.ConfigProto() + # config.gpu_options.allow_growth = True + # #session = tf.InteractiveSession() + # session = tf.compat.v1.Session(config=config) + # set_session(session) + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + self.logger.warning("no GPU device available") + + self.model_page = self.our_load_model(self.model_page_dir) + self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier) + self.model_bin = self.our_load_model(self.model_dir_of_binarization) + if self.extract_only_images: + self.model_region = self.our_load_model(self.model_region_dir_p_ens_light_only_images_extraction) + else: + self.model_textline = self.our_load_model(self.model_textline_dir) + if self.light_version: + self.model_region = self.our_load_model(self.model_region_dir_p_ens_light) + self.model_region_1_2 = self.our_load_model(self.model_region_dir_p_1_2_sp_np) else: - self.model_textline = self.our_load_model(self.model_textline_dir) - if self.light_version: - self.model_region = self.our_load_model(self.model_region_dir_p_ens_light) - self.model_region_1_2 = self.our_load_model(self.model_region_dir_p_1_2_sp_np) - else: - self.model_region = self.our_load_model(self.model_region_dir_p_ens) - self.model_region_p2 = self.our_load_model(self.model_region_dir_p2) - self.model_enhancement = self.our_load_model(self.model_dir_of_enhancement) - ###self.model_region_fl_new = self.our_load_model(self.model_region_dir_fully_new) - self.model_region_fl_np = self.our_load_model(self.model_region_dir_fully_np) - self.model_region_fl = self.our_load_model(self.model_region_dir_fully) - if self.reading_order_machine_based: - self.model_reading_order = self.our_load_model(self.model_reading_order_dir) - if self.ocr: - self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - #("microsoft/trocr-base-printed")#("microsoft/trocr-base-handwritten") - self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") - if self.tables: - self.model_table = self.our_load_model(self.model_table_dir) - - self.ls_imgs = os.listdir(self.dir_in) + self.model_region = self.our_load_model(self.model_region_dir_p_ens) + self.model_region_p2 = self.our_load_model(self.model_region_dir_p2) + self.model_enhancement = self.our_load_model(self.model_dir_of_enhancement) + ###self.model_region_fl_new = self.our_load_model(self.model_region_dir_fully_new) + self.model_region_fl_np = self.our_load_model(self.model_region_dir_fully_np) + self.model_region_fl = self.our_load_model(self.model_region_dir_fully) + if self.reading_order_machine_based: + self.model_reading_order = 
self.our_load_model(self.model_reading_order_dir) + if self.ocr: + self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + #("microsoft/trocr-base-printed")#("microsoft/trocr-base-handwritten") + self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") + if self.tables: + self.model_table = self.our_load_model(self.model_table_dir) - def _cache_images(self, image_filename=None, image_pil=None): + def cache_images(self, image_filename=None, image_pil=None, dpi=None): ret = {} t_c0 = time.time() if image_filename: @@ -374,13 +353,14 @@ class Eynollah: ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY) for prefix in ('', '_grayscale'): ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8) - return ret + self._imgs = ret + if dpi is not None: + self.dpi = dpi def reset_file_name_dir(self, image_filename): t_c = time.time() - self._imgs = self._cache_images(image_filename=image_filename) - self.image_filename = image_filename - + self.cache_images(image_filename=image_filename) + self.plotter = None if not self.enable_plotting else EynollahPlotter( dir_out=self.dir_out, dir_of_all=self.dir_of_all, @@ -389,13 +369,12 @@ class Eynollah: dir_of_cropped_images=self.dir_of_cropped_images, dir_of_layout=self.dir_of_layout, image_filename_stem=Path(Path(image_filename).name).stem) - + self.writer = EynollahXmlWriter( dir_out=self.dir_out, - image_filename=self.image_filename, + image_filename=image_filename, curved_line=self.curved_line, - textline_light = self.textline_light, - pcgts=self.pcgts) + textline_light = self.textline_light) def imread(self, grayscale=False, uint8=True): key = 'img' @@ -410,8 +389,6 @@ class Eynollah: def predict_enhancement(self, img): self.logger.debug("enter predict_enhancement") - if not self.dir_in: - self.model_enhancement, _ = self.start_new_session_and_model(self.model_dir_of_enhancement) img_height_model = self.model_enhancement.layers[-1].output_shape[1] img_width_model = self.model_enhancement.layers[-1].output_shape[2] @@ -608,9 +585,6 @@ class Eynollah: img = self.imread() _, page_coord = self.early_page_for_num_of_column_classification(img) - - if not self.dir_in: - self.model_classifier, _ = self.start_new_session_and_model(self.model_dir_of_col_classifier) if self.input_binary: img_in = np.copy(img) @@ -651,9 +625,6 @@ class Eynollah: self.logger.info("Detected %s DPI", dpi) if self.input_binary: img = self.imread() - if not self.dir_in: - self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization) - prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5) prediction_bin = 255 * (prediction_bin[:,:,0]==0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8) @@ -662,17 +633,14 @@ class Eynollah: else: img = self.imread() img_bin = None - + width_early = img.shape[1] t1 = time.time() _, page_coord = self.early_page_for_num_of_column_classification(img_bin) - + self.image_page_org_size = img[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :] self.page_coord = page_coord - - if not self.dir_in: - self.model_classifier, _ = self.start_new_session_and_model(self.model_dir_of_col_classifier) - + if self.num_col_upper and not self.num_col_lower: num_col = self.num_col_upper label_p_pred = [np.ones(6)] @@ -812,43 +780,6 @@ class Eynollah: self.writer.height_org = self.height_org self.writer.width_org = 
self.width_org - def start_new_session_and_model_old(self, model_dir): - self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir) - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - - session = tf.InteractiveSession() - model = load_model(model_dir, compile=False) - - return model, session - - def start_new_session_and_model(self, model_dir): - self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir) - #gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) - #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True) - #session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) - physical_devices = tf.config.list_physical_devices('GPU') - try: - for device in physical_devices: - tf.config.experimental.set_memory_growth(device, True) - except: - self.logger.warning("no GPU device available") - - if model_dir.endswith('.h5') and Path(model_dir[:-3]).exists(): - # prefer SavedModel over HDF5 format if it exists - model_dir = model_dir[:-3] - if model_dir in self.models: - model = self.models[model_dir] - else: - try: - model = load_model(model_dir, compile=False) - except: - model = load_model(model_dir , compile=False, custom_objects={ - "PatchEncoder": PatchEncoder, "Patches": Patches}) - self.models[model_dir] = model - - return model, None - def do_prediction( self, patches, img, model, n_batch_inference=1, marginal_of_patch_percent=0.1, @@ -1386,9 +1317,6 @@ class Eynollah: self.logger.debug("enter extract_page") cont_page = [] if not self.ignore_page_extraction: - if not self.dir_in: - self.model_page, _ = self.start_new_session_and_model(self.model_page_dir) - img = cv2.GaussianBlur(self.image, (5, 5), 0) img_page_prediction = self.do_prediction(False, img, self.model_page) imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) @@ -1436,8 +1364,6 @@ class Eynollah: img = np.copy(img_bin).astype(np.uint8) else: img = self.imread() - if not self.dir_in: - self.model_page, _ = self.start_new_session_and_model(self.model_page_dir) img = cv2.GaussianBlur(img, (5, 5), 0) img_page_prediction = self.do_prediction(False, img, self.model_page) @@ -1465,11 +1391,6 @@ class Eynollah: self.logger.debug("enter extract_text_regions") img_height_h = img.shape[0] img_width_h = img.shape[1] - if not self.dir_in: - if patches: - self.model_region_fl, _ = self.start_new_session_and_model(self.model_region_dir_fully) - else: - self.model_region_fl_np, _ = self.start_new_session_and_model(self.model_region_dir_fully_np) model_region = self.model_region_fl if patches else self.model_region_fl_np if self.light_version: @@ -1501,11 +1422,6 @@ class Eynollah: self.logger.debug("enter extract_text_regions") img_height_h = img.shape[0] img_width_h = img.shape[1] - if not self.dir_in: - if patches: - self.model_region_fl, _ = self.start_new_session_and_model(self.model_region_dir_fully) - else: - self.model_region_fl_np, _ = self.start_new_session_and_model(self.model_region_dir_fully_np) model_region = self.model_region_fl if patches else self.model_region_fl_np if not patches: @@ -1636,8 +1552,6 @@ class Eynollah: def textline_contours(self, img, use_patches, scaler_h, scaler_w, num_col_classifier=None): self.logger.debug('enter textline_contours') - if not self.dir_in: - self.model_textline, _ = self.start_new_session_and_model(self.model_textline_dir) #img = img.astype(np.uint8) img_org = np.copy(img) @@ -1739,9 +1653,6 @@ class Eynollah: img_h_new = int(img.shape[0] / 
float(img.shape[1]) * img_w_new) img_resized = resize_image(img,img_h_new, img_w_new ) - if not self.dir_in: - self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens_light_only_images_extraction) - prediction_regions_org = self.do_prediction_new_concept(True, img_resized, self.model_region) prediction_regions_org = resize_image(prediction_regions_org,img_height_h, img_width_h ) @@ -1830,7 +1741,6 @@ class Eynollah: img_height_h = img_org.shape[0] img_width_h = img_org.shape[1] - #model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens) #print(num_col_classifier,'num_col_classifier') if num_col_classifier == 1: @@ -1853,8 +1763,6 @@ class Eynollah: #if self.input_binary: #img_bin = np.copy(img_resized) ###if (not self.input_binary and self.full_layout) or (not self.input_binary and num_col_classifier >= 30): - ###if not self.dir_in: - ###self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization) ###prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5) ####print("inside bin ", time.time()-t_bin) @@ -1870,8 +1778,6 @@ class Eynollah: ###else: ###img_bin = np.copy(img_resized) if self.ocr and not self.input_binary: - if not self.dir_in: - self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization) prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5) prediction_bin = 255 * (prediction_bin[:,:,0] == 0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2) @@ -1894,12 +1800,7 @@ class Eynollah: #plt.show() if not skip_layout_and_reading_order: #print("inside 2 ", time.time()-t_in) - if not self.dir_in: - self.model_region_1_2, _ = self.start_new_session_and_model(self.model_region_dir_p_1_2_sp_np) - ##self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens_light) - if num_col_classifier == 1 or num_col_classifier == 2: - model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_1_2_sp_np) if self.image_org.shape[0]/self.image_org.shape[1] > 2.5: self.logger.debug("resized to %dx%d for %d cols", img_resized.shape[1], img_resized.shape[0], num_col_classifier) @@ -1997,9 +1898,6 @@ class Eynollah: img_org = np.copy(img) img_height_h = img_org.shape[0] img_width_h = img_org.shape[1] - - if not self.dir_in: - self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens) ratio_y=1.3 ratio_x=1 @@ -2026,11 +1924,8 @@ class Eynollah: prediction_regions_org=prediction_regions_org[:,:,0] prediction_regions_org[(prediction_regions_org[:,:]==1) & (mask_zeros_y[:,:]==1)]=0 - if not self.dir_in: - self.model_region_p2, _ = self.start_new_session_and_model(self.model_region_dir_p2) - img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1])) - + prediction_regions_org2 = self.do_prediction(True, img, self.model_region_p2, marginal_of_patch_percent=0.2) prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) @@ -2055,15 +1950,11 @@ class Eynollah: if self.input_binary: prediction_bin = np.copy(img_org) else: - if not self.dir_in: - self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization) prediction_bin = self.do_prediction(True, img_org, self.model_bin, n_batch_inference=5) prediction_bin = resize_image(prediction_bin, img_height_h, img_width_h ) prediction_bin = 255 * (prediction_bin[:,:,0]==0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, 
axis=2) - - if not self.dir_in: - self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens) + ratio_y=1 ratio_x=1 @@ -2096,17 +1987,10 @@ class Eynollah: except: if self.input_binary: prediction_bin = np.copy(img_org) - - if not self.dir_in: - self.model_bin, _ = self.start_new_session_and_model(self.model_dir_of_binarization) prediction_bin = self.do_prediction(True, img_org, self.model_bin, n_batch_inference=5) prediction_bin = resize_image(prediction_bin, img_height_h, img_width_h ) prediction_bin = 255 * (prediction_bin[:,:,0]==0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2) - - if not self.dir_in: - self.model_region, _ = self.start_new_session_and_model(self.model_region_dir_p_ens) - else: prediction_bin = np.copy(img_org) ratio_y=1 @@ -2736,10 +2620,6 @@ class Eynollah: img_org = np.copy(img) img_height_h = img_org.shape[0] img_width_h = img_org.shape[1] - - if not self.dir_in: - self.model_table, _ = self.start_new_session_and_model(self.model_table_dir) - patches = False if self.light_version: prediction_table = self.do_prediction_new_concept(patches, img, self.model_table) @@ -3375,8 +3255,12 @@ class Eynollah: return (polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, regions_fully, regions_without_separators, polygons_of_marginals, contours_tables) - - def our_load_model(self, model_file): + + @staticmethod + def our_load_model(model_file): + if model_file.endswith('.h5') and Path(model_file[:-3]).exists(): + # prefer SavedModel over HDF5 format if it exists + model_file = model_file[:-3] try: model = load_model(model_file, compile=False) except: @@ -3427,9 +3311,6 @@ class Eynollah: img_header_and_sep = resize_image(img_header_and_sep, height1, width1) img_poly = resize_image(img_poly, height3, width3) - if not self.dir_in: - self.model_reading_order, _ = self.start_new_session_and_model(self.model_reading_order_dir) - inference_bs = 3 input_1 = np.zeros((inference_bs, height1, width1, 3)) ordered = [list(range(len(co_text_all)))] @@ -3730,7 +3611,7 @@ class Eynollah: for ij in range(len(all_found_textline_polygons[j])): con_ind = all_found_textline_polygons[j][ij] area = cv2.contourArea(con_ind) - con_ind = con_ind.astype(np.float) + con_ind = con_ind.astype(float) x_differential = np.diff( con_ind[:,0,0]) y_differential = np.diff( con_ind[:,0,1]) @@ -3834,7 +3715,7 @@ class Eynollah: con_ind = all_found_textline_polygons[j] #print(len(con_ind[:,0,0]),'con_ind[:,0,0]') area = cv2.contourArea(con_ind) - con_ind = con_ind.astype(np.float) + con_ind = con_ind.astype(float) x_differential = np.diff( con_ind[:,0,0]) y_differential = np.diff( con_ind[:,0,1]) @@ -3937,7 +3818,7 @@ class Eynollah: con_ind = all_found_textline_polygons[j][ij] area = cv2.contourArea(con_ind) - con_ind = con_ind.astype(np.float) + con_ind = con_ind.astype(float) x_differential = np.diff( con_ind[:,0,0]) y_differential = np.diff( con_ind[:,0,1]) @@ -4080,10 +3961,8 @@ class Eynollah: ind_textline_inside_tr = list(range(len(contours[jj]))) index_textline_inside_textregion = index_textline_inside_textregion + ind_textline_inside_tr - #ind_ins = [0] * len(contours[jj]) + jj - ind_ins = np.zeros( len(contours[jj]) ) + jj - list_ind_ins = list(ind_ins) - indexes_of_textline_tot = indexes_of_textline_tot + list_ind_ins + ind_ins = [jj] * len(contours[jj]) + indexes_of_textline_tot = indexes_of_textline_tot + ind_ins M_main_tot = [cv2.moments(contours_txtline_of_all_textregions[j]) for j in 
range(len(contours_txtline_of_all_textregions))]
@@ -4171,7 +4050,7 @@ class Eynollah:
         for j in range(len(all_found_textline_polygons)):
             for i in range(len(all_found_textline_polygons[j])):
                 con_ind = all_found_textline_polygons[j][i]
-                con_ind = con_ind.astype(np.float)
+                con_ind = con_ind.astype(float)
                 x_differential = np.diff( con_ind[:,0,0])
                 y_differential = np.diff( con_ind[:,0,1])
@@ -4311,633 +4190,607 @@ class Eynollah:
         return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem, contours_only_text_parent_rem, index_by_text_par_con_rem_sort)

-    def run(self):
+    def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False):
         """
         Get image and scales, then extract the page of scanned image
         """
         self.logger.debug("enter run")
         t0_tot = time.time()
-        if not self.dir_in:
-            self.ls_imgs = [1]
-
-        for img_name in self.ls_imgs:
-            self.logger.info(img_name)
+        if dir_in:
+            self.ls_imgs = os.listdir(dir_in)
+        elif image_filename:
+            self.ls_imgs = [image_filename]
+        else:
+            raise ValueError("run requires either a single image filename or a directory")
+
+        for img_filename in self.ls_imgs:
+            self.logger.info(img_filename)
             t0 = time.time()
-            if self.dir_in:
-                self.reset_file_name_dir(os.path.join(self.dir_in,img_name))
-            #print("text region early -11 in %.1fs", time.time() - t0)
+            self.reset_file_name_dir(os.path.join(dir_in or "", img_filename))
+            #print("text region early -11 in %.1fs", time.time() - t0)
             if os.path.exists(self.writer.output_filename):
-                if self.overwrite:
+                if overwrite:
                     self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename)
                 else:
                     self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename)
                     continue
-
-            img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
-            self.logger.info("Enhancing took %.1fs ", time.time() - t0)
-            if self.extract_only_images:
-                text_regions_p_1, erosion_hurts, polygons_lines_xml, polygons_of_images, image_page, page_coord, cont_page = \
-                    self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier)
-                ocr_all_textlines = None
-                pcgts = self.writer.build_pagexml_no_full_layout(
-                    [], page_coord, [], [], [], [],
-                    polygons_of_images, [], [], [], [], [],
-                    cont_page, [], [], ocr_all_textlines)
-                if self.plotter:
-                    self.plotter.write_images_into_directory(polygons_of_images, image_page)
-                if self.dir_in:
-                    self.writer.write_pagexml(pcgts)
-                    continue
-                else:
-                    return pcgts
+            pcgts = self.run_single()
+            self.logger.info("Job done in %.1fs", time.time() - t0)
+            #print("Job done in %.1fs" % (time.time() - t0))
+            self.writer.write_pagexml(pcgts)

-            if self.skip_layout_and_reading_order:
-                _ ,_, _, textline_mask_tot_ea, img_bin_light = \
-                    self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier,
-                                             skip_layout_and_reading_order=self.skip_layout_and_reading_order)
+        if dir_in:
+            self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
+            print("All jobs done in %.1fs" % (time.time() - t0_tot))

-                page_coord, image_page, textline_mask_tot_ea, img_bin_light, cont_page = \
-                    self.run_graphics_and_columns_without_layout(textline_mask_tot_ea, img_bin_light)
+    def run_single(self):
+        t0 = time.time()
+        img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
+        self.logger.info("Enhancing took %.1fs ", time.time() - t0)
+        if self.extract_only_images:
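+            # -eoi mode: skip layout analysis; only image regions are detected and written to an otherwise empty PAGE-XML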
text_regions_p_1, erosion_hurts, polygons_lines_xml, polygons_of_images, image_page, page_coord, cont_page = \ + self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) + ocr_all_textlines = None + pcgts = self.writer.build_pagexml_no_full_layout( + [], page_coord, [], [], [], [], + polygons_of_images, [], [], [], [], [], + cont_page, [], [], ocr_all_textlines) + if self.plotter: + self.plotter.write_images_into_directory(polygons_of_images, image_page) + return pcgts + if self.skip_layout_and_reading_order: + _ ,_, _, textline_mask_tot_ea, img_bin_light = \ + self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier, + skip_layout_and_reading_order=self.skip_layout_and_reading_order) - ##all_found_textline_polygons =self.scale_contours_new(textline_mask_tot_ea) + page_coord, image_page, textline_mask_tot_ea, img_bin_light, cont_page = \ + self.run_graphics_and_columns_without_layout(textline_mask_tot_ea, img_bin_light) - cnt_clean_rot_raw, hir_on_cnt_clean_rot = return_contours_of_image(textline_mask_tot_ea) - all_found_textline_polygons = filter_contours_area_of_image( - textline_mask_tot_ea, cnt_clean_rot_raw, hir_on_cnt_clean_rot, max_area=1, min_area=0.00001) - all_found_textline_polygons=[ all_found_textline_polygons ] + ##all_found_textline_polygons =self.scale_contours_new(textline_mask_tot_ea) - all_found_textline_polygons = self.dilate_textregions_contours_textline_version( - all_found_textline_polygons) - all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( - all_found_textline_polygons, textline_mask_tot_ea, type_contour="textline") + cnt_clean_rot_raw, hir_on_cnt_clean_rot = return_contours_of_image(textline_mask_tot_ea) + all_found_textline_polygons = filter_contours_area_of_image( + textline_mask_tot_ea, cnt_clean_rot_raw, hir_on_cnt_clean_rot, max_area=1, min_area=0.00001) + all_found_textline_polygons=[ all_found_textline_polygons ] - order_text_new = [0] - slopes =[0] - id_of_texts_tot =['region_0001'] + all_found_textline_polygons = self.dilate_textregions_contours_textline_version( + all_found_textline_polygons) + all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( + all_found_textline_polygons, textline_mask_tot_ea, type_contour="textline") - polygons_of_images = [] - slopes_marginals = [] - polygons_of_marginals = [] - all_found_textline_polygons_marginals = [] - all_box_coord_marginals = [] - polygons_lines_xml = [] - contours_tables = [] - ocr_all_textlines = None - pcgts = self.writer.build_pagexml_no_full_layout( - cont_page, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines) - if self.dir_in: - self.writer.write_pagexml(pcgts) - continue - else: - return pcgts - #print("text region early -1 in %.1fs", time.time() - t0) - t1 = time.time() - if self.light_version: - text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light = \ - self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier) - #print("text region early -2 in %.1fs", time.time() - t0) + order_text_new = [0] + slopes =[0] + id_of_texts_tot =['region_0001'] - if num_col_classifier == 1 or num_col_classifier ==2: - if num_col_classifier == 1: - img_w_new = 1000 - else: - img_w_new = 1300 - img_h_new = img_w_new * 
textline_mask_tot_ea.shape[0] // textline_mask_tot_ea.shape[1] + polygons_of_images = [] + slopes_marginals = [] + polygons_of_marginals = [] + all_found_textline_polygons_marginals = [] + all_box_coord_marginals = [] + polygons_lines_xml = [] + contours_tables = [] + ocr_all_textlines = None + pcgts = self.writer.build_pagexml_no_full_layout( + cont_page, page_coord, order_text_new, id_of_texts_tot, + all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, + all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines) + return pcgts - textline_mask_tot_ea_deskew = resize_image(textline_mask_tot_ea,img_h_new, img_w_new ) + #print("text region early -1 in %.1fs", time.time() - t0) + t1 = time.time() + if self.light_version: + text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light = \ + self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier) + #print("text region early -2 in %.1fs", time.time() - t0) - slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea_deskew) + if num_col_classifier == 1 or num_col_classifier ==2: + if num_col_classifier == 1: + img_w_new = 1000 else: - slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) - #print("text region early -2,5 in %.1fs", time.time() - t0) - #self.logger.info("Textregion detection took %.1fs ", time.time() - t1t) - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ - text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light = \ - self.run_graphics_and_columns_light(text_regions_p_1, textline_mask_tot_ea, - num_col_classifier, num_column_is_classified, erosion_hurts, img_bin_light) - #self.logger.info("run graphics %.1fs ", time.time() - t1t) - #print("text region early -3 in %.1fs", time.time() - t0) - textline_mask_tot_ea_org = np.copy(textline_mask_tot_ea) - #print("text region early -4 in %.1fs", time.time() - t0) - else: - text_regions_p_1 ,erosion_hurts, polygons_lines_xml = \ - self.get_regions_from_xy_2models(img_res, is_image_enhanced, - num_col_classifier) - self.logger.info("Textregion detection took %.1fs ", time.time() - t1) - - t1 = time.time() - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ - text_regions_p_1, cont_page, table_prediction = \ - self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, erosion_hurts) - self.logger.info("Graphics detection took %.1fs ", time.time() - t1) - #self.logger.info('cont_page %s', cont_page) - #plt.imshow(table_prediction) - #plt.show() + img_w_new = 1300 + img_h_new = img_w_new * textline_mask_tot_ea.shape[0] // textline_mask_tot_ea.shape[1] - if not num_col: - self.logger.info("No columns detected, outputting an empty PAGE-XML") - ocr_all_textlines = None - pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], [], [], [], [], [], [], - cont_page, [], [], ocr_all_textlines) - self.logger.info("Job done in %.1fs", time.time() - t1) - if self.dir_in: - self.writer.write_pagexml(pcgts) - continue - else: - return pcgts + textline_mask_tot_ea_deskew = resize_image(textline_mask_tot_ea,img_h_new, img_w_new ) - #print("text region early in %.1fs", time.time() - t0) - t1 = time.time() - if not self.light_version: - textline_mask_tot_ea = self.run_textline(image_page) - self.logger.info("textline detection took 
%.1fs", time.time() - t1) - t1 = time.time() + slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea_deskew) + else: slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) - self.logger.info("deskewing took %.1fs", time.time() - t1) - elif num_col_classifier in (1,2): - org_h_l_m = textline_mask_tot_ea.shape[0] - org_w_l_m = textline_mask_tot_ea.shape[1] - if num_col_classifier == 1: - img_w_new = 2000 - else: - img_w_new = 2400 - img_h_new = img_w_new * textline_mask_tot_ea.shape[0] // textline_mask_tot_ea.shape[1] + #print("text region early -2,5 in %.1fs", time.time() - t0) + #self.logger.info("Textregion detection took %.1fs ", time.time() - t1t) + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ + text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light = \ + self.run_graphics_and_columns_light(text_regions_p_1, textline_mask_tot_ea, + num_col_classifier, num_column_is_classified, erosion_hurts, img_bin_light) + #self.logger.info("run graphics %.1fs ", time.time() - t1t) + #print("text region early -3 in %.1fs", time.time() - t0) + textline_mask_tot_ea_org = np.copy(textline_mask_tot_ea) + #print("text region early -4 in %.1fs", time.time() - t0) + else: + text_regions_p_1 ,erosion_hurts, polygons_lines_xml = \ + self.get_regions_from_xy_2models(img_res, is_image_enhanced, + num_col_classifier) + self.logger.info("Textregion detection took %.1fs ", time.time() - t1) - image_page = resize_image(image_page,img_h_new, img_w_new ) - textline_mask_tot_ea = resize_image(textline_mask_tot_ea,img_h_new, img_w_new ) - mask_images = resize_image(mask_images,img_h_new, img_w_new ) - mask_lines = resize_image(mask_lines,img_h_new, img_w_new ) - text_regions_p_1 = resize_image(text_regions_p_1,img_h_new, img_w_new ) - table_prediction = resize_image(table_prediction,img_h_new, img_w_new ) - - textline_mask_tot, text_regions_p, image_page_rotated = \ - self.run_marginals(image_page, textline_mask_tot_ea, mask_images, mask_lines, - num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) - - if self.light_version and num_col_classifier in (1,2): - image_page = resize_image(image_page,org_h_l_m, org_w_l_m ) - textline_mask_tot_ea = resize_image(textline_mask_tot_ea,org_h_l_m, org_w_l_m ) - text_regions_p = resize_image(text_regions_p,org_h_l_m, org_w_l_m ) - textline_mask_tot = resize_image(textline_mask_tot,org_h_l_m, org_w_l_m ) - text_regions_p_1 = resize_image(text_regions_p_1,org_h_l_m, org_w_l_m ) - table_prediction = resize_image(table_prediction,org_h_l_m, org_w_l_m ) - image_page_rotated = resize_image(image_page_rotated,org_h_l_m, org_w_l_m ) - - self.logger.info("detection of marginals took %.1fs", time.time() - t1) - #print("text region early 2 marginal in %.1fs", time.time() - t0) - ## birdan sora chock chakir t1 = time.time() - if not self.full_layout: - polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, \ - boxes, boxes_d, polygons_of_marginals, contours_tables = \ - self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, - num_col_classifier, table_prediction, erosion_hurts) - ###polygons_of_marginals = self.dilate_textregions_contours(polygons_of_marginals) + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ + text_regions_p_1, cont_page, table_prediction = \ + self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, 
num_column_is_classified, erosion_hurts) + self.logger.info("Graphics detection took %.1fs ", time.time() - t1) + #self.logger.info('cont_page %s', cont_page) + #plt.imshow(table_prediction) + #plt.show() + + if not num_col: + self.logger.info("No columns detected, outputting an empty PAGE-XML") + ocr_all_textlines = None + pcgts = self.writer.build_pagexml_no_full_layout( + [], page_coord, [], [], [], [], [], [], [], [], [], [], + cont_page, [], [], ocr_all_textlines) + return pcgts + + #print("text region early in %.1fs", time.time() - t0) + t1 = time.time() + if not self.light_version: + textline_mask_tot_ea = self.run_textline(image_page) + self.logger.info("textline detection took %.1fs", time.time() - t1) + t1 = time.time() + slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) + self.logger.info("deskewing took %.1fs", time.time() - t1) + elif num_col_classifier in (1,2): + org_h_l_m = textline_mask_tot_ea.shape[0] + org_w_l_m = textline_mask_tot_ea.shape[1] + if num_col_classifier == 1: + img_w_new = 2000 else: - polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, \ - regions_fully, regions_without_separators, polygons_of_marginals, contours_tables = \ - self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, - num_col_classifier, img_only_regions, table_prediction, erosion_hurts, - img_bin_light if self.light_version else None) - ###polygons_of_marginals = self.dilate_textregions_contours(polygons_of_marginals) - if self.light_version: - drop_label_in_full_layout = 4 - textline_mask_tot_ea_org[img_revised_tab==drop_label_in_full_layout] = 0 + img_w_new = 2400 + img_h_new = img_w_new * textline_mask_tot_ea.shape[0] // textline_mask_tot_ea.shape[1] + + image_page = resize_image(image_page,img_h_new, img_w_new ) + textline_mask_tot_ea = resize_image(textline_mask_tot_ea,img_h_new, img_w_new ) + mask_images = resize_image(mask_images,img_h_new, img_w_new ) + mask_lines = resize_image(mask_lines,img_h_new, img_w_new ) + text_regions_p_1 = resize_image(text_regions_p_1,img_h_new, img_w_new ) + table_prediction = resize_image(table_prediction,img_h_new, img_w_new ) + + textline_mask_tot, text_regions_p, image_page_rotated = \ + self.run_marginals(image_page, textline_mask_tot_ea, mask_images, mask_lines, + num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) + + if self.light_version and num_col_classifier in (1,2): + image_page = resize_image(image_page,org_h_l_m, org_w_l_m ) + textline_mask_tot_ea = resize_image(textline_mask_tot_ea,org_h_l_m, org_w_l_m ) + text_regions_p = resize_image(text_regions_p,org_h_l_m, org_w_l_m ) + textline_mask_tot = resize_image(textline_mask_tot,org_h_l_m, org_w_l_m ) + text_regions_p_1 = resize_image(text_regions_p_1,org_h_l_m, org_w_l_m ) + table_prediction = resize_image(table_prediction,org_h_l_m, org_w_l_m ) + image_page_rotated = resize_image(image_page_rotated,org_h_l_m, org_w_l_m ) + + self.logger.info("detection of marginals took %.1fs", time.time() - t1) + #print("text region early 2 marginal in %.1fs", time.time() - t0) + ## birdan sora chock chakir + t1 = time.time() + if not self.full_layout: + polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, \ + boxes, boxes_d, polygons_of_marginals, contours_tables = \ + self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, + num_col_classifier, table_prediction, erosion_hurts) + 
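+            # boxes/boxes_d are the per-column segments (original and deskewed) that the reading order is derived from further down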
###polygons_of_marginals = self.dilate_textregions_contours(polygons_of_marginals) + else: + polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_separators_d, \ + regions_fully, regions_without_separators, polygons_of_marginals, contours_tables = \ + self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, + num_col_classifier, img_only_regions, table_prediction, erosion_hurts, + img_bin_light if self.light_version else None) + ###polygons_of_marginals = self.dilate_textregions_contours(polygons_of_marginals) + if self.light_version: + drop_label_in_full_layout = 4 + textline_mask_tot_ea_org[img_revised_tab==drop_label_in_full_layout] = 0 - text_only = ((img_revised_tab[:, :] == 1)) * 1 + text_only = ((img_revised_tab[:, :] == 1)) * 1 + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 + + #print("text region early 2 in %.1fs", time.time() - t0) + ###min_con_area = 0.000005 + contours_only_text, hir_on_text = return_contours_of_image(text_only) + contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) + if len(contours_only_text_parent) > 0: + areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) + areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + #self.logger.info('areas_cnt_text %s', areas_cnt_text) + contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] + contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) + if areas_cnt_text[jz] > MIN_AREA_REGION] + areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION] + index_con_parents = np.argsort(areas_cnt_text_parent) + + contours_only_text_parent = self.return_list_of_contours_with_desired_order( + contours_only_text_parent, index_con_parents) + + ##try: + ##contours_only_text_parent = \ + ##list(np.array(contours_only_text_parent,dtype=object)[index_con_parents]) + ##except: + ##contours_only_text_parent = \ + ##list(np.array(contours_only_text_parent,dtype=np.int32)[index_con_parents]) + ##areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) + areas_cnt_text_parent = self.return_list_of_contours_with_desired_order( + areas_cnt_text_parent, index_con_parents) + + cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest]) + cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 - - #print("text region early 2 in %.1fs", time.time() - t0) - ###min_con_area = 0.000005 - contours_only_text, hir_on_text = return_contours_of_image(text_only) - contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - if len(contours_only_text_parent) > 0: - areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) - #self.logger.info('areas_cnt_text %s', areas_cnt_text) - contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] - contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) - if areas_cnt_text[jz] > MIN_AREA_REGION] - areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION] - index_con_parents = np.argsort(areas_cnt_text_parent) - - contours_only_text_parent = 
self.return_list_of_contours_with_desired_order( - contours_only_text_parent, index_con_parents) - - ##try: - ##contours_only_text_parent = \ - ##list(np.array(contours_only_text_parent,dtype=object)[index_con_parents]) - ##except: - ##contours_only_text_parent = \ - ##list(np.array(contours_only_text_parent,dtype=np.int32)[index_con_parents]) - ##areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - areas_cnt_text_parent = self.return_list_of_contours_with_desired_order( - areas_cnt_text_parent, index_con_parents) - - cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest]) - cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) + contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) + contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) - contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) - - areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) - areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) - - if len(areas_cnt_text_d)>0: - contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] - index_con_parents_d = np.argsort(areas_cnt_text_d) - contours_only_text_parent_d = self.return_list_of_contours_with_desired_order( - contours_only_text_parent_d, index_con_parents_d) - #try: - #contours_only_text_parent_d = \ - #list(np.array(contours_only_text_parent_d,dtype=object)[index_con_parents_d]) - #except: - #contours_only_text_parent_d = \ - #list(np.array(contours_only_text_parent_d,dtype=np.int32)[index_con_parents_d]) - #areas_cnt_text_d = list(np.array(areas_cnt_text_d)[index_con_parents_d]) - areas_cnt_text_d = self.return_list_of_contours_with_desired_order( - areas_cnt_text_d, index_con_parents_d) - - cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest_d]) - cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_d) - try: - if len(cx_bigest_d) >= 5: - cx_bigest_d_last5 = cx_bigest_d[-5:] - cy_biggest_d_last5 = cy_biggest_d[-5:] - dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + - (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) - for j in range(len(cy_biggest_d_last5))] - ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) - else: - cx_bigest_d_last5 = cx_bigest_d[-len(cx_bigest_d):] - cy_biggest_d_last5 = cy_biggest_d[-len(cx_bigest_d):] - dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + - (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) - for j in range(len(cy_biggest_d_last5))] - ind_largest = len(cx_bigest_d) - len(cx_bigest_d) + np.argmin(dists_d) - - cx_bigest_d_big[0] = cx_bigest_d[ind_largest] - cy_biggest_d_big[0] = cy_biggest_d[ind_largest] - except Exception as why: - self.logger.error(why) - - (h, w) = text_only.shape[:2] - center = (w // 2.0, h // 2.0) - M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) - M_22 = np.array(M)[:2, :2] - p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) - x_diff = p_big[0] - cx_bigest_d_big - y_diff = p_big[1] - cy_biggest_d_big - - contours_only_text_parent_d_ordered = [] - for i in range(len(contours_only_text_parent)): - p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) - p[0] = p[0] - 
x_diff[0] - p[1] = p[1] - y_diff[0] - dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + - (p[1] - cy_biggest_d[j]) ** 2) - for j in range(len(cx_bigest_d))] - contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) - # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) - # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) - # plt.imshow(img2[:,:,0]) - # plt.show() - else: - contours_only_text_parent_d_ordered = [] - contours_only_text_parent_d = [] - contours_only_text_parent = [] + areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) + areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) + if len(areas_cnt_text_d)>0: + contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] + index_con_parents_d = np.argsort(areas_cnt_text_d) + contours_only_text_parent_d = self.return_list_of_contours_with_desired_order( + contours_only_text_parent_d, index_con_parents_d) + #try: + #contours_only_text_parent_d = \ + #list(np.array(contours_only_text_parent_d,dtype=object)[index_con_parents_d]) + #except: + #contours_only_text_parent_d = \ + #list(np.array(contours_only_text_parent_d,dtype=np.int32)[index_con_parents_d]) + #areas_cnt_text_d = list(np.array(areas_cnt_text_d)[index_con_parents_d]) + areas_cnt_text_d = self.return_list_of_contours_with_desired_order( + areas_cnt_text_d, index_con_parents_d) + + cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest_d]) + cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_d) + try: + if len(cx_bigest_d) >= 5: + cx_bigest_d_last5 = cx_bigest_d[-5:] + cy_biggest_d_last5 = cy_biggest_d[-5:] + dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + + (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) + for j in range(len(cy_biggest_d_last5))] + ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) + else: + cx_bigest_d_last5 = cx_bigest_d[-len(cx_bigest_d):] + cy_biggest_d_last5 = cy_biggest_d[-len(cx_bigest_d):] + dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) + for j in range(len(cy_biggest_d_last5))] + ind_largest = len(cx_bigest_d) - len(cx_bigest_d) + np.argmin(dists_d) + + cx_bigest_d_big[0] = cx_bigest_d[ind_largest] + cy_biggest_d_big[0] = cy_biggest_d[ind_largest] + except Exception as why: + self.logger.error(why) + + (h, w) = text_only.shape[:2] + center = (w // 2.0, h // 2.0) + M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) + M_22 = np.array(M)[:2, :2] + p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) + x_diff = p_big[0] - cx_bigest_d_big + y_diff = p_big[1] - cy_biggest_d_big + + contours_only_text_parent_d_ordered = [] + for i in range(len(contours_only_text_parent)): + p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) + p[0] = p[0] - x_diff[0] + p[1] = p[1] - y_diff[0] + dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + + (p[1] - cy_biggest_d[j]) ** 2) + for j in range(len(cx_bigest_d))] + contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) + # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) + # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) + # plt.imshow(img2[:,:,0]) + # plt.show() else: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] - #contours_only_text_parent = [] - if not 
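The block above re-associates every text region with its counterpart in the deskewed raster: each centroid is rotated by the deskew angle (via the 2x2 rotation part of `cv2.getRotationMatrix2D`), shifted by the offset observed on the largest region, and matched to the nearest deskewed centroid. A condensed sketch of that matching, with illustrative names rather than the exact implementation:

```python
import math
import numpy as np
import cv2

def match_deskewed_regions(centroids, centroids_d, anchor, anchor_d, slope_deskew, shape):
    """For each original centroid, return the index of the nearest deskewed centroid.

    centroids, centroids_d: sequences of (cx, cy); anchor, anchor_d: centroid of the
    largest region in the original and the deskewed image, respectively.
    """
    h, w = shape[:2]
    M = cv2.getRotationMatrix2D((w / 2.0, h / 2.0), slope_deskew, 1.0)
    M22 = np.array(M)[:2, :2]                                  # rotation part only
    offset = M22 @ np.asarray(anchor) - np.asarray(anchor_d)   # frame translation
    matches = []
    for cx, cy in centroids:
        p = M22 @ np.array([cx, cy]) - offset                  # predicted deskewed position
        dists = [math.hypot(p[0] - dx, p[1] - dy) for dx, dy in centroids_d]
        matches.append(int(np.argmin(dists)))
    return matches
```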
len(contours_only_text_parent): - # stop early - empty_marginals = [[]] * len(polygons_of_marginals) - if self.full_layout: - pcgts = self.writer.build_pagexml_full_layout( - [], [], page_coord, [], [], [], [], [], [], - polygons_of_images, contours_tables, [], - polygons_of_marginals, empty_marginals, empty_marginals, [], [], [], - cont_page, polygons_lines_xml, []) - else: - pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], - polygons_of_images, - polygons_of_marginals, empty_marginals, empty_marginals, [], [], - cont_page, polygons_lines_xml, contours_tables, []) - self.logger.info("Job done in %.1fs", time.time() - t0) - if self.dir_in: - self.writer.write_pagexml(pcgts) - continue - else: - return pcgts + contours_only_text_parent = [] - #print("text region early 3 in %.1fs", time.time() - t0) - if self.light_version: - contours_only_text_parent = self.dilate_textregions_contours( - contours_only_text_parent) - contours_only_text_parent = self.filter_contours_inside_a_bigger_one( - contours_only_text_parent, text_only, marginal_cnts=polygons_of_marginals) - #print("text region early 3.5 in %.1fs", time.time() - t0) - txt_con_org = get_textregion_contours_in_org_image_light( - contours_only_text_parent, self.image, slope_first, map=self.executor.map) - #txt_con_org = self.dilate_textregions_contours(txt_con_org) - #contours_only_text_parent = self.dilate_textregions_contours(contours_only_text_parent) else: - txt_con_org = get_textregion_contours_in_org_image( - contours_only_text_parent, self.image, slope_first) - #print("text region early 4 in %.1fs", time.time() - t0) - boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) - boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) - #print("text region early 5 in %.1fs", time.time() - t0) - ## birdan sora chock chakir - if not self.curved_line: - if self.light_version: - if self.textline_light: - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ - all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_light2( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, - image_page_rotated, boxes_text, slope_deskew) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_light2( - polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, - image_page_rotated, boxes_marginals, slope_deskew) - - #slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con = \ - # self.delete_regions_without_textlines(slopes, all_found_textline_polygons, - # boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con) - #slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, polygons_of_marginals, polygons_of_marginals, _ = \ - # self.delete_regions_without_textlines(slopes_marginals, all_found_textline_polygons_marginals, - # boxes_marginals, polygons_of_marginals, polygons_of_marginals, np.array(range(len(polygons_of_marginals)))) - #all_found_textline_polygons = self.dilate_textlines(all_found_textline_polygons) - #####all_found_textline_polygons = self.dilate_textline_contours(all_found_textline_polygons) - all_found_textline_polygons = self.dilate_textregions_contours_textline_version( - all_found_textline_polygons) - all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( - 
all_found_textline_polygons, textline_mask_tot_ea_org, type_contour="textline") - all_found_textline_polygons_marginals = self.dilate_textregions_contours_textline_version( - all_found_textline_polygons_marginals) - contours_only_text_parent, txt_con_org, all_found_textline_polygons, contours_only_text_parent_d_ordered, \ - index_by_text_par_con = self.filter_contours_without_textline_inside( - contours_only_text_parent, txt_con_org, all_found_textline_polygons, contours_only_text_parent_d_ordered) - else: - textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1) - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, \ - index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_light( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea, - image_page_rotated, boxes_text, slope_deskew) - all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_light( - polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, - image_page_rotated, boxes_marginals, slope_deskew) - #all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( - # all_found_textline_polygons, textline_mask_tot_ea_org, type_contour="textline") + contours_only_text_parent_d_ordered = [] + contours_only_text_parent_d = [] + #contours_only_text_parent = [] + if not len(contours_only_text_parent): + # stop early + empty_marginals = [[]] * len(polygons_of_marginals) + if self.full_layout: + pcgts = self.writer.build_pagexml_full_layout( + [], [], page_coord, [], [], [], [], [], [], + polygons_of_images, contours_tables, [], + polygons_of_marginals, empty_marginals, empty_marginals, [], [], [], + cont_page, polygons_lines_xml, []) + else: + pcgts = self.writer.build_pagexml_no_full_layout( + [], page_coord, [], [], [], [], + polygons_of_images, + polygons_of_marginals, empty_marginals, empty_marginals, [], [], + cont_page, polygons_lines_xml, contours_tables, []) + return pcgts + + #print("text region early 3 in %.1fs", time.time() - t0) + if self.light_version: + contours_only_text_parent = self.dilate_textregions_contours( + contours_only_text_parent) + contours_only_text_parent = self.filter_contours_inside_a_bigger_one( + contours_only_text_parent, text_only, marginal_cnts=polygons_of_marginals) + #print("text region early 3.5 in %.1fs", time.time() - t0) + txt_con_org = get_textregion_contours_in_org_image_light( + contours_only_text_parent, self.image, slope_first, map=self.executor.map) + #txt_con_org = self.dilate_textregions_contours(txt_con_org) + #contours_only_text_parent = self.dilate_textregions_contours(contours_only_text_parent) + else: + txt_con_org = get_textregion_contours_in_org_image( + contours_only_text_parent, self.image, slope_first) + #print("text region early 4 in %.1fs", time.time() - t0) + boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) + boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) + #print("text region early 5 in %.1fs", time.time() - t0) + ## birdan sora chock chakir + if not self.curved_line: + if self.light_version: + if self.textline_light: + all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ + all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_light2( + txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, + image_page_rotated, boxes_text, 
slope_deskew) + all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ + all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_light2( + polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, + image_page_rotated, boxes_marginals, slope_deskew) + + #slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con = \ + # self.delete_regions_without_textlines(slopes, all_found_textline_polygons, + # boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con) + #slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, polygons_of_marginals, polygons_of_marginals, _ = \ + # self.delete_regions_without_textlines(slopes_marginals, all_found_textline_polygons_marginals, + # boxes_marginals, polygons_of_marginals, polygons_of_marginals, np.array(range(len(polygons_of_marginals)))) + #all_found_textline_polygons = self.dilate_textlines(all_found_textline_polygons) + #####all_found_textline_polygons = self.dilate_textline_contours(all_found_textline_polygons) + all_found_textline_polygons = self.dilate_textregions_contours_textline_version( + all_found_textline_polygons) + all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( + all_found_textline_polygons, textline_mask_tot_ea_org, type_contour="textline") + all_found_textline_polygons_marginals = self.dilate_textregions_contours_textline_version( + all_found_textline_polygons_marginals) + contours_only_text_parent, txt_con_org, all_found_textline_polygons, contours_only_text_parent_d_ordered, \ + index_by_text_par_con = self.filter_contours_without_textline_inside( + contours_only_text_parent, txt_con_org, all_found_textline_polygons, contours_only_text_parent_d_ordered) else: textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1) - all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ - all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new( + all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, \ + index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_light( txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new( + all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_light( polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + #all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( + # all_found_textline_polygons, textline_mask_tot_ea_org, type_contour="textline") else: - scale_param = 1 - textline_mask_tot_ea_erode = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=2) + textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1) all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ - all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved( - txt_con_org, contours_only_text_parent, textline_mask_tot_ea_erode, - image_page_rotated, boxes_text, text_only, - num_col_classifier, scale_param, slope_deskew) - all_found_textline_polygons = small_textlines_to_parent_adherence2( - all_found_textline_polygons, textline_mask_tot_ea, num_col_classifier) + 
all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new( + txt_con_org, contours_only_text_parent, textline_mask_tot_ea, + image_page_rotated, boxes_text, slope_deskew) all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ - all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved( - polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_erode, - image_page_rotated, boxes_marginals, text_only, - num_col_classifier, scale_param, slope_deskew) - all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2( - all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - - #print("text region early 6 in %.1fs", time.time() - t0) - if self.full_layout: - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( - contours_only_text_parent_d_ordered, index_by_text_par_con) - #try: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con]) - #except: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) - else: - #takes long timee - contours_only_text_parent_d_ordered = None - if self.light_version: - fun = check_any_text_region_in_model_one_is_main_or_header_light - else: - fun = check_any_text_region_in_model_one_is_main_or_header - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, \ - all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, \ - contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = fun( - text_regions_p, regions_fully, contours_only_text_parent, - all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) - - if self.plotter: - self.plotter.save_plot_of_layout(text_regions_p, image_page) - self.plotter.save_plot_of_layout_all(text_regions_p, image_page) - - pixel_img = 4 - polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( - text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, - all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, - kernel=KERNEL, curved_line=self.curved_line, textline_light=self.textline_light) - - if not self.reading_order_machine_based: - pixel_seps = 6 - if not self.headers_off: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_seps, contours_only_text_parent_h) - else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_seps, contours_only_text_parent_h_d_ordered) - elif self.headers_off: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_seps) - else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - 
np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), - num_col_classifier, self.tables, pixel_seps) - - if num_col_classifier >= 3: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - regions_without_separators = regions_without_separators.astype(np.uint8) - regions_without_separators = cv2.erode(regions_without_separators[:, :], KERNEL, iterations=6) - else: - regions_without_separators_d = regions_without_separators_d.astype(np.uint8) - regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) + all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new( + polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, + image_page_rotated, boxes_marginals, slope_deskew) + else: + scale_param = 1 + textline_mask_tot_ea_erode = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=2) + all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, \ + all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved( + txt_con_org, contours_only_text_parent, textline_mask_tot_ea_erode, + image_page_rotated, boxes_text, text_only, + num_col_classifier, scale_param, slope_deskew) + all_found_textline_polygons = small_textlines_to_parent_adherence2( + all_found_textline_polygons, textline_mask_tot_ea, num_col_classifier) + all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, \ + all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved( + polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_erode, + image_page_rotated, boxes_marginals, text_only, + num_col_classifier, scale_param, slope_deskew) + all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2( + all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) + + #print("text region early 6 in %.1fs", time.time() - t0) + if self.full_layout: + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( + contours_only_text_parent_d_ordered, index_by_text_par_con) + #try: + #contours_only_text_parent_d_ordered = \ + #list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con]) + #except: + #contours_only_text_parent_d_ordered = \ + #list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) + else: + #takes long timee + contours_only_text_parent_d_ordered = None + if self.light_version: + fun = check_any_text_region_in_model_one_is_main_or_header_light + else: + fun = check_any_text_region_in_model_one_is_main_or_header + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, \ + all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, \ + contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = fun( + text_regions_p, regions_fully, contours_only_text_parent, + all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) + if self.plotter: + self.plotter.save_plot_of_layout(text_regions_p, image_page) + self.plotter.save_plot_of_layout_all(text_regions_p, image_page) + + pixel_img = 4 + polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) + all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( + text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, + 
all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, + kernel=KERNEL, curved_line=self.curved_line, textline_light=self.textline_light) + + if not self.reading_order_machine_based: + pixel_seps = 6 + if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, - num_col_classifier, erosion_hurts, self.tables, self.right2left) + num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), + num_col_classifier, self.tables, pixel_seps, contours_only_text_parent_h) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, - num_col_classifier, erosion_hurts, self.tables, self.right2left) - - if self.plotter: - self.plotter.write_images_into_directory(polygons_of_images, image_page) - t_order = time.time() + _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), + num_col_classifier, self.tables, pixel_seps, contours_only_text_parent_h_d_ordered) + elif self.headers_off: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), + num_col_classifier, self.tables, pixel_seps) + else: + _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), + num_col_classifier, self.tables, pixel_seps) - if self.full_layout: - if self.reading_order_machine_based: - order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( - contours_only_text_parent, contours_only_text_parent_h, text_regions_p) - else: + if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) + regions_without_separators = regions_without_separators.astype(np.uint8) + regions_without_separators = cv2.erode(regions_without_separators[:, :], KERNEL, iterations=6) else: - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - self.logger.info("detection of reading order took %.1fs", time.time() - t_order) + regions_without_separators_d = regions_without_separators_d.astype(np.uint8) + regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) - if self.ocr: - ocr_all_textlines = [] - else: - ocr_all_textlines = None - pcgts = self.writer.build_pagexml_full_layout( - contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, - polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, - cont_page, polygons_lines_xml, ocr_all_textlines) - self.logger.info("Job done in %.1fs", time.time() - t0) - #print("Job done in %.1fs", time.time() - t0) - if self.dir_in: - 
self.writer.write_pagexml(pcgts) - continue + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( + splitter_y_new, regions_without_separators, matrix_of_lines_ch, + num_col_classifier, erosion_hurts, self.tables, self.right2left) else: - return pcgts + boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( + splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + num_col_classifier, erosion_hurts, self.tables, self.right2left) + + if self.plotter: + self.plotter.write_images_into_directory(polygons_of_images, image_page) + t_order = time.time() + if self.full_layout: + if self.reading_order_machine_based: + order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( + contours_only_text_parent, contours_only_text_parent_h, text_regions_p) else: - contours_only_text_parent_h = None - if self.reading_order_machine_based: - order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( - contours_only_text_parent, contours_only_text_parent_h, text_regions_p) + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + order_text_new, id_of_texts_tot = self.do_order_of_regions( + contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) + order_text_new, id_of_texts_tot = self.do_order_of_regions( + contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) + self.logger.info("detection of reading order took %.1fs", time.time() - t_order) + + if self.ocr: + ocr_all_textlines = [] + else: + ocr_all_textlines = None + pcgts = self.writer.build_pagexml_full_layout( + contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, + all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, + polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, + all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, + cont_page, polygons_lines_xml, ocr_all_textlines) + return pcgts + + contours_only_text_parent_h = None + if self.reading_order_machine_based: + order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( + contours_only_text_parent, contours_only_text_parent_h, text_regions_p) + else: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + order_text_new, id_of_texts_tot = self.do_order_of_regions( + contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) + else: + contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( + contours_only_text_parent_d_ordered, index_by_text_par_con) + #try: + #contours_only_text_parent_d_ordered = \ + #list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) + #except: + #contours_only_text_parent_d_ordered = \ + #list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con]) + order_text_new, id_of_texts_tot = self.do_order_of_regions( + contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) + + if self.ocr: + device = cuda.get_current_device() + device.reset() + gc.collect() + model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) + device = torch.device("cuda:0" 
if torch.cuda.is_available() else "cpu") + processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") + torch.cuda.empty_cache() + model_ocr.to(device) + + ind_tot = 0 + #cv2.imwrite('./img_out.png', image_page) + ocr_all_textlines = [] + for indexing, ind_poly_first in enumerate(all_found_textline_polygons): + ocr_textline_in_textregion = [] + for indexing2, ind_poly in enumerate(ind_poly_first): + if not (self.textline_light or self.curved_line): + ind_poly = copy.deepcopy(ind_poly) + box_ind = all_box_coord[indexing] + #print(ind_poly,np.shape(ind_poly), 'ind_poly') + #print(box_ind) + ind_poly = self.return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) + #print(ind_poly_copy) + ind_poly[ind_poly<0] = 0 + x, y, w, h = cv2.boundingRect(ind_poly) + #print(ind_poly_copy, np.shape(ind_poly_copy)) + #print(x, y, w, h, h/float(w),'ratio') + h2w_ratio = h/float(w) + mask_poly = np.zeros(image_page.shape) + if not self.light_version: + img_poly_on_img = np.copy(image_page) else: - contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( - contours_only_text_parent_d_ordered, index_by_text_par_con) - #try: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) - #except: - #contours_only_text_parent_d_ordered = \ - #list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con]) - order_text_new, id_of_texts_tot = self.do_order_of_regions( - contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - - if self.ocr: - device = cuda.get_current_device() - device.reset() - gc.collect() - model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") - torch.cuda.empty_cache() - model_ocr.to(device) - - ind_tot = 0 - #cv2.imwrite('./img_out.png', image_page) - ocr_all_textlines = [] - for indexing, ind_poly_first in enumerate(all_found_textline_polygons): - ocr_textline_in_textregion = [] - for indexing2, ind_poly in enumerate(ind_poly_first): - if not (self.textline_light or self.curved_line): - ind_poly = copy.deepcopy(ind_poly) - box_ind = all_box_coord[indexing] - #print(ind_poly,np.shape(ind_poly), 'ind_poly') - #print(box_ind) - ind_poly = self.return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) - #print(ind_poly_copy) - ind_poly[ind_poly<0] = 0 - x, y, w, h = cv2.boundingRect(ind_poly) - #print(ind_poly_copy, np.shape(ind_poly_copy)) - #print(x, y, w, h, h/float(w),'ratio') - h2w_ratio = h/float(w) - mask_poly = np.zeros(image_page.shape) - if not self.light_version: - img_poly_on_img = np.copy(image_page) - else: - img_poly_on_img = np.copy(img_bin_light) - mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) - - if self.textline_light: - mask_poly = cv2.dilate(mask_poly, KERNEL, iterations=1) - img_poly_on_img[:,:,0][mask_poly[:,:,0] ==0] = 255 - img_poly_on_img[:,:,1][mask_poly[:,:,0] ==0] = 255 - img_poly_on_img[:,:,2][mask_poly[:,:,0] ==0] = 255 - - img_croped = img_poly_on_img[y:y+h, x:x+w, :] - #cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped) - text_ocr = self.return_ocr_of_textline_without_common_section(img_croped, model_ocr, processor, device, w, h2w_ratio, ind_tot) - ocr_textline_in_textregion.append(text_ocr) - ind_tot = ind_tot +1 - 
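The OCR branch above masks each text line with `cv2.fillPoly`, crops it via `cv2.boundingRect`, and runs the crop through TrOCR. A self-contained sketch of that inference path, assuming the stock `transformers` API and the public `microsoft/trocr-base-printed` checkpoint named in the code (the code itself loads its model weights from `self.model_ocr_dir`):

```python
import torch
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(device)

def ocr_textline(line_crop: Image.Image) -> str:
    # encode the cropped line, autoregressively generate token ids, decode to text
    pixel_values = processor(images=line_crop, return_tensors="pt").pixel_values.to(device)
    generated_ids = model.generate(pixel_values)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# e.g. on one of the crops that the commented-out cv2.imwrite above would produce:
# print(ocr_textline(Image.open("extracted_lines/0.jpg").convert("RGB")))
```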
ocr_all_textlines.append(ocr_textline_in_textregion) + img_poly_on_img = np.copy(img_bin_light) + mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) + + if self.textline_light: + mask_poly = cv2.dilate(mask_poly, KERNEL, iterations=1) + img_poly_on_img[:,:,0][mask_poly[:,:,0] ==0] = 255 + img_poly_on_img[:,:,1][mask_poly[:,:,0] ==0] = 255 + img_poly_on_img[:,:,2][mask_poly[:,:,0] ==0] = 255 + + img_croped = img_poly_on_img[y:y+h, x:x+w, :] + #cv2.imwrite('./extracted_lines/'+str(ind_tot)+'.jpg', img_croped) + text_ocr = self.return_ocr_of_textline_without_common_section(img_croped, model_ocr, processor, device, w, h2w_ratio, ind_tot) + ocr_textline_in_textregion.append(text_ocr) + ind_tot = ind_tot +1 + ocr_all_textlines.append(ocr_textline_in_textregion) + + else: + ocr_all_textlines = None + #print(ocr_all_textlines) + self.logger.info("detection of reading order took %.1fs", time.time() - t_order) + pcgts = self.writer.build_pagexml_no_full_layout( + txt_con_org, page_coord, order_text_new, id_of_texts_tot, + all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, + all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines) + return pcgts + - else: - ocr_all_textlines = None - #print(ocr_all_textlines) - self.logger.info("detection of reading order took %.1fs", time.time() - t_order) - pcgts = self.writer.build_pagexml_no_full_layout( - txt_con_org, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines) - #print("Job done in %.1fs" % (time.time() - t0)) - self.logger.info("Job done in %.1fs", time.time() - t0) - if not self.dir_in: - return pcgts - #print("text region early 7 in %.1fs", time.time() - t0) - - if self.dir_in: - self.writer.write_pagexml(pcgts) - self.logger.info("Job done in %.1fs", time.time() - t0) - #print("Job done in %.1fs" % (time.time() - t0)) - - if self.dir_in: - self.logger.info("All jobs done in %.1fs", time.time() - t0_tot) - print("all Job done in %.1fs", time.time() - t0_tot) - - class Eynollah_ocr: def __init__( self, diff --git a/src/eynollah/ocrd-tool-binarization.json b/src/eynollah/ocrd-tool-binarization.json deleted file mode 100644 index 1711e89..0000000 --- a/src/eynollah/ocrd-tool-binarization.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "version": "0.1.0", - "git_url": "https://github.com/qurator-spk/sbb_binarization", - "tools": { - "ocrd-sbb-binarize": { - "executable": "ocrd-sbb-binarize", - "description": "Pixelwise binarization with selectional auto-encoders in Keras", - "categories": ["Image preprocessing"], - "steps": ["preprocessing/optimization/binarization"], - "input_file_grp": [], - "output_file_grp": [], - "parameters": { - "operation_level": { - "type": "string", - "enum": ["page", "region"], - "default": "page", - "description": "PAGE XML hierarchy level to operate on" - }, - "model": { - "description": "Directory containing HDF5 or SavedModel/ProtoBuf models. 
Can be an absolute path or a path relative to the OCR-D resource location, the current working directory or the $SBB_BINARIZE_DATA environment variable (if set)", - "type": "string", - "format": "uri", - "content-type": "text/directory", - "required": true - } - }, - "resources": [ - { - "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip", - "name": "default", - "type": "archive", - "path_in_archive": "saved_model_2020_01_16", - "size": 563147331, - "description": "default models provided by github.com/qurator-spk (SavedModel format)" - }, - { - "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip", - "name": "default-2021-03-09", - "type": "archive", - "path_in_archive": ".", - "size": 133230419, - "description": "updated default models provided by github.com/qurator-spk (SavedModel format)" - } - ] - } - } -} diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index 9eb8932..125131c 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -1,21 +1,22 @@ { "version": "0.3.1", "git_url": "https://github.com/qurator-spk/eynollah", + "dockerhub": "ocrd/eynollah", "tools": { "ocrd-eynollah-segment": { "executable": "ocrd-eynollah-segment", "categories": ["Layout analysis"], "description": "Segment page into regions and lines and do reading order detection with eynollah", - "input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"], - "output_file_grp": ["OCR-D-SEG-LINE"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/region", "layout/segmentation/line"], "parameters": { "models": { "type": "string", - "format": "file", + "format": "uri", "content-type": "text/directory", "cacheable": true, - "description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)", + "description": "Directory containing models to be used (See https://qurator-data.de/eynollah)", "required": true }, "dpi": { @@ -32,7 +33,7 @@ "light_version": { "type": "boolean", "default": true, - "description": "Try to detect all element subtypes in light version" + "description": "Try to detect all element subtypes in light version (faster+simpler method for main region detection and deskewing)" }, "textline_light": { "type": "boolean", "default": false, @@ -49,11 +50,26 @@ "default": false, "description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time" }, + "ignore_page_extraction": { + "type": "boolean", + "default": false, + "description": "if set to true, skip the page extraction step (do not crop the scan to the page frame)" + }, "allow_scaling": { "type": "boolean", "default": false, "description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)" }, + "allow_enhancement": { + "type": "boolean", + "default": false, + "description": "if set to true, check whether the input image needs resizing and enhancement, and apply them if necessary."
+ }, + "right_to_left": { + "type": "boolean", + "default": false, + "description": "if set to true, extract regions in right-to-left reading order." + }, "headers_off": { "type": "boolean", "default": false, @@ -61,14 +77,55 @@ } }, "resources": [ - { - "description": "models for eynollah (TensorFlow SavedModel format)", - "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz", - "name": "default", - "size": 1894627041, - "type": "archive", - "path_in_archive": "models_eynollah" - } + { + "description": "models for eynollah (TensorFlow SavedModel format)", + "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz", + "name": "default", + "size": 1894627041, + "type": "archive", + "path_in_archive": "models_eynollah" + } + ] + }, + "ocrd-sbb-binarize": { + "executable": "ocrd-sbb-binarize", + "description": "Pixelwise binarization with selectional auto-encoders in Keras", + "categories": ["Image preprocessing"], + "steps": ["preprocessing/optimization/binarization"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "parameters": { + "operation_level": { + "type": "string", + "enum": ["page", "region"], + "default": "page", + "description": "PAGE XML hierarchy level to operate on" + }, + "model": { + "description": "Directory containing HDF5 or SavedModel/ProtoBuf models. Can be an absolute path or a path relative to the OCR-D resource location, the current working directory or the $SBB_BINARIZE_DATA environment variable (if set)", + "type": "string", + "format": "uri", + "content-type": "text/directory", + "required": true + } + }, + "resources": [ + { + "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip", + "name": "default", + "type": "archive", + "path_in_archive": "saved_model_2020_01_16", + "size": 563147331, + "description": "default models provided by github.com/qurator-spk (SavedModel format)" + }, + { + "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip", + "name": "default-2021-03-09", + "type": "archive", + "path_in_archive": ".", + "size": 133230419, + "description": "updated default models provided by github.com/qurator-spk (SavedModel format)" + } ] } } diff --git a/src/eynollah/ocrd_cli_binarization.py b/src/eynollah/ocrd_cli_binarization.py index 6a8bbdc..848bbac 100644 --- a/src/eynollah/ocrd_cli_binarization.py +++ b/src/eynollah/ocrd_cli_binarization.py @@ -1,29 +1,16 @@ -from os import environ -from os.path import join -from pathlib import Path -from pkg_resources import resource_string -from json import loads +from typing import Optional from PIL import Image import numpy as np import cv2 from click import command -from ocrd_utils import ( - getLogger, - assert_file_grp_cardinality, - make_file_id, - MIMETYPE_PAGE -) -from ocrd import Processor -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import AlternativeImageType, to_xml +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage +from ocrd_models.ocrd_page import OcrdPage, AlternativeImageType from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from .sbb_binarize import SbbBinarizer -OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool-binarization.json').decode('utf8')) -TOOL = 'ocrd-sbb-binarize' def cv2pil(img): return Image.fromarray(img.astype('uint8')) @@ -35,39 +22,22 @@ def pil2cv(img): return
cv2.cvtColor(pil_as_np_array, color_conversion) class SbbBinarizeProcessor(Processor): + # already employs GPU (without singleton process atm) + max_workers = 1 - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + @property + def executable(self): + return 'ocrd-sbb-binarize' def setup(self): """ Set up the model prior to processing. """ - LOG = getLogger('processor.SbbBinarize.__init__') - if not 'model' in self.parameter: - raise ValueError("'model' parameter is required") - # resolve relative path via environment variable - model_path = Path(self.parameter['model']) - if not model_path.is_absolute(): - if 'SBB_BINARIZE_DATA' in environ and environ['SBB_BINARIZE_DATA']: - LOG.info("Environment variable SBB_BINARIZE_DATA is set to '%s'" \ - " - prepending to model value '%s'. If you don't want this mechanism," \ - " unset the SBB_BINARIZE_DATA environment variable.", - environ['SBB_BINARIZE_DATA'], model_path) - model_path = Path(environ['SBB_BINARIZE_DATA']).joinpath(model_path) - model_path = model_path.resolve() - if not model_path.is_dir(): - raise FileNotFoundError("Does not exist or is not a directory: %s" % model_path) # resolve relative path via OCR-D ResourceManager - model_path = self.resolve_resource(str(model_path)) - self.binarizer = SbbBinarizer(model_dir=model_path, logger=LOG) + model_path = self.resolve_resource(self.parameter['model']) + self.binarizer = SbbBinarizer(model_dir=model_path, logger=self.logger) - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ Binarize images with sbb_binarization (based on selectional auto-encoders). @@ -88,71 +58,52 @@ class SbbBinarizeProcessor(Processor): Produce a new PAGE output file by serialising the resulting hierarchy. 
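For reference, the `SbbBinarizer` set up above can also be driven outside of the OCR-D wrapper through the same `run` call; `cv2pil`/`pil2cv` merely shuttle arrays between PIL's RGB and OpenCV's BGR conventions. A minimal standalone sketch (model directory and file names are placeholders):

```python
import cv2
from PIL import Image
from eynollah.sbb_binarize import SbbBinarizer

binarizer = SbbBinarizer(model_dir="/path/to/default-2021-03-09")  # placeholder model dir
page = cv2.imread("page.tif")                                      # BGR numpy array
bin_img = binarizer.run(image=page, use_patches=True)              # patch-wise prediction
Image.fromarray(bin_img.astype("uint8")).save("page.bin.png")
```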
""" - LOG = getLogger('processor.SbbBinarize') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - + assert input_pcgts + assert input_pcgts[0] + assert self.parameter oplevel = self.parameter['operation_level'] - - for n, input_file in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - pcgts.set_pcGtsId(file_id) - page = pcgts.get_Page() - page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - - if oplevel == 'page': - LOG.info("Binarizing on 'page' level in page '%s'", page_id) - bin_image = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True)) - # update METS (add the image file): - bin_image_path = self.workspace.save_image_file(bin_image, - file_id + '.IMG-BIN', - page_id=input_file.pageId, - file_grp=self.output_file_grp) - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments='%s,binarized' % page_xywh['features'])) - - elif oplevel == 'region': - regions = page.get_AllRegions(['Text', 'Table'], depth=1) - if not regions: - LOG.warning("Page '%s' contains no text/table regions", page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh, feature_filter='binarized') - region_image_bin = cv2pil(binarizer.run(image=pil2cv(region_image), use_patches=True)) - region_image_bin_path = self.workspace.save_image_file( - region_image_bin, - "%s_%s.IMG-BIN" % (file_id, region.id), - page_id=input_file.pageId, - file_grp=self.output_file_grp) - region.add_AlternativeImage( - AlternativeImageType(filename=region_image_bin_path, comments='%s,binarized' % region_xywh['features'])) - - elif oplevel == 'line': - region_line_tuples = [(r.id, r.get_TextLine()) for r in page.get_AllRegions(['Text'], depth=0)] - if not region_line_tuples: - LOG.warning("Page '%s' contains no text lines", page_id) - for region_id, line in region_line_tuples: - line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') - line_image_bin = cv2pil(binarizer.run(image=pil2cv(line_image), use_patches=True)) - line_image_bin_path = self.workspace.save_image_file( - line_image_bin, - "%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id), - page_id=input_file.pageId, - file_grp=self.output_file_grp) - line.add_AlternativeImage( - AlternativeImageType(filename=line_image_bin_path, comments='%s,binarized' % line_xywh['features'])) - - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, file_id + '.xml'), - content=to_xml(pcgts)) + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + page_image, page_xywh, _ = self.workspace.image_from_page( + page, page_id, feature_filter='binarized') + + if oplevel == 'page': + self.logger.info("Binarizing on 'page' level in page '%s'", page_id) + page_image_bin = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True)) + # update PAGE (reference the image file): + page_image_ref = AlternativeImageType(comments=page_xywh['features'] + ',binarized,clipped') + page.add_AlternativeImage(page_image_ref) + 
result.images.append(OcrdPageResultImage(page_image_bin, '.IMG-BIN', page_image_ref)) + + elif oplevel == 'region': + regions = page.get_AllRegions(['Text', 'Table'], depth=1) + if not regions: + self.logger.warning("Page '%s' contains no text/table regions", page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, feature_filter='binarized') + region_image_bin = cv2pil(self.binarizer.run(image=pil2cv(region_image), use_patches=True)) + # update PAGE (reference the image file): + region_image_ref = AlternativeImageType(comments=region_xywh['features'] + ',binarized') + region.add_AlternativeImage(region_image_ref) + result.images.append(OcrdPageResultImage(region_image_bin, region.id + '.IMG-BIN', region_image_ref)) + + elif oplevel == 'line': + lines = page.get_AllTextLines() + if not lines: + self.logger.warning("Page '%s' contains no text lines", page_id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') + line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image), use_patches=True)) + # update PAGE (reference the image file): + line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized') + line.add_AlternativeImage(line_image_ref) + result.images.append(OcrdPageResultImage(line_image_bin, line.id + '.IMG-BIN', line_image_ref)) + + return result @command() @ocrd_cli_options -def cli(*args, **kwargs): +def main(*args, **kwargs): return ocrd_cli_wrap_processor(SbbBinarizeProcessor, *args, **kwargs) diff --git a/src/eynollah/plot.py b/src/eynollah/plot.py index b01fc04..412ae5a 100644 --- a/src/eynollah/plot.py +++ b/src/eynollah/plot.py @@ -1,5 +1,8 @@ -import matplotlib.pyplot as plt -import matplotlib.patches as mpatches +try: + import matplotlib.pyplot as plt + import matplotlib.patches as mpatches +except ImportError: + plt = mpatches = None import numpy as np import os.path import cv2 diff --git a/src/eynollah/processor.py b/src/eynollah/processor.py index 4eced21..8f99489 100644 --- a/src/eynollah/processor.py +++ b/src/eynollah/processor.py @@ -1,71 +1,91 @@ -from json import loads -from pkg_resources import resource_string -from tempfile import NamedTemporaryFile -from pathlib import Path -from os.path import join +from typing import Optional +from ocrd_models import OcrdPage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage -from PIL import Image +from .eynollah import Eynollah, EynollahXmlWriter -from ocrd import Processor -from ocrd_modelfactory import page_from_file, exif_from_filename -from ocrd_models import OcrdFile, OcrdExif -from ocrd_models.ocrd_page import to_xml -from ocrd_utils import ( - getLogger, - MIMETYPE_PAGE, - assert_file_grp_cardinality, - make_file_id -) +class EynollahProcessor(Processor): + # already employs background CPU multiprocessing per page + # already employs GPU (without singleton process atm) + max_workers = 1 -from .eynollah import Eynollah -from .utils.pil_cv2 import pil2cv + @property + def executable(self): + return 'ocrd-eynollah-segment' -OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) + def setup(self) -> None: + if self.parameter['textline_light'] and not self.parameter['light_version']: + raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection, " + "but parameter 'light_version' is not enabled") + self.eynollah = Eynollah( + self.resolve_resource(self.parameter['models']), +
logger=self.logger, + allow_enhancement=self.parameter['allow_enhancement'], + curved_line=self.parameter['curved_line'], + right2left=self.parameter['right_to_left'], + ignore_page_extraction=self.parameter['ignore_page_extraction'], + light_version=self.parameter['light_version'], + textline_light=self.parameter['textline_light'], + full_layout=self.parameter['full_layout'], + allow_scaling=self.parameter['allow_scaling'], + headers_off=self.parameter['headers_off'], + tables=self.parameter['tables'], + ) + self.eynollah.plotter = None -class EynollahProcessor(Processor): + def shutdown(self): + if hasattr(self, 'eynollah'): + del self.eynollah + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + """ + Performs cropping, region and line segmentation with Eynollah. + + For each page, open and deserialize PAGE input file (from existing + PAGE file in the input fileGrp, or generated from image file). + Retrieve its respective page-level image (ignoring annotation that + already added `binarized`, `cropped` or `deskewed` features). + + Set up Eynollah to detect regions and lines, and add each one to the + page, respectively. - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) + \b + - If ``tables``, try to detect table blocks and add them as TableRegion. + - If ``full_layout``, then in addition to paragraphs and marginals, also + try to detect drop capitals and headings. + - If ``ignore_page_extraction``, then attempt no cropping of the page. + - If ``curved_line``, then compute contour polygons for text lines + instead of simple bounding boxes. - def process(self): - LOG = getLogger('eynollah') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) - pcgts = page_from_file(self.workspace.download_file(input_file)) - LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight) - self.add_metadata(pcgts) - page = pcgts.get_Page() - # XXX loses DPI information - # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename - eynollah_kwargs = { - 'dir_models': self.resolve_resource(self.parameter['models']), - 'dir_out': self.output_file_grp, - 'allow_enhancement': False, - 'curved_line': self.parameter['curved_line'], - 'full_layout': self.parameter['full_layout'], - 'allow_scaling': self.parameter['allow_scaling'], - 'light_version': self.parameter['light_version'], - 'textline_light': self.parameter['textline_light'], - 'headers_off': self.parameter['headers_off'], - 'tables': self.parameter['tables'], - 'override_dpi': self.parameter['dpi'], - 'logger': LOG, - 'pcgts': pcgts, - 'image_filename': image_filename - } - Eynollah(**eynollah_kwargs).run() - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=page_id, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, file_id) + '.xml', - content=to_xml(pcgts)) + Produce a new output file by 
+        """
+        assert input_pcgts
+        assert input_pcgts[0]
+        assert self.parameter
+        pcgts = input_pcgts[0]
+        result = OcrdPageResult(pcgts)
+        page = pcgts.get_Page()
+        page_image, _, _ = self.workspace.image_from_page(
+            page, page_id,
+            # avoid any features that would change the coordinate system: cropped,deskewed
+            # (the PAGE builder merely adds regions, so afterwards we would not know which to transform)
+            # also avoid binarization as models usually fare better on grayscale/RGB
+            feature_filter='cropped,deskewed,binarized')
+        if hasattr(page_image, 'filename'):
+            image_filename = page_image.filename
+        else:
+            image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
+        result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
+        # FIXME: mask out already existing regions (incremental segmentation)
+        self.eynollah.cache_images(
+            image_pil=page_image,
+            dpi=self.parameter['dpi'],
+        )
+        self.eynollah.writer = EynollahXmlWriter(
+            dir_out=None,
+            image_filename=image_filename,
+            curved_line=self.eynollah.curved_line,
+            textline_light=self.eynollah.textline_light,
+            pcgts=pcgts)
+        self.eynollah.run_single()
+        return result
diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py
index 36e9ab0..f43b6ba 100644
--- a/src/eynollah/sbb_binarize.py
+++ b/src/eynollah/sbb_binarize.py
@@ -4,24 +4,18 @@ Tool to load model and binarize a given image.
 import sys
 from glob import glob
-from os import environ, devnull
-from os.path import join
-from warnings import catch_warnings, simplefilter
 import os
+import logging
 import numpy as np
 from PIL import Image
 import cv2
-environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-stderr = sys.stderr
-sys.stderr = open(devnull, 'w')
+from ocrd_utils import tf_disable_interactive_logs
+tf_disable_interactive_logs()
 import tensorflow as tf
 from tensorflow.keras.models import load_model
 from tensorflow.python.keras import backend as tensorflow_backend
-sys.stderr = stderr
-
-import logging
 def resize_image(img_in, input_height, input_width):
     return cv2.resize(img_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST)
@@ -53,7 +47,7 @@ class SbbBinarizer:
         del self.session
     def load_model(self, model_name):
-        model = load_model(join(self.model_dir, model_name), compile=False)
+        model = load_model(os.path.join(self.model_dir, model_name), compile=False)
         model_height = model.layers[len(model.layers)-1].output_shape[1]
         model_width = model.layers[len(model.layers)-1].output_shape[2]
         n_classes = model.layers[len(model.layers)-1].output_shape[3]
diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index a67fc38..b93801c 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -1,13 +1,17 @@
+import time
 import math
-import matplotlib.pyplot as plt
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    plt = None
 import numpy as np
 from shapely import geometry
 import cv2
 import imutils
 from scipy.signal import find_peaks
 from scipy.ndimage import gaussian_filter1d
-import time
+
 from .is_nan import isNaN
 from .contour import (contours_in_same_horizon,
                       find_new_features_of_contours,
@@ -237,10 +241,8 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order(
     if len(remained_sep_indexes)>1:
         #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)')
         #print(np.array(mother),'mother')
-        ##remained_sep_indexes_without_mother = remained_sep_indexes[mother==0]
-        ##remained_sep_indexes_with_child_without_mother = remained_sep_indexes[mother==0 & child==1]
-        remained_sep_indexes_without_mother=np.array(list(remained_sep_indexes))[np.array(mother)==0]
-        remained_sep_indexes_with_child_without_mother=np.array(list(remained_sep_indexes))[(np.array(mother)==0) & (np.array(child)==1)]
+        remained_sep_indexes_without_mother = remained_sep_indexes[mother==0]
+        remained_sep_indexes_with_child_without_mother = remained_sep_indexes[(mother==0) & (child==1)]
         #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother')
         #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother')
@@ -980,7 +982,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light(
             (regions_model_full[:,:,0]==2)).sum()
         pixels_main = all_pixels - pixels_header
-        if (pixels_header>=pixels_main) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ):
+        if (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ):
             regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2
             contours_only_text_parent_head.append(con)
             if contours_only_text_parent_d_ordered is not None:
diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py
index be00db0..1adb943 100644
--- a/src/eynollah/utils/contour.py
+++ b/src/eynollah/utils/contour.py
@@ -247,7 +247,7 @@ def get_textregion_contours_in_org_image_light(cnts, img, slope_first, map=map):
     img = cv2.resize(img, (int(img.shape[1]/6), int(img.shape[0]/6)), interpolation=cv2.INTER_NEAREST)
     ##cnts = list( (np.array(cnts)/2).astype(np.int16) )
     #cnts = cnts/2
-    cnts = [(i/6).astype(np.int) for i in cnts]
+    cnts = [(i/6).astype(int) for i in cnts]
     results = map(partial(do_back_rotation_and_get_cnt_back,
                           img=img,
                           slope_first=slope_first,
diff --git a/src/eynollah/utils/pil_cv2.py b/src/eynollah/utils/pil_cv2.py
index 83ae47d..9f6913e 100644
--- a/src/eynollah/utils/pil_cv2.py
+++ b/src/eynollah/utils/pil_cv2.py
@@ -1,3 +1,4 @@
+from contextlib import nullcontext
 from PIL import Image
 import numpy as np
 from ocrd_models import OcrdExif
@@ -17,12 +18,13 @@ def pil2cv(img):
 def check_dpi(img):
     try:
         if isinstance(img, Image.Image):
-            pil_image = img
+            pil_image = nullcontext(img)
         elif isinstance(img, str):
             pil_image = Image.open(img)
         else:
-            pil_image = cv2pil(img)
-        exif = OcrdExif(pil_image)
+            pil_image = nullcontext(cv2pil(img))
+        with pil_image as pil_img:
+            exif = OcrdExif(pil_img)
         resolution = exif.resolution
         if resolution == 1:
             raise Exception()
diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py
index 7e77afe..3499c29 100644
--- a/src/eynollah/utils/separate_lines.py
+++ b/src/eynollah/utils/separate_lines.py
@@ -1616,7 +1616,7 @@ def do_work_of_slopes_new(
             textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, hierarchy,
                                                              max_area=1, min_area=0.00008)
-            y_diff_mean = find_contours_mean_y_diff(textline_con_fil)
+            y_diff_mean = find_contours_mean_y_diff(textline_con_fil) if len(textline_con_fil) > 1 else np.nan
             if np.isnan(y_diff_mean):
                 slope_for_all = MAX_SLOPE
             else:
@@ -1681,7 +1681,7 @@ def do_work_of_slopes_new_curved(
             textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, hierarchy,
                                                              max_area=1, min_area=0.0008)
-            y_diff_mean = find_contours_mean_y_diff(textline_con_fil)
+            y_diff_mean = find_contours_mean_y_diff(textline_con_fil) if len(textline_con_fil) > 1 else np.nan
             if np.isnan(y_diff_mean):
                 slope_for_all = MAX_SLOPE
             else:
diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py
index 66747b1..7bcd9af 100644
--- a/src/eynollah/writer.py
+++ b/src/eynollah/writer.py
@@ -28,7 +28,7 @@ class EynollahXmlWriter():
         self.counter = EynollahIdCounter()
         self.dir_out = dir_out
         self.image_filename = image_filename
-        self.output_filename = os.path.join(self.dir_out, self.image_filename_stem) + ".xml"
+        self.output_filename = os.path.join(self.dir_out or "", self.image_filename_stem) + ".xml"
         self.curved_line = curved_line
         self.textline_light = textline_light
         self.pcgts = pcgts
diff --git a/tests/resources/euler_rechenkunst01_1738_0025.tif b/tests/resources/euler_rechenkunst01_1738_0025.tif
new file mode 100644
index 0000000..db6bae1
Binary files /dev/null and b/tests/resources/euler_rechenkunst01_1738_0025.tif differ
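
Reviewer note on the processor.py rewrite: it ports eynollah from the OCR-D v2 API (one `process()` loop doing its own METS download, PAGE serialization and `workspace.add_file` bookkeeping) to the v3 API, where the core calls the processor once per page and handles all I/O. Below is a minimal sketch of that contract, for orientation only; the `DummySegmenter` class and its executable name are hypothetical, and it assumes ocrd v3 exports `OcrdPageResult` (and, as used above, `OcrdPageResultImage`) from the top-level package:

```python
from typing import Optional

from ocrd import Processor, OcrdPageResult
from ocrd_models import OcrdPage


class DummySegmenter(Processor):
    """Illustrative OCR-D v3 processor skeleton (not part of this patch)."""

    @property
    def executable(self):
        # used by the core to find the matching tool section in ocrd-tool.json
        return 'ocrd-dummy-segment'

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage],
                           page_id: Optional[str] = None) -> OcrdPageResult:
        # called once per page; input_pcgts holds one PAGE document per input fileGrp
        pcgts = input_pcgts[0]
        page = pcgts.get_Page()
        # fetch the page image while filtering out coordinate-changing features
        page_image, _, _ = self.workspace.image_from_page(
            page, page_id, feature_filter='cropped,deskewed,binarized')
        # ... segment page_image and add regions/lines to `page` here ...
        # returning the (modified) document is all that is needed: file IDs,
        # serialization and METS registration happen in the core
        return OcrdPageResult(pcgts)
```

This is why the diff can drop `make_file_id`, `to_xml` and `workspace.add_file`, and why derived images only need to be appended to `result.images` to get saved and referenced. It also explains the new `setup()`/`shutdown()` pair: models are loaded once per worker rather than once per page.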