diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index b27586c..042e508 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -27,7 +27,12 @@ jobs: - uses: actions/cache@v4 id: seg_model_cache with: - path: models_eynollah + path: models_layout_v0_5_0 + key: ${{ runner.os }}-models + - uses: actions/cache@v4 + id: ocr_model_cache + with: + path: models_ocr_v0_5_0 key: ${{ runner.os }}-models - uses: actions/cache@v4 id: bin_model_cache @@ -35,7 +40,7 @@ jobs: path: default-2021-03-09 key: ${{ runner.os }}-modelbin - name: Download models - if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true' + if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true' || steps.ocr_model_cache.outputs.cache-hit != true run: make models - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 diff --git a/.gitignore b/.gitignore index 5236dde..0d5d834 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ models_eynollah* output.html /build /dist +*.tif diff --git a/CHANGELOG.md b/CHANGELOG.md index ad86fe5..a05919e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,13 @@ Versioned according to [Semantic Versioning](http://semver.org/). Fixed: - * restoring the contour in the original image caused an error due to an empty tuple + * restoring the contour in the original image caused an error due to an empty tuple, #154 + +Added: + + * `eynollah machine-based-reading-order` CLI to run reading order detection, #175 + * `eynollah enhancement` CLI to run image enhancement, #175 + * Improved models for page extraction and reading order detection, #175 ## [0.4.0] - 2025-04-07 diff --git a/Makefile b/Makefile index 73d4d34..a920615 100644 --- a/Makefile +++ b/Makefile @@ -9,12 +9,15 @@ DOCKER ?= docker #SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz #SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz -SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz +# SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz +SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1 BIN_MODEL := https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip +OCR_MODEL := https://zenodo.org/records/17194824/files/models_ocr_v0_5_0.tar.gz?download=1 + PYTEST_ARGS ?= -vv # BEGIN-EVAL makefile-parser --make-help Makefile @@ -28,7 +31,7 @@ help: @echo " install Install package with pip" @echo " install-dev Install editable with pip" @echo " deps-test Install test dependencies with pip" - @echo " models Download and extract models to $(CURDIR)/models_eynollah" + @echo " models Download and extract models to $(CURDIR)/models_layout_v0_5_0" @echo " smoke-test Run simple CLI check" @echo " ocrd-test Run OCR-D CLI check" @echo " test Run unit tests" @@ -44,14 +47,20 @@ help: # END-EVAL -# Download and extract models to $(PWD)/models_eynollah -models: models_eynollah default-2021-03-09 +# Download and extract models to $(PWD)/models_layout_v0_5_0 +models: models_layout_v0_5_0 models_ocr_v0_5_0 default-2021-03-09 -models_eynollah: models_eynollah.tar.gz - tar zxf models_eynollah.tar.gz +models_layout_v0_5_0: models_layout_v0_5_0.tar.gz + tar zxf models_layout_v0_5_0.tar.gz -models_eynollah.tar.gz: - wget $(SEG_MODEL) +models_layout_v0_5_0.tar.gz: + wget -O $@ $(SEG_MODEL) + +models_ocr_v0_5_0: models_ocr_v0_5_0.tar.gz + tar zxf models_ocr_v0_5_0.tar.gz + +models_ocr_v0_5_0.tar.gz: + wget -O $@ $(OCR_MODEL) default-2021-03-09: $(notdir $(BIN_MODEL)) unzip $(notdir $(BIN_MODEL)) @@ -73,20 +82,28 @@ install: install-dev: $(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)]) -deps-test: models_eynollah +deps-test: models_layout_v0_5_0 $(PIP) install -r requirements-test.txt smoke-test: TMPDIR != mktemp -d smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif # layout analysis: - eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/models_eynollah + eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/models_layout_v0_5_0 fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $( Document Layout Analysis with Deep Learning and Heuristics + +> Document Layout Analysis, Binarization and OCR with Deep Learning and Heuristics [![PyPI Version](https://img.shields.io/pypi/v/eynollah)](https://pypi.org/project/eynollah/) [![GH Actions Test](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml/badge.svg)](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml) @@ -19,9 +20,11 @@ * Output in [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML) * [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface -:warning: Development is currently focused on achieving the best possible quality of results for a wide variety of historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome. +:warning: Development is currently focused on achieving the best possible quality of results for a wide variety of +historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome. ## Installation + Python `3.8-3.11` with Tensorflow `<2.13` on Linux are currently supported. For (limited) GPU support the CUDA toolkit needs to be installed. @@ -41,19 +44,40 @@ cd eynollah; pip install -e . Alternatively, you can run `make install` or `make install-dev` for editable installation. +To also install the dependencies for the OCR engines: + +``` +pip install "eynollah[OCR]" +# or +make install EXTRAS=OCR +``` + ## Models -Pre-trained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/) or [huggingface](https://huggingface.co/SBB?search_models=eynollah). +Pretrained models can be downloaded from [zenodo](https://zenodo.org/records/17194824) or [huggingface](https://huggingface.co/SBB?search_models=eynollah). For documentation on methods and models, have a look at [`models.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/models.md). ## Train + In case you want to train your own model with Eynollah, have a look at [`train.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/train.md). ## Usage -The command-line interface can be called like this: + +Eynollah supports five use cases: layout analysis (segmentation), binarization, +image enhancement, text recognition (OCR), and (trainable) reading order detection. + +### Layout Analysis + +The layout analysis module is responsible for detecting layouts, identifying text lines, and determining reading order +using both heuristic methods or a machine-based reading order detection model. + +Note that there are currently two supported ways for reading order detection: either as part of layout analysis based +on image input, or, currently under development, for given layout analysis results based on PAGE-XML data as input. + +The command-line interface for layout analysis can be called like this: ```sh -eynollah \ +eynollah layout \ -i | -di \ -o \ -m \ @@ -66,6 +90,7 @@ The following options can be used to further configure the processing: |-------------------|:-------------------------------------------------------------------------------| | `-fl` | full layout analysis including all steps and segmentation classes | | `-light` | lighter and faster but simpler method for main region detection and deskewing | +| `-tll` | this indicates the light textline and should be passed with light version | | `-tab` | apply table detection | | `-ae` | apply enhancement (the resulting image is saved to the output directory) | | `-as` | apply scaling | @@ -80,9 +105,51 @@ The following options can be used to further configure the processing: | `-sp ` | save cropped page image to this directory | | `-sa ` | save all (plot, enhanced/binary image, layout) to this directory | -If no option is set, the tool performs layout detection of main regions (background, text, images, separators and marginals). +If no option is set, the tool performs layout detection of main regions (background, text, images, separators +and marginals). The best output quality is produced when RGB images are used as input rather than greyscale or binarized images. +### Binarization + +The binarization module performs document image binarization using pretrained pixelwise segmentation models. + +The command-line interface for binarization of single image can be called like this: + +```sh +eynollah binarization \ + -i | -di \ + -o \ + -m \ +``` + +### OCR + +The OCR module performs text recognition from images using two main families of pretrained models: CNN-RNN–based OCR and Transformer-based OCR. + +The command-line interface for ocr can be called like this: + +```sh +eynollah ocr \ + -i | -di \ + -dx \ + -o \ + -m | --model_name \ +``` + +### Machine-based-reading-order + +The machine-based reading-order module employs a pretrained model to identify the reading order from layouts represented in PAGE-XML files. + +The command-line interface for machine based reading order can be called like this: + +```sh +eynollah machine-based-reading-order \ + -i | -di \ + -xml | -dx \ + -m \ + -o +``` + #### Use as OCR-D processor Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) [processor](https://ocr-d.de/en/spec/cli), @@ -90,8 +157,7 @@ formally described in [`ocrd-tool.json`](https://github.com/qurator-spk/eynollah In this case, the source image file group with (preferably) RGB images should be used as input like this: - ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models 2022-04-05 - + ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models eynollah_layout_v0_5_0 If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynollah behaves as follows: - existing regions are kept and ignored (i.e. in effect they might overlap segments from Eynollah results) @@ -103,15 +169,20 @@ If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynol (because some other preprocessing step was in effect like `denoised`), then the output PAGE-XML will be based on that as new top-level (`@imageFilename`) - - ocrd-eynollah-segment -I OCR-D-XYZ -O OCR-D-SEG -P models 2022-04-05 + ocrd-eynollah-segment -I OCR-D-XYZ -O OCR-D-SEG -P models eynollah_layout_v0_5_0 Still, in general, it makes more sense to add other workflow steps **after** Eynollah. +There is also an OCR-D processor for the binarization: + + ocrd-sbb-binarize -I OCR-D-IMG -O OCR-D-BIN -P models default-2021-03-09 + #### Additional documentation + Please check the [wiki](https://github.com/qurator-spk/eynollah/wiki). ## How to cite + If you find this tool useful in your work, please consider citing our paper: ```bibtex diff --git a/docs/models.md b/docs/models.md index ac563b0..3d296d5 100644 --- a/docs/models.md +++ b/docs/models.md @@ -1,5 +1,6 @@ # Models documentation -This suite of 14 models presents a document layout analysis (DLA) system for historical documents implemented by + +This suite of 15 models presents a document layout analysis (DLA) system for historical documents implemented by pixel-wise segmentation using a combination of a ResNet50 encoder with various U-Net decoders. In addition, heuristic methods are applied to detect marginals and to determine the reading order of text regions. @@ -23,6 +24,7 @@ See the flowchart below for the different stages and how they interact: ## Models ### Image enhancement + Model card: [Image Enhancement](https://huggingface.co/SBB/eynollah-enhancement) This model addresses image resolution, specifically targeting documents with suboptimal resolution. In instances where @@ -30,12 +32,14 @@ the detection of document layout exhibits inadequate performance, the proposed e the quality and clarity of the images, thus facilitating enhanced visual interpretation and analysis. ### Page extraction / border detection + Model card: [Page Extraction/Border Detection](https://huggingface.co/SBB/eynollah-page-extraction) A problem that can negatively affect OCR are black margins around a page caused by document scanning. A deep learning model helps to crop to the page borders by using a pixel-wise segmentation method. ### Column classification + Model card: [Column Classification](https://huggingface.co/SBB/eynollah-column-classifier) This model is a trained classifier that recognizes the number of columns in a document by use of a training set with @@ -43,6 +47,7 @@ manual classification of all documents into six classes with either one, two, th respectively. ### Binarization + Model card: [Binarization](https://huggingface.co/SBB/eynollah-binarization) This model is designed to tackle the intricate task of document image binarization, which involves segmentation of the @@ -52,6 +57,7 @@ capability of the model enables improved accuracy and reliability in subsequent enhanced document understanding and interpretation. ### Main region detection + Model card: [Main Region Detection](https://huggingface.co/SBB/eynollah-main-regions) This model has employed a different set of labels, including an artificial class specifically designed to encompass the @@ -61,6 +67,7 @@ during the inference phase. By incorporating this methodology, improved efficien model's ability to accurately identify and classify text regions within documents. ### Main region detection (with scaling augmentation) + Model card: [Main Region Detection (with scaling augmentation)](https://huggingface.co/SBB/eynollah-main-regions-aug-scaling) Utilizing scaling augmentation, this model leverages the capability to effectively segment elements of extremely high or @@ -69,12 +76,14 @@ categorizing and isolating such elements, thereby enhancing its overall performa documents with varying scale characteristics. ### Main region detection (with rotation augmentation) + Model card: [Main Region Detection (with rotation augmentation)](https://huggingface.co/SBB/eynollah-main-regions-aug-rotation) This model takes advantage of rotation augmentation. This helps the tool to segment the vertical text regions in a robust way. ### Main region detection (ensembled) + Model card: [Main Region Detection (ensembled)](https://huggingface.co/SBB/eynollah-main-regions-ensembled) The robustness of this model is attained through an ensembling technique that combines the weights from various epochs. @@ -82,16 +91,19 @@ By employing this approach, the model achieves a high level of resilience and st strengths of multiple epochs to enhance its overall performance and deliver consistent and reliable results. ### Full region detection (1,2-column documents) + Model card: [Full Region Detection (1,2-column documents)](https://huggingface.co/SBB/eynollah-full-regions-1column) This model deals with documents comprising of one and two columns. ### Full region detection (3,n-column documents) + Model card: [Full Region Detection (3,n-column documents)](https://huggingface.co/SBB/eynollah-full-regions-3pluscolumn) This model is responsible for detecting headers and drop capitals in documents with three or more columns. ### Textline detection + Model card: [Textline Detection](https://huggingface.co/SBB/eynollah-textline) The method for textline detection combines deep learning and heuristics. In the deep learning part, an image-to-image @@ -106,6 +118,7 @@ segmentation is first deskewed and then the textlines are separated with the sam textline bounding boxes. Later, the strap is rotated back into its original orientation. ### Textline detection (light) + Model card: [Textline Detection Light (simpler but faster method)](https://huggingface.co/SBB/eynollah-textline_light) The method for textline detection combines deep learning and heuristics. In the deep learning part, an image-to-image @@ -119,6 +132,7 @@ enhancing the model's ability to accurately identify and delineate individual te eliminates the need for additional heuristics in extracting textline contours. ### Table detection + Model card: [Table Detection](https://huggingface.co/SBB/eynollah-tables) The objective of this model is to perform table segmentation in historical document images. Due to the pixel-wise @@ -128,17 +142,21 @@ effectively identify and delineate tables within the historical document images, enabling subsequent analysis and interpretation. ### Image detection + Model card: [Image Detection](https://huggingface.co/SBB/eynollah-image-extraction) This model is used for the task of illustration detection only. ### Reading order detection + Model card: [Reading Order Detection]() TODO ## Heuristic methods + Additionally, some heuristic methods are employed to further improve the model predictions: + * After border detection, the largest contour is determined by a bounding box, and the image cropped to these coordinates. * For text region detection, the image is scaled up to make it easier for the model to detect background space between text regions. * A minimum area is defined for text regions in relation to the overall image dimensions, so that very small regions that are noise can be filtered out. diff --git a/docs/train.md b/docs/train.md index 9f44a63..47ad67b 100644 --- a/docs/train.md +++ b/docs/train.md @@ -1,4 +1,5 @@ # Training documentation + This aims to assist users in preparing training datasets, training models, and performing inference with trained models. We cover various use cases including pixel-wise segmentation, image classification, image enhancement, and machine-based reading order detection. For each use case, we provide guidance on how to generate the corresponding training dataset. @@ -11,6 +12,7 @@ The following three tasks can all be accomplished using the code in the * inference with the trained model ## Generate training dataset + The script `generate_gt_for_training.py` is used for generating training datasets. As the results of the following command demonstrates, the dataset generator provides three different commands: @@ -23,14 +25,19 @@ These three commands are: * pagexml2label ### image-enhancement + Generating a training dataset for image enhancement is quite straightforward. All that is needed is a set of high-resolution images. The training dataset can then be generated using the following command: -`python generate_gt_for_training.py image-enhancement -dis "dir of high resolution images" -dois "dir where degraded -images will be written" -dols "dir where the corresponding high resolution image will be written as label" -scs -"degrading scales json file"` +```sh +python generate_gt_for_training.py image-enhancement \ + -dis "dir of high resolution images" \ + -dois "dir where degraded images will be written" \ + -dols "dir where the corresponding high resolution image will be written as label" \ + -scs "degrading scales json file" +``` -The scales JSON file is a dictionary with a key named 'scales' and values representing scales smaller than 1. Images are +The scales JSON file is a dictionary with a key named `scales` and values representing scales smaller than 1. Images are downscaled based on these scales and then upscaled again to their original size. This process causes the images to lose resolution at different scales. The degraded images are used as input images, and the original high-resolution images serve as labels. The enhancement model can be trained with this generated dataset. The scales JSON file looks like this: @@ -42,6 +49,7 @@ serve as labels. The enhancement model can be trained with this generated datase ``` ### machine-based-reading-order + For machine-based reading order, we aim to determine the reading priority between two sets of text regions. The model's input is a three-channel image: the first and last channels contain information about each of the two text regions, while the middle channel encodes prominent layout elements necessary for reading order, such as separators and headers. @@ -52,10 +60,18 @@ For output images, it is necessary to specify the width and height. Additionally to filter out regions smaller than this minimum size. This minimum size is defined as the ratio of the text region area to the image area, with a default value of zero. To run the dataset generator, use the following command: -`python generate_gt_for_training.py machine-based-reading-order -dx "dir of GT xml files" -domi "dir where output images -will be written" -docl "dir where the labels will be written" -ih "height" -iw "width" -min "min area ratio"` +```shell +python generate_gt_for_training.py machine-based-reading-order \ + -dx "dir of GT xml files" \ + -domi "dir where output images will be written" \ + -docl "dir where the labels will be written" \ + -ih "height" \ + -iw "width" \ + -min "min area ratio" +``` ### pagexml2label + pagexml2label is designed to generate labels from GT page XML files for various pixel-wise segmentation use cases, including 'layout,' 'textline,' 'printspace,' 'glyph,' and 'word' segmentation. To train a pixel-wise segmentation model, we require images along with their corresponding labels. Our training script @@ -119,9 +135,13 @@ graphic region, "stamp" has its own class, while all other types are classified region" are also present in the label. However, other regions like "noise region" and "table region" will not be included in the label PNG file, even if they have information in the page XML files, as we chose not to include them. -`python generate_gt_for_training.py pagexml2label -dx "dir of GT xml files" -do "dir where output label png files will -be written" -cfg "custom config json file" -to "output type which has 2d and 3d. 2d is used for training and 3d is just -to visualise the labels" "` +```sh +python generate_gt_for_training.py pagexml2label \ + -dx "dir of GT xml files" \ + -do "dir where output label png files will be written" \ + -cfg "custom config json file" \ + -to "output type which has 2d and 3d. 2d is used for training and 3d is just to visualise the labels" +``` We have also defined an artificial class that can be added to the boundary of text region types or text lines. This key is called "artificial_class_on_boundary." If users want to apply this to certain text regions in the layout use case, @@ -169,12 +189,19 @@ in this scenario, since cropping will be applied to the label files, the directo provided to ensure that they are cropped in sync with the labels. This ensures that the correct images and labels required for training are obtained. The command should resemble the following: -`python generate_gt_for_training.py pagexml2label -dx "dir of GT xml files" -do "dir where output label png files will -be written" -cfg "custom config json file" -to "output type which has 2d and 3d. 2d is used for training and 3d is just -to visualise the labels" -ps -di "dir where the org images are located" -doi "dir where the cropped output images will -be written" ` +```sh +python generate_gt_for_training.py pagexml2label \ + -dx "dir of GT xml files" \ + -do "dir where output label png files will be written" \ + -cfg "custom config json file" \ + -to "output type which has 2d and 3d. 2d is used for training and 3d is just to visualise the labels" \ + -ps \ + -di "dir where the org images are located" \ + -doi "dir where the cropped output images will be written" +``` ## Train a model + ### classification For the classification use case, we haven't provided a ground truth generator, as it's unnecessary. For classification, @@ -225,7 +252,9 @@ And the "dir_eval" the same structure as train directory: The classification model can be trained using the following command line: -`python train.py with config_classification.json` +```sh +python train.py with config_classification.json +``` As evident in the example JSON file above, for classification, we utilize a "f1_threshold_classification" parameter. This parameter is employed to gather all models with an evaluation f1 score surpassing this threshold. Subsequently, @@ -276,6 +305,7 @@ The classification model can be trained like the classification case command lin ### Segmentation (Textline, Binarization, Page extraction and layout) and enhancement #### Parameter configuration for segmentation or enhancement usecases + The following parameter configuration can be applied to all segmentation use cases and enhancements. The augmentation, its sub-parameters, and continued training are defined only for segmentation use cases and enhancements, not for classification and machine-based reading order, as you can see in their example config files. @@ -355,6 +385,7 @@ command, similar to the process for classification and reading order: `python train.py with config_classification.json` #### Binarization + An example config json file for binarization can be like this: ```yaml @@ -550,6 +581,7 @@ For page segmentation (or printspace or border segmentation), the model needs to hence the patches parameter should be set to false. #### layout segmentation + An example config json file for layout segmentation with 5 classes (including background) can be like this: ```yaml @@ -605,26 +637,41 @@ An example config json file for layout segmentation with 5 classes (including ba ## Inference with the trained model ### classification + For conducting inference with a trained model, you simply need to execute the following command line, specifying the directory of the model and the image on which to perform inference: -`python inference.py -m "model dir" -i "image" ` +```sh +python inference.py -m "model dir" -i "image" +``` This will straightforwardly return the class of the image. ### machine based reading order + To infer the reading order using a reading order model, we need a page XML file containing layout information but without the reading order. We simply need to provide the model directory, the XML file, and the output directory. The new XML file with the added reading order will be written to the output directory with the same name. We need to run: -`python inference.py -m "model dir" -xml "page xml file" -o "output dir to write new xml with reading order" ` +```sh +python inference.py \ + -m "model dir" \ + -xml "page xml file" \ + -o "output dir to write new xml with reading order" +``` ### Segmentation (Textline, Binarization, Page extraction and layout) and enhancement For conducting inference with a trained model for segmentation and enhancement you need to run the following command line: -`python inference.py -m "model dir" -i "image" -p -s "output image" ` +```sh +python inference.py \ + -m "model dir" \ + -i "image" \ + -p \ + -s "output image" +``` Note that in the case of page extraction the -p flag is not needed. diff --git a/pyproject.toml b/pyproject.toml index 4da39ef..8a63543 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ optional-dependencies.test = {file = ["requirements-test.txt"]} where = ["src"] [tool.setuptools.package-data] -"*" = ["*.json", '*.yml', '*.xml', '*.xsd'] +"*" = ["*.json", '*.yml', '*.xml', '*.xsd', '*.ttf'] [tool.coverage.run] branch = true diff --git a/requirements.txt b/requirements.txt index 9ed0584..4bc0c6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,6 @@ numpy <1.24.0 scikit-learn >= 0.23.2 tensorflow < 2.13 numba <= 0.58.1 +scikit-image loky +biopython diff --git a/src/eynollah/Charis-Regular.ttf b/src/eynollah/Charis-Regular.ttf new file mode 100644 index 0000000..a4e75a4 Binary files /dev/null and b/src/eynollah/Charis-Regular.ttf differ diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index c189aca..93bb676 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -1,8 +1,11 @@ import sys import click +import logging from ocrd_utils import initLogging, getLevelName, getLogger from eynollah.eynollah import Eynollah, Eynollah_ocr from eynollah.sbb_binarize import SbbBinarizer +from eynollah.image_enhancer import Enhancer +from eynollah.mb_ro_on_layout import machine_based_reading_order_on_layout @click.group() def main(): @@ -10,79 +13,98 @@ def main(): @main.command() @click.option( - "--dir_xml", - "-dx", - help="directory of GT page-xml files", + "--input", + "-i", + help="PAGE-XML input filename", + type=click.Path(exists=True, dir_okay=False), +) +@click.option( + "--dir_in", + "-di", + help="directory of PAGE-XML input files (instead of --input)", type=click.Path(exists=True, file_okay=False), ) @click.option( - "--dir_out_modal_image", - "-domi", - help="directory where ground truth images would be written", + "--out", + "-o", + help="directory for output images", type=click.Path(exists=True, file_okay=False), + required=True, ) @click.option( - "--dir_out_classes", - "-docl", - help="directory where ground truth classes would be written", + "--model", + "-m", + help="directory of models", type=click.Path(exists=True, file_okay=False), + required=True, ) @click.option( - "--input_height", - "-ih", - help="input height", + "--log_level", + "-l", + type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), + help="Override log level globally to this", ) -@click.option( - "--input_width", - "-iw", - help="input width", -) -@click.option( - "--min_area_size", - "-min", - help="min area size of regions considered for reading order training.", -) -def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width, min_area_size): - xml_files_ind = os.listdir(dir_xml) + +def machine_based_reading_order(input, dir_in, out, model, log_level): + assert bool(input) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." + orderer = machine_based_reading_order_on_layout(model) + if log_level: + orderer.logger.setLevel(getLevelName(log_level)) + + orderer.run(xml_filename=input, + dir_in=dir_in, + dir_out=out, + ) + @main.command() @click.option('--patches/--no-patches', default=True, help='by enabling this parameter you let the model to see the image in patches.') @click.option('--model_dir', '-m', type=click.Path(exists=True, file_okay=False), required=True, help='directory containing models for prediction') -@click.argument('input_image', required=False) -@click.argument('output_image', required=False) +@click.option( + "--input-image", "--image", + "-i", + help="input image filename", + type=click.Path(exists=True, dir_okay=False) +) @click.option( "--dir_in", "-di", - help="directory of input images", + help="directory of input images (instead of --image)", type=click.Path(exists=True, file_okay=False), ) @click.option( - "--dir_out", - "-do", - help="directory for output images", - type=click.Path(exists=True, file_okay=False), + "--output", + "-o", + help="output image (if using -i) or output image directory (if using -di)", + type=click.Path(file_okay=True, dir_okay=True), + required=True, ) -def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out): - assert (dir_out is None) == (dir_in is None), "Options -di and -do are mutually dependent" - assert (input_image is None) == (output_image is None), "INPUT_IMAGE and OUTPUT_IMAGE are mutually dependent" - assert (dir_in is None) != (input_image is None), "Specify either -di and -do options, or INPUT_IMAGE and OUTPUT_IMAGE" - SbbBinarizer(model_dir).run(image_path=input_image, use_patches=patches, save=output_image, dir_in=dir_in, dir_out=dir_out) - - +@click.option( + "--log_level", + "-l", + type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), + help="Override log level globally to this", +) +def binarization(patches, model_dir, input_image, dir_in, output, log_level): + assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." + binarizer = SbbBinarizer(model_dir) + if log_level: + binarizer.log.setLevel(getLevelName(log_level)) + binarizer.run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in) @main.command() @click.option( "--image", "-i", - help="image filename", + help="input image filename", type=click.Path(exists=True, dir_okay=False), ) @click.option( "--out", "-o", - help="directory to write output xml data", + help="directory for output PAGE-XML files", type=click.Path(exists=True, file_okay=False), required=True, ) @@ -95,7 +117,82 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) @click.option( "--dir_in", "-di", - help="directory of images", + help="directory of input images (instead of --image)", + type=click.Path(exists=True, file_okay=False), +) +@click.option( + "--model", + "-m", + help="directory of models", + type=click.Path(exists=True, file_okay=False), + required=True, +) + +@click.option( + "--num_col_upper", + "-ncu", + help="lower limit of columns in document image", +) +@click.option( + "--num_col_lower", + "-ncl", + help="upper limit of columns in document image", +) +@click.option( + "--save_org_scale/--no_save_org_scale", + "-sos/-nosos", + is_flag=True, + help="if this parameter set to true, this tool will save the enhanced image in org scale.", +) +@click.option( + "--log_level", + "-l", + type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), + help="Override log level globally to this", +) + +def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_lower, save_org_scale, log_level): + assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." + initLogging() + enhancer = Enhancer( + model, + num_col_upper=num_col_upper, + num_col_lower=num_col_lower, + save_org_scale=save_org_scale, + ) + if log_level: + enhancer.logger.setLevel(getLevelName(log_level)) + enhancer.run(overwrite=overwrite, + dir_in=dir_in, + image_filename=image, + dir_out=out, + ) + +@main.command() +@click.option( + "--image", + "-i", + help="input image filename", + type=click.Path(exists=True, dir_okay=False), +) + +@click.option( + "--out", + "-o", + help="directory for output PAGE-XML files", + type=click.Path(exists=True, file_okay=False), + required=True, +) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) +@click.option( + "--dir_in", + "-di", + help="directory of input images (instead of --image)", type=click.Path(exists=True, file_okay=False), ) @click.option( @@ -225,6 +322,17 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) is_flag=True, help="if this parameter set to true, this tool will try to do ocr", ) +@click.option( + "--transformer_ocr", + "-tr/-notr", + is_flag=True, + help="if this parameter set to true, this tool will apply transformer ocr", +) +@click.option( + "--batch_size_ocr", + "-bs_ocr", + help="number of inference batch size of ocr model. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively", +) @click.option( "--num_col_upper", "-ncu", @@ -235,23 +343,46 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) "-ncl", help="upper limit of columns in document image", ) +@click.option( + "--threshold_art_class_layout", + "-tharl", + help="threshold of artifical class in the case of layout detection. The default value is 0.1", +) +@click.option( + "--threshold_art_class_textline", + "-thart", + help="threshold of artifical class in the case of textline detection. The default value is 0.1", +) @click.option( "--skip_layout_and_reading_order", "-slro/-noslro", is_flag=True, help="if this parameter set to true, this tool will ignore layout detection and reading order. It means that textline detection will be done within printspace and contours of textline will be written in xml output file.", ) +# TODO move to top-level CLI context @click.option( "--log_level", "-l", type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), - help="Override log level globally to this", + help="Override 'eynollah' log level globally to this", +) +# +@click.option( + "--setup-logging", + is_flag=True, + help="Setup a basic console logger", ) -def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, num_col_upper, num_col_lower, skip_layout_and_reading_order, ignore_page_extraction, log_level): - initLogging() - if log_level: - getLogger('eynollah').setLevel(getLevelName(log_level)) +def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, transformer_ocr, batch_size_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level, setup_logging): + if setup_logging: + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(logging.INFO) + formatter = logging.Formatter('%(message)s') + console_handler.setFormatter(formatter) + getLogger('eynollah').addHandler(console_handler) + getLogger('eynollah').setLevel(logging.INFO) + else: + initLogging() assert enable_plotting or not save_layout, "Plotting with -sl also requires -ep" assert enable_plotting or not save_deskewed, "Plotting with -sd also requires -ep" assert enable_plotting or not save_all, "Plotting with -sa also requires -ep" @@ -270,17 +401,10 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ assert not extract_only_images or not tables, "Image extraction -eoi can not be set alongside tables -tab" assert not extract_only_images or not right2left, "Image extraction -eoi can not be set alongside right2left -r2l" assert not extract_only_images or not headers_off, "Image extraction -eoi can not be set alongside headers_off -ho" - assert image or dir_in, "Either a single image -i or a dir_in -di is required" + assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." eynollah = Eynollah( model, - logger=getLogger('eynollah'), - dir_out=out, - dir_of_cropped_images=save_images, extract_only_images=extract_only_images, - dir_of_layout=save_layout, - dir_of_deskewed=save_deskewed, - dir_of_all=save_all, - dir_save_page=save_page, enable_plotting=enable_plotting, allow_enhancement=allow_enhancement, curved_line=curved_line, @@ -295,54 +419,82 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ ignore_page_extraction=ignore_page_extraction, reading_order_machine_based=reading_order_machine_based, do_ocr=do_ocr, + transformer_ocr=transformer_ocr, + batch_size_ocr=batch_size_ocr, num_col_upper=num_col_upper, num_col_lower=num_col_lower, skip_layout_and_reading_order=skip_layout_and_reading_order, + threshold_art_class_textline=threshold_art_class_textline, + threshold_art_class_layout=threshold_art_class_layout, + ) + if log_level: + eynollah.logger.setLevel(getLevelName(log_level)) + eynollah.run(overwrite=overwrite, + image_filename=image, + dir_in=dir_in, + dir_out=out, + dir_of_cropped_images=save_images, + dir_of_layout=save_layout, + dir_of_deskewed=save_deskewed, + dir_of_all=save_all, + dir_save_page=save_page, ) - if dir_in: - eynollah.run(dir_in=dir_in, overwrite=overwrite) - else: - eynollah.run(image_filename=image, overwrite=overwrite) - @main.command() +@click.option( + "--image", + "-i", + help="input image filename", + type=click.Path(exists=True, dir_okay=False), +) @click.option( "--dir_in", "-di", - help="directory of images", + help="directory of input images (instead of --image)", type=click.Path(exists=True, file_okay=False), ) @click.option( "--dir_in_bin", "-dib", - help="directory of binarized images. This should be given if you want to do prediction based on both rgb and bin images. And all bin images are png files", + help="directory of binarized images (in addition to --dir_in for RGB images; filename stems must match the RGB image files, with '.png' suffix).\nPerform prediction using both RGB and binary images. (This does not necessarily improve results, however it may be beneficial for certain document images.)", type=click.Path(exists=True, file_okay=False), ) -@click.option( - "--out", - "-o", - help="directory to write output xml data", - type=click.Path(exists=True, file_okay=False), - required=True, -) @click.option( "--dir_xmls", "-dx", - help="directory of xmls", + help="directory of input PAGE-XML files (in addition to --dir_in; filename stems must match the image files, with '.xml' suffix).", type=click.Path(exists=True, file_okay=False), + required=True, +) +@click.option( + "--out", + "-o", + help="directory for output PAGE-XML files", + type=click.Path(exists=True, file_okay=False), + required=True, ) @click.option( "--dir_out_image_text", "-doit", - help="directory of images with predicted text", + help="directory for output images, newly rendered with predicted text", type=click.Path(exists=True, file_okay=False), ) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) @click.option( "--model", "-m", help="directory of models", type=click.Path(exists=True, file_okay=False), - required=True, +) +@click.option( + "--model_name", + help="Specific model file path to use for OCR", + type=click.Path(exists=True, file_okay=False), ) @click.option( "--tr_ocr", @@ -363,16 +515,19 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="if this parameter set to true, cropped textline images will not be masked with textline contour.", ) @click.option( - "--draw_texts_on_image", - "-dtoi/-ndtoi", - is_flag=True, - help="if this parameter set to true, the predicted texts will be displayed on an image.", + "--batch_size", + "-bs", + help="number of inference batch size. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively", ) @click.option( - "--prediction_with_both_of_rgb_and_bin", - "-brb/-nbrb", - is_flag=True, - help="If this parameter is set to True, the prediction will be performed using both RGB and binary images. However, this does not necessarily improve results; it may be beneficial for certain document images.", + "--dataset_abbrevation", + "-ds_pref", + help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset", +) +@click.option( + "--min_conf_value_of_textline_text", + "-min_conf", + help="minimum OCR confidence value. Text lines with a confidence value lower than this threshold will not be included in the output XML file.", ) @click.option( "--log_level", @@ -381,24 +536,36 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="Override log level globally to this", ) -def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, log_level): +def ocr(image, dir_in, dir_in_bin, dir_xmls, out, dir_out_image_text, overwrite, model, model_name, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, batch_size, dataset_abbrevation, min_conf_value_of_textline_text, log_level): initLogging() - if log_level: - getLogger('eynollah').setLevel(getLevelName(log_level)) + + assert bool(model) != bool(model_name), "Either -m (model directory) or --model_name (specific model name) must be provided." + assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text -etit can not be set alongside transformer ocr -tr_ocr" + assert not export_textline_images_and_text or not model, "Exporting textline and text -etit can not be set alongside model -m" + assert not export_textline_images_and_text or not batch_size, "Exporting textline and text -etit can not be set alongside batch size -bs" + assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text -etit can not be set alongside directory of bin images -dib" + assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit" + assert bool(image) != bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both." eynollah_ocr = Eynollah_ocr( - dir_xmls=dir_xmls, - dir_out_image_text=dir_out_image_text, - dir_in=dir_in, - dir_in_bin=dir_in_bin, - dir_out=out, dir_models=model, + model_name=model_name, tr_ocr=tr_ocr, export_textline_images_and_text=export_textline_images_and_text, do_not_mask_with_textline_contour=do_not_mask_with_textline_contour, - draw_texts_on_image=draw_texts_on_image, - prediction_with_both_of_rgb_and_bin=prediction_with_both_of_rgb_and_bin, + batch_size=batch_size, + pref_of_dataset=dataset_abbrevation, + min_conf_value_of_textline_text=min_conf_value_of_textline_text, + ) + if log_level: + eynollah_ocr.logger.setLevel(getLevelName(log_level)) + eynollah_ocr.run(overwrite=overwrite, + dir_in=dir_in, + dir_in_bin=dir_in_bin, + image_filename=image, + dir_xmls=dir_xmls, + dir_out_image_text=dir_out_image_text, + dir_out=out, ) - eynollah_ocr.run() if __name__ == "__main__": main() diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index d47016b..20954a0 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -6,7 +6,13 @@ document layout analysis (segmentation) with output in PAGE-XML """ -from logging import Logger +# cannot use importlib.resources until we move to 3.9+ forimportlib.resources.files +import sys +if sys.version_info < (3, 10): + import importlib_resources +else: + import importlib.resources as importlib_resources + from difflib import SequenceMatcher as sq from PIL import Image, ImageDraw, ImageFont import math @@ -22,7 +28,6 @@ from multiprocessing import cpu_count import gc import copy import json - from loky import ProcessPoolExecutor import xml.etree.ElementTree as ET import cv2 @@ -30,9 +35,10 @@ import numpy as np from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d from numba import cuda - +from skimage.morphology import skeletonize from ocrd import OcrdPage from ocrd_utils import getLogger, tf_disable_interactive_logs +import statistics try: import torch @@ -77,12 +83,26 @@ from .utils.contour import ( from .utils.rotate import ( rotate_image, rotation_not_90_func, - rotation_not_90_func_full_layout + rotation_not_90_func_full_layout, + rotation_image_new +) +from .utils.utils_ocr import ( + return_textline_contour_with_added_box_coordinate, + preprocess_and_resize_image_for_ocrcnn_model, + return_textlines_split_if_needed, + decode_batch_predictions, + return_rnn_cnn_ocr_of_given_textlines, + fit_text_single_line, + break_curved_line_into_small_pieces_and_then_merge, + get_orientation_moments, + rotate_image_with_padding, + get_contours_and_bounding_boxes ) from .utils.separate_lines import ( textline_contours_postprocessing, separate_lines_new2, return_deskew_slop, + return_deskew_slop_old_mp, do_work_of_slopes_new, do_work_of_slopes_new_curved, do_work_of_slopes_new_light, @@ -94,6 +114,7 @@ from .utils.drop_capitals import ( from .utils.marginals import get_marginals from .utils.resize import resize_image from .utils import ( + is_image_filename, boosting_headers_by_longshot_region_segmentation, crop_image_inside_box, find_num_col, @@ -177,13 +198,7 @@ class Eynollah: def __init__( self, dir_models : str, - dir_out : Optional[str] = None, - dir_of_cropped_images : Optional[str] = None, extract_only_images : bool =False, - dir_of_layout : Optional[str] = None, - dir_of_deskewed : Optional[str] = None, - dir_of_all : Optional[str] = None, - dir_save_page : Optional[str] = None, enable_plotting : bool = False, allow_enhancement : bool = False, curved_line : bool = False, @@ -198,22 +213,21 @@ class Eynollah: ignore_page_extraction : bool = False, reading_order_machine_based : bool = False, do_ocr : bool = False, + transformer_ocr: bool = False, + batch_size_ocr: Optional[int] = None, num_col_upper : Optional[int] = None, num_col_lower : Optional[int] = None, + threshold_art_class_layout: Optional[float] = None, + threshold_art_class_textline: Optional[float] = None, skip_layout_and_reading_order : bool = False, - logger : Optional[Logger] = None, ): + self.logger = getLogger('eynollah') + self.plotter = None + if skip_layout_and_reading_order: textline_light = True self.light_version = light_version - self.dir_out = dir_out - self.dir_of_all = dir_of_all - self.dir_save_page = dir_save_page self.reading_order_machine_based = reading_order_machine_based - self.dir_of_deskewed = dir_of_deskewed - self.dir_of_deskewed = dir_of_deskewed - self.dir_of_cropped_images=dir_of_cropped_images - self.dir_of_layout=dir_of_layout self.enable_plotting = enable_plotting self.allow_enhancement = allow_enhancement self.curved_line = curved_line @@ -229,6 +243,7 @@ class Eynollah: self.ignore_page_extraction = ignore_page_extraction self.skip_layout_and_reading_order = skip_layout_and_reading_order self.ocr = do_ocr + self.tr = transformer_ocr if num_col_upper: self.num_col_upper = int(num_col_upper) else: @@ -237,10 +252,17 @@ class Eynollah: self.num_col_lower = int(num_col_lower) else: self.num_col_lower = num_col_lower - self.logger = logger if logger else getLogger('eynollah') - # for parallelization of CPU-intensive tasks: - self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) - atexit.register(self.executor.shutdown) + + if threshold_art_class_layout: + self.threshold_art_class_layout = float(threshold_art_class_layout) + else: + self.threshold_art_class_layout = 0.1 + + if threshold_art_class_textline: + self.threshold_art_class_textline = float(threshold_art_class_textline) + else: + self.threshold_art_class_textline = 0.1 + self.dir_models = dir_models self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" self.model_dir_of_binarization = dir_models + "/eynollah-binarization_20210425" @@ -255,11 +277,11 @@ class Eynollah: #"/eynollah-full-regions-1column_20210425" self.model_region_dir_fully_np = dir_models + "/modelens_full_lay_1__4_3_091124" #self.model_region_dir_fully = dir_models + "/eynollah-full-regions-3+column_20210425" - self.model_page_dir = dir_models + "/eynollah-page-extraction_20210425" + self.model_page_dir = dir_models + "/model_eynollah_page_extraction_20250915" self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425" self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314" self.model_region_dir_p_ens_light_only_images_extraction = dir_models + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" - self.model_reading_order_dir = dir_models + "/model_ens_reading_order_machine_based" + self.model_reading_order_dir = dir_models + "/model_eynollah_reading_order_20250824"#"/model_mb_ro_aug_ens_11"#"/model_step_3200000_mb_ro"#"/model_ens_reading_order_machine_based"#"/model_mb_ro_aug_ens_8"#"/model_ens_reading_order_machine_based" #"/modelens_12sp_elay_0_3_4__3_6_n" #"/modelens_earlylayout_12spaltige_2_3_5_6_7_8" #"/modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18" @@ -286,14 +308,23 @@ class Eynollah: else: #"/eynollah-textline_20210425" self.model_textline_dir = dir_models + "/modelens_textline_0_1__2_4_16092024" - if self.ocr: - self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" + if self.ocr and self.tr: + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_trocr_20250919" + elif self.ocr and not self.tr: + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250904" if self.tables: if self.light_version: self.model_table_dir = dir_models + "/modelens_table_0t4_201124" else: self.model_table_dir = dir_models + "/eynollah-tables_20210319" + + + t_start = time.time() + # for parallelization of CPU-intensive tasks: + self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) + atexit.register(self.executor.shutdown) + # #gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) # #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True) # #session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) @@ -307,7 +338,9 @@ class Eynollah: tf.config.experimental.set_memory_growth(device, True) except: self.logger.warning("no GPU device available") - + + self.logger.info("Loading models...") + self.model_page = self.our_load_model(self.model_page_dir) self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier) self.model_bin = self.our_load_model(self.model_dir_of_binarization) @@ -327,13 +360,41 @@ class Eynollah: self.model_region_fl = self.our_load_model(self.model_region_dir_fully) if self.reading_order_machine_based: self.model_reading_order = self.our_load_model(self.model_reading_order_dir) - if self.ocr: + if self.ocr and self.tr: self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #("microsoft/trocr-base-printed")#("microsoft/trocr-base-handwritten") self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") + elif self.ocr and not self.tr: + model_ocr = load_model(self.model_ocr_dir , compile=False) + + self.prediction_model = tf.keras.models.Model( + model_ocr.get_layer(name = "image").input, + model_ocr.get_layer(name = "dense2").output) + if not batch_size_ocr: + self.b_s_ocr = 8 + else: + self.b_s_ocr = int(batch_size_ocr) + + + with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: + characters = json.load(config_file) + + + AUTOTUNE = tf.data.AUTOTUNE + + # Mapping characters to integers. + char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + + # Mapping integers back to original characters. + self.num_to_char = StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True + ) + if self.tables: self.model_table = self.our_load_model(self.model_table_dir) + + self.logger.info(f"Model initialization complete ({time.time() - t_start:.1f}s)") def cache_images(self, image_filename=None, image_pil=None, dpi=None): ret = {} @@ -357,21 +418,11 @@ class Eynollah: if dpi is not None: self.dpi = dpi - def reset_file_name_dir(self, image_filename): + def reset_file_name_dir(self, image_filename, dir_out): t_c = time.time() self.cache_images(image_filename=image_filename) - - self.plotter = None if not self.enable_plotting else EynollahPlotter( - dir_out=self.dir_out, - dir_of_all=self.dir_of_all, - dir_save_page=self.dir_save_page, - dir_of_deskewed=self.dir_of_deskewed, - dir_of_cropped_images=self.dir_of_cropped_images, - dir_of_layout=self.dir_of_layout, - image_filename_stem=Path(Path(image_filename).name).stem) - self.writer = EynollahXmlWriter( - dir_out=self.dir_out, + dir_out=dir_out, image_filename=image_filename, curved_line=self.curved_line, textline_light = self.textline_light) @@ -667,6 +718,7 @@ class Eynollah: label_p_pred = self.model_classifier.predict(img_in, verbose=0) num_col = np.argmax(label_p_pred[0]) + 1 + elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower): if self.input_binary: img_in = np.copy(img) @@ -784,7 +836,7 @@ class Eynollah: self, patches, img, model, n_batch_inference=1, marginal_of_patch_percent=0.1, thresholding_for_some_classes_in_light_version=False, - thresholding_for_artificial_class_in_light_version=False): + thresholding_for_artificial_class_in_light_version=False, thresholding_for_fl_light_version=False, threshold_art_class_textline=0.1): self.logger.debug("enter do_prediction") img_height_model = model.layers[-1].output_shape[1] @@ -802,10 +854,22 @@ class Eynollah: if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[0,:,:,2] - seg_art[seg_art<0.2] = 0 + seg_art[seg_art0] =1 + + skeleton_art = skeletonize(seg_art) + skeleton_art = skeleton_art*1 - seg[seg_art==1]=2 + seg[skeleton_art==1]=2 + + if thresholding_for_fl_light_version: + seg_header = label_p_pred[0,:,:,2] + + seg_header[seg_header<0.2] = 0 + seg_header[seg_header>0] =1 + + seg[seg_header==1]=2 + seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) prediction_true = resize_image(seg_color, img_h_page, img_w_page).astype(np.uint8) return prediction_true @@ -896,14 +960,17 @@ class Eynollah: if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[:,:,:,2] - seg_art[seg_art<0.2] = 0 + seg_art[seg_art0] =1 - seg[seg_art==1]=2 + ##seg[seg_art==1]=2 indexer_inside_batch = 0 for i_batch, j_batch in zip(list_i_s, list_j_s): seg_in = seg[indexer_inside_batch] + + if thresholding_for_artificial_class_in_light_version: + seg_in_art = seg_art[indexer_inside_batch] index_y_u_in = list_y_u[indexer_inside_batch] index_y_d_in = list_y_d[indexer_inside_batch] @@ -917,54 +984,107 @@ class Eynollah: seg_in[0:-margin or None, 0:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - 0] = \ seg_in[margin:, margin:, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:, + margin:] + elif i_batch == 0 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + 0:index_x_u_in - margin] = \ seg_in[margin:, 0:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ seg_in[0:-margin or None, margin:, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[0:-margin or None, + margin:] + elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + 0:index_x_u_in - margin] = \ seg_in[margin:-margin or None, 0:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ seg_in[margin:-margin or None, margin:, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:-margin or None, + margin:] + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ seg_in[0:-margin or None, margin:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + margin:-margin or None] + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - margin] = \ seg_in[margin:, margin:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + margin:-margin or None] + else: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ seg_in[margin:-margin or None, margin:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + margin:-margin or None] indexer_inside_batch += 1 @@ -979,6 +1099,19 @@ class Eynollah: img_patch[:] = 0 prediction_true = prediction_true.astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + prediction_true[:,:,0][prediction_true[:,:,0]==2] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=2 #del model gc.collect() return prediction_true @@ -1117,7 +1250,7 @@ class Eynollah: self, patches, img, model, n_batch_inference=1, marginal_of_patch_percent=0.1, thresholding_for_some_classes_in_light_version=False, - thresholding_for_artificial_class_in_light_version=False): + thresholding_for_artificial_class_in_light_version=False, threshold_art_class_textline=0.1, threshold_art_class_layout=0.1): self.logger.debug("enter do_prediction_new_concept") img_height_model = model.layers[-1].output_shape[1] @@ -1132,19 +1265,28 @@ class Eynollah: label_p_pred = model.predict(img[np.newaxis], verbose=0) seg = np.argmax(label_p_pred, axis=3)[0] - if thresholding_for_artificial_class_in_light_version: - #seg_text = label_p_pred[0,:,:,1] - #seg_text[seg_text<0.2] =0 - #seg_text[seg_text>0] =1 - #seg[seg_text==1]=1 - - seg_art = label_p_pred[0,:,:,4] - seg_art[seg_art<0.2] =0 - seg_art[seg_art>0] =1 - seg[seg_art==1]=4 - seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) prediction_true = resize_image(seg_color, img_h_page, img_w_page).astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + seg_art = label_p_pred[0,:,:,4] + seg_art[seg_art0] =1 + #seg[seg_art==1]=4 + seg_art = resize_image(seg_art, img_h_page, img_w_page).astype(np.uint8) + + prediction_true[:,:,0][prediction_true[:,:,0]==4] = 0 + + skeleton_art = skeletonize(seg_art) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1] = 4 + return prediction_true , resize_image(label_p_pred[0, :, :, 1] , img_h_page, img_w_page) if img.shape[0] < img_height_model: @@ -1217,26 +1359,29 @@ class Eynollah: if thresholding_for_some_classes_in_light_version: seg_art = label_p_pred[:,:,:,4] - seg_art[seg_art<0.2] =0 + seg_art[seg_art0] =1 seg_line = label_p_pred[:,:,:,3] - seg_line[seg_line>0.1] =1 + seg_line[seg_line>0.4] =1#seg_line[seg_line>0.5] =1#seg_line[seg_line>0.1] =1 seg_line[seg_line<1] =0 - seg[seg_art==1]=4 - seg[(seg_line==1) & (seg==0)]=3 + ##seg[seg_art==1]=4 + #seg[(seg_line==1) & (seg==0)]=3 if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[:,:,:,2] - seg_art[seg_art<0.2] = 0 + seg_art[seg_art0] =1 - seg[seg_art==1]=2 + ##seg[seg_art==1]=2 indexer_inside_batch = 0 for i_batch, j_batch in zip(list_i_s, list_j_s): seg_in = seg[indexer_inside_batch] + + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + seg_in_art = seg_art[indexer_inside_batch] index_y_u_in = list_y_u[indexer_inside_batch] index_y_d_in = list_y_d[indexer_inside_batch] @@ -1255,6 +1400,12 @@ class Eynollah: label_p_pred[0, 0:-margin or None, 0:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - 0] = \ @@ -1266,6 +1417,12 @@ class Eynollah: label_p_pred[0, margin:, margin:, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:, + margin:] + elif i_batch == 0 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + 0:index_x_u_in - margin] = \ @@ -1277,6 +1434,13 @@ class Eynollah: label_p_pred[0, margin:, 0:-margin or None, 1] + + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ @@ -1288,6 +1452,12 @@ class Eynollah: label_p_pred[0, 0:-margin or None, margin:, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[0:-margin or None, + margin:] + elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + 0:index_x_u_in - margin] = \ @@ -1299,6 +1469,11 @@ class Eynollah: label_p_pred[0, margin:-margin or None, 0:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + 0:-margin or None] elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ @@ -1310,6 +1485,11 @@ class Eynollah: label_p_pred[0, margin:-margin or None, margin:, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:-margin or None, + margin:] elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ @@ -1321,6 +1501,11 @@ class Eynollah: label_p_pred[0, 0:-margin or None, margin:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + margin:-margin or None] elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - margin] = \ @@ -1332,6 +1517,11 @@ class Eynollah: label_p_pred[0, margin:, margin:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + margin:-margin or None] else: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ @@ -1343,6 +1533,11 @@ class Eynollah: label_p_pred[0, margin:-margin or None, margin:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + margin:-margin or None] indexer_inside_batch += 1 list_i_s = [] @@ -1356,6 +1551,32 @@ class Eynollah: img_patch[:] = 0 prediction_true = prediction_true.astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + prediction_true[:,:,0][prediction_true[:,:,0]==2] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=2 + + if thresholding_for_some_classes_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + prediction_true[:,:,0][prediction_true[:,:,0]==4] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=4 gc.collect() return prediction_true, confidence_matrix @@ -1363,11 +1584,11 @@ class Eynollah: self.logger.debug("enter extract_page") cont_page = [] if not self.ignore_page_extraction: - img = cv2.GaussianBlur(self.image, (5, 5), 0) + img = np.copy(self.image)#cv2.GaussianBlur(self.image, (5, 5), 0) img_page_prediction = self.do_prediction(False, img, self.model_page) imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(imgray, 0, 255, 0) - thresh = cv2.dilate(thresh, KERNEL, iterations=3) + ##thresh = cv2.dilate(thresh, KERNEL, iterations=3) contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) if len(contours)>0: @@ -1375,24 +1596,25 @@ class Eynollah: for j in range(len(contours))]) cnt = contours[np.argmax(cnt_size)] x, y, w, h = cv2.boundingRect(cnt) - if x <= 30: - w += x - x = 0 - if (self.image.shape[1] - (x + w)) <= 30: - w = w + (self.image.shape[1] - (x + w)) - if y <= 30: - h = h + y - y = 0 - if (self.image.shape[0] - (y + h)) <= 30: - h = h + (self.image.shape[0] - (y + h)) + #if x <= 30: + #w += x + #x = 0 + #if (self.image.shape[1] - (x + w)) <= 30: + #w = w + (self.image.shape[1] - (x + w)) + #if y <= 30: + #h = h + y + #y = 0 + #if (self.image.shape[0] - (y + h)) <= 30: + #h = h + (self.image.shape[0] - (y + h)) box = [x, y, w, h] else: box = [0, 0, img.shape[1], img.shape[0]] cropped_page, page_coord = crop_image_inside_box(box, self.image) - cont_page.append(np.array([[page_coord[2], page_coord[0]], - [page_coord[3], page_coord[0]], - [page_coord[3], page_coord[1]], - [page_coord[2], page_coord[1]]])) + cont_page = [cnt] + #cont_page.append(np.array([[page_coord[2], page_coord[0]], + #[page_coord[3], page_coord[0]], + #[page_coord[3], page_coord[1]], + #[page_coord[2], page_coord[1]]])) self.logger.debug("exit extract_page") else: box = [0, 0, self.image.shape[1], self.image.shape[0]] @@ -1440,10 +1662,11 @@ class Eynollah: model_region = self.model_region_fl if patches else self.model_region_fl_np if self.light_version: - pass + thresholding_for_fl_light_version = True elif not patches: img = otsu_copy_binary(img).astype(np.uint8) prediction_regions = None + thresholding_for_fl_light_version = False elif cols: img = otsu_copy_binary(img).astype(np.uint8) if cols == 1: @@ -1459,7 +1682,7 @@ class Eynollah: else: img = resize_image(img, int(img_height_h * 2500 / float(img_width_h)), 2500).astype(np.uint8) - prediction_regions = self.do_prediction(patches, img, model_region, marginal_of_patch_percent=0.1, n_batch_inference=3) + prediction_regions = self.do_prediction(patches, img, model_region, marginal_of_patch_percent=0.1, n_batch_inference=3, thresholding_for_fl_light_version=thresholding_for_fl_light_version) prediction_regions = resize_image(prediction_regions, img_height_h, img_width_h) self.logger.debug("exit extract_text_regions") return prediction_regions, prediction_regions @@ -1516,12 +1739,85 @@ class Eynollah: prediction_regions = resize_image(prediction_regions, img_height_h, img_width_h) self.logger.debug("exit extract_text_regions") return prediction_regions, prediction_regions2 + + def get_textlines_of_a_textregion_sorted(self, textlines_textregion, cx_textline, cy_textline, w_h_textline): + N = len(cy_textline) + if N==0: + return [] + + diff_cy = np.abs( np.diff(sorted(cy_textline)) ) + diff_cx = np.abs(np.diff(sorted(cx_textline)) ) + + if len(diff_cy)>0: + mean_y_diff = np.mean(diff_cy) + mean_x_diff = np.mean(diff_cx) + count_hor = np.count_nonzero(np.array(w_h_textline) > 1) + count_ver = len(w_h_textline) - count_hor + + else: + mean_y_diff = 0 + mean_x_diff = 0 + count_hor = 1 + count_ver = 0 + + + if count_hor >= count_ver: + row_threshold = mean_y_diff / 1.5 if mean_y_diff > 0 else 10 + + indices_sorted_by_y = sorted(range(N), key=lambda i: cy_textline[i]) + + rows = [] + current_row = [indices_sorted_by_y[0]] + for i in range(1, N): + current_idx = indices_sorted_by_y[i] + prev_idx = current_row[0] + if abs(cy_textline[current_idx] - cy_textline[prev_idx]) <= row_threshold: + current_row.append(current_idx) + else: + rows.append(current_row) + current_row = [current_idx] + rows.append(current_row) + + sorted_textlines = [] + for row in rows: + row_sorted = sorted(row, key=lambda i: cx_textline[i]) + for idx in row_sorted: + sorted_textlines.append(textlines_textregion[idx]) + + else: + row_threshold = mean_x_diff / 1.5 if mean_x_diff > 0 else 10 + indices_sorted_by_x = sorted(range(N), key=lambda i: cx_textline[i]) + + rows = [] + current_row = [indices_sorted_by_x[0]] + + for i in range(1, N): + current_idy = indices_sorted_by_x[i] + prev_idy = current_row[0] + if abs(cx_textline[current_idy] - cx_textline[prev_idy] ) <= row_threshold: + current_row.append(current_idy) + else: + rows.append(current_row) + current_row = [current_idy] + rows.append(current_row) + + sorted_textlines = [] + for row in rows: + row_sorted = sorted(row , key=lambda i: cy_textline[i]) + for idy in row_sorted: + sorted_textlines.append(textlines_textregion[idy]) + + return sorted_textlines + + def get_slopes_and_deskew_new_light2(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): polygons_of_textlines = return_contours_of_interested_region(textline_mask_tot,1,0.00001) M_main_tot = [cv2.moments(polygons_of_textlines[j]) for j in range(len(polygons_of_textlines))] + + w_h_textlines = [cv2.boundingRect(polygons_of_textlines[i])[2:] for i in range(len(polygons_of_textlines))] cx_main_tot = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] cy_main_tot = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] @@ -1536,8 +1832,13 @@ class Eynollah: results = np.array(results) indexes_in = args_textlines[results==1] textlines_ins = [polygons_of_textlines[ind] for ind in indexes_in] + cx_textline_in = [cx_main_tot[ind] for ind in indexes_in] + cy_textline_in = [cy_main_tot[ind] for ind in indexes_in] + w_h_textlines_in = [w_h_textlines[ind][0] / float(w_h_textlines[ind][1]) for ind in indexes_in] - all_found_textline_polygons.append(textlines_ins[::-1]) + textlines_ins = self.get_textlines_of_a_textregion_sorted(textlines_ins, cx_textline_in, cy_textline_in, w_h_textlines_in) + + all_found_textline_polygons.append(textlines_ins)#[::-1]) slopes.append(slope_deskew) _, crop_coor = crop_image_inside_box(boxes[index],image_page_rotated) @@ -1608,7 +1909,7 @@ class Eynollah: prediction_textline = self.do_prediction( use_patches, img, self.model_textline, marginal_of_patch_percent=0.15, n_batch_inference=3, - thresholding_for_artificial_class_in_light_version=self.textline_light) + thresholding_for_artificial_class_in_light_version=self.textline_light, threshold_art_class_textline=self.threshold_art_class_textline) #if not self.textline_light: #if num_col_classifier==1: #prediction_textline_nopatch = self.do_prediction(False, img, self.model_textline) @@ -1622,7 +1923,55 @@ class Eynollah: textline_mask_tot_ea_art = textline_mask_tot_ea_art.astype('uint8') #textline_mask_tot_ea_art = cv2.dilate(textline_mask_tot_ea_art, KERNEL, iterations=1) prediction_textline[:,:][textline_mask_tot_ea_art[:,:]==1]=2 + """ + else: + textline_mask_tot_ea_art = textline_mask_tot_ea_art.astype('uint8') + hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (8, 1)) + + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) + ##cv2.imwrite('textline_mask_tot_ea_art.png', textline_mask_tot_ea_art) + textline_mask_tot_ea_art = cv2.dilate(textline_mask_tot_ea_art, hor_kernel, iterations=1) + + ###cv2.imwrite('dil_textline_mask_tot_ea_art.png', dil_textline_mask_tot_ea_art) + + textline_mask_tot_ea_art = textline_mask_tot_ea_art.astype('uint8') + + #print(np.shape(dil_textline_mask_tot_ea_art), np.unique(dil_textline_mask_tot_ea_art), 'dil_textline_mask_tot_ea_art') + tsk = time.time() + skeleton_art_textline = skeletonize(textline_mask_tot_ea_art[:,:,0]) + + skeleton_art_textline = skeleton_art_textline*1 + + skeleton_art_textline = skeleton_art_textline.astype('uint8') + + skeleton_art_textline = cv2.dilate(skeleton_art_textline, kernel, iterations=1) + + #print(np.unique(skeleton_art_textline), np.shape(skeleton_art_textline)) + + #print(skeleton_art_textline, np.unique(skeleton_art_textline)) + + #cv2.imwrite('skeleton_art_textline.png', skeleton_art_textline) + + prediction_textline[:,:,0][skeleton_art_textline[:,:]==1]=2 + + #cv2.imwrite('prediction_textline1.png', prediction_textline[:,:,0]) + + ##hor_kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (4, 1)) + ##ver_kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 3)) + ##textline_mask_tot_ea_main = (prediction_textline[:,:]==1)*1 + ##textline_mask_tot_ea_main = textline_mask_tot_ea_main.astype('uint8') + + ##dil_textline_mask_tot_ea_main = cv2.erode(textline_mask_tot_ea_main, ver_kernel2, iterations=1) + + ##dil_textline_mask_tot_ea_main = cv2.dilate(textline_mask_tot_ea_main, hor_kernel2, iterations=1) + + ##dil_textline_mask_tot_ea_main = cv2.dilate(textline_mask_tot_ea_main, ver_kernel2, iterations=1) + + ##prediction_textline[:,:][dil_textline_mask_tot_ea_main[:,:]==1]=1 + + """ + textline_mask_tot_ea_lines = (prediction_textline[:,:]==1)*1 textline_mask_tot_ea_lines = textline_mask_tot_ea_lines.astype('uint8') if not self.textline_light: @@ -1631,10 +1980,15 @@ class Eynollah: prediction_textline[:,:][textline_mask_tot_ea_lines[:,:]==1]=1 if not self.textline_light: prediction_textline[:,:][old_art[:,:]==1]=2 + + #cv2.imwrite('prediction_textline2.png', prediction_textline[:,:,0]) prediction_textline_longshot = self.do_prediction(False, img, self.model_textline) prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w) - + + + #cv2.imwrite('prediction_textline.png', prediction_textline[:,:,0]) + #sys.exit() self.logger.debug('exit textline_contours') return ((prediction_textline[:, :, 0]==1).astype(np.uint8), (prediction_textline_longshot_true_size[:, :, 0]==1).astype(np.uint8)) @@ -1656,8 +2010,8 @@ class Eynollah: y_diff_mean = find_contours_mean_y_diff(textline_con_fil) sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) crop_img[crop_img > 0] = 1 - slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, - map=self.executor.map, logger=self.logger, plotter=self.plotter) + slope_corresponding_textregion = return_deskew_slop_old_mp(crop_img, sigma_des, + logger=self.logger, plotter=self.plotter) except Exception as why: self.logger.error(why) slope_corresponding_textregion = MAX_SLOPE @@ -1823,7 +2177,7 @@ class Eynollah: ###img_bin = np.copy(prediction_bin) ###else: ###img_bin = np.copy(img_resized) - if self.ocr and not self.input_binary: + if (self.ocr and self.tr) and not self.input_binary: prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5) prediction_bin = 255 * (prediction_bin[:,:,0] == 0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2) @@ -1840,7 +2194,7 @@ class Eynollah: textline_mask_tot_ea = resize_image(textline_mask_tot_ea,img_height_h, img_width_h ) #print(self.image_org.shape) - #cv2.imwrite('out_13.png', self.image_page_org_size) + #cv2.imwrite('textline.png', textline_mask_tot_ea) #plt.imshwo(self.image_page_org_size) #plt.show() @@ -1852,13 +2206,13 @@ class Eynollah: img_resized.shape[1], img_resized.shape[0], num_col_classifier) prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( True, img_resized, self.model_region_1_2, n_batch_inference=1, - thresholding_for_some_classes_in_light_version=True) + thresholding_for_some_classes_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) else: prediction_regions_org = np.zeros((self.image_org.shape[0], self.image_org.shape[1], 3)) confidence_matrix = np.zeros((self.image_org.shape[0], self.image_org.shape[1])) prediction_regions_page, confidence_matrix_page = self.do_prediction_new_concept( False, self.image_page_org_size, self.model_region_1_2, n_batch_inference=1, - thresholding_for_artificial_class_in_light_version=True) + thresholding_for_artificial_class_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) ys = slice(*self.page_coord[0:2]) xs = slice(*self.page_coord[2:4]) prediction_regions_org[ys, xs] = prediction_regions_page @@ -1871,7 +2225,7 @@ class Eynollah: img_resized.shape[1], img_resized.shape[0], new_h, num_col_classifier) prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( True, img_resized, self.model_region_1_2, n_batch_inference=2, - thresholding_for_some_classes_in_light_version=True) + thresholding_for_some_classes_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) ###prediction_regions_org = self.do_prediction(True, img_bin, self.model_region, n_batch_inference=3, thresholding_for_some_classes_in_light_version=True) #print("inside 3 ", time.time()-t_in) #plt.imshow(prediction_regions_org[:,:,0]) @@ -2781,10 +3135,21 @@ class Eynollah: if self.plotter: self.plotter.save_page_image(image_page) - + + if not self.ignore_page_extraction: + mask_page = np.zeros((text_regions_p_1.shape[0], text_regions_p_1.shape[1])).astype(np.int8) + mask_page = cv2.fillPoly(mask_page, pts=[cont_page[0]], color=(1,)) + + text_regions_p_1[mask_page==0] = 0 + textline_mask_tot_ea[mask_page==0] = 0 + text_regions_p_1 = text_regions_p_1[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] textline_mask_tot_ea = textline_mask_tot_ea[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] img_bin_light = img_bin_light[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + ###text_regions_p_1 = text_regions_p_1[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + ###textline_mask_tot_ea = textline_mask_tot_ea[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + ###img_bin_light = img_bin_light[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) @@ -2809,6 +3174,26 @@ class Eynollah: num_col = num_col + 1 if not num_column_is_classified: num_col_classifier = num_col + 1 + if self.num_col_upper and self.num_col_lower: + if self.num_col_upper == self.num_col_lower: + num_col_classifier = self.num_col_upper + else: + if num_col_classifier < self.num_col_lower: + num_col_classifier = self.num_col_lower + if num_col_classifier > self.num_col_upper: + num_col_classifier = self.num_col_upper + + elif self.num_col_lower and not self.num_col_upper: + if num_col_classifier < self.num_col_lower: + num_col_classifier = self.num_col_lower + + elif self.num_col_upper and not self.num_col_lower: + if num_col_classifier > self.num_col_upper: + num_col_classifier = self.num_col_upper + + else: + pass + except Exception as why: self.logger.error(why) num_col = None @@ -2923,8 +3308,8 @@ class Eynollah: def run_deskew(self, textline_mask_tot_ea): #print(textline_mask_tot_ea.shape, 'textline_mask_tot_ea deskew') - slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), 2, 30, True, - map=self.executor.map, logger=self.logger, plotter=self.plotter) + slope_deskew = return_deskew_slop_old_mp(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), 2, 30, True, + logger=self.logger, plotter=self.plotter) slope_first = 0 if self.plotter: @@ -2942,7 +3327,6 @@ class Eynollah: text_regions_p_1[mask_lines[:, :] == 1] = 3 text_regions_p = text_regions_p_1[:, :] text_regions_p = np.array(text_regions_p) - if num_col_classifier in (1, 2): try: regions_without_separators = (text_regions_p[:, :] == 1) * 1 @@ -3248,8 +3632,10 @@ class Eynollah: # 6 is the separators lable in old full layout model # 4 is the drop capital class in old full layout model # in the new full layout drop capital is 3 and separators are 5 - - text_regions_p[:,:][regions_fully[:,:,0]==5]=6 + + # the separators in full layout will not be written on layout + if not self.reading_order_machine_based: + text_regions_p[:,:][regions_fully[:,:,0]==5]=6 ###regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 3] = 4 #text_regions_p[:,:][regions_fully[:,:,0]==6]=6 @@ -3318,9 +3704,100 @@ class Eynollah: return model def do_order_of_regions_with_model(self, contours_only_text_parent, contours_only_text_parent_h, text_regions_p): + + height1 =672#448 + width1 = 448#224 + + height2 =672#448 + width2= 448#224 + + height3 =672#448 + width3 = 448#224 + + inference_bs = 3 + + ver_kernel = np.ones((5, 1), dtype=np.uint8) + hor_kernel = np.ones((1, 5), dtype=np.uint8) + + + min_cont_size_to_be_dilated = 10 + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + cx_conts, cy_conts, x_min_conts, x_max_conts, y_min_conts, y_max_conts, _ = find_new_features_of_contours(contours_only_text_parent) + args_cont_located = np.array(range(len(contours_only_text_parent))) + + diff_y_conts = np.abs(y_max_conts[:]-y_min_conts) + diff_x_conts = np.abs(x_max_conts[:]-x_min_conts) + + mean_x = statistics.mean(diff_x_conts) + median_x = statistics.median(diff_x_conts) + + + diff_x_ratio= diff_x_conts/mean_x + + args_cont_located_excluded = args_cont_located[diff_x_ratio>=1.3] + args_cont_located_included = args_cont_located[diff_x_ratio<1.3] + + contours_only_text_parent_excluded = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]>=1.3]#contours_only_text_parent[diff_x_ratio>=1.3] + contours_only_text_parent_included = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]<1.3]#contours_only_text_parent[diff_x_ratio<1.3] + + + cx_conts_excluded = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]>=1.3]#cx_conts[diff_x_ratio>=1.3] + cx_conts_included = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]<1.3]#cx_conts[diff_x_ratio<1.3] + + cy_conts_excluded = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]>=1.3]#cy_conts[diff_x_ratio>=1.3] + cy_conts_included = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]<1.3]#cy_conts[diff_x_ratio<1.3] + + #print(diff_x_ratio, 'ratio') + text_regions_p = text_regions_p.astype('uint8') + + if len(contours_only_text_parent_excluded)>0: + textregion_par = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1])).astype('uint8') + textregion_par = cv2.fillPoly(textregion_par, pts=contours_only_text_parent_included, color=(1,1)) + else: + textregion_par = (text_regions_p[:,:]==1)*1 + textregion_par = textregion_par.astype('uint8') + + text_regions_p_textregions_dilated = cv2.erode(textregion_par , hor_kernel, iterations=2) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=4) + text_regions_p_textregions_dilated = cv2.erode(text_regions_p_textregions_dilated , hor_kernel, iterations=1) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=5) + text_regions_p_textregions_dilated[text_regions_p[:,:]>1] = 0 + + + contours_only_dilated, hir_on_text_dilated = return_contours_of_image(text_regions_p_textregions_dilated) + contours_only_dilated = return_parent_contours(contours_only_dilated, hir_on_text_dilated) + + indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located = self.return_indexes_of_contours_loctaed_inside_another_list_of_contours(contours_only_dilated, contours_only_text_parent_included, cx_conts_included, cy_conts_included, args_cont_located_included) + + + if len(args_cont_located_excluded)>0: + for ind in args_cont_located_excluded: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + #print(len( np.unique(flattened_array)), 'indexes_of_located_cont uniques') + + missing_textregions = list( set(np.array(range(len(contours_only_text_parent))) ) - set(np.unique(flattened_array)) ) + #print(missing_textregions, 'missing_textregions') + + for ind in missing_textregions: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + + if contours_only_text_parent_h: + for vi in range(len(contours_only_text_parent_h)): + indexes_of_located_cont.append(int(vi+len(contours_only_text_parent))) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + y_len = text_regions_p.shape[0] x_len = text_regions_p.shape[1] - img_poly = np.zeros((y_len,x_len), dtype='uint8') img_poly[text_regions_p[:,:]==1] = 1 @@ -3328,25 +3805,24 @@ class Eynollah: img_poly[text_regions_p[:,:]==3] = 4 img_poly[text_regions_p[:,:]==6] = 5 - - #temp - sep_mask = (img_poly==5)*1 - sep_mask = sep_mask.astype('uint8') - sep_mask = cv2.erode(sep_mask, kernel=KERNEL, iterations=2) - img_poly[img_poly==5] = 0 - img_poly[sep_mask==1] = 5 - # - img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') if contours_only_text_parent_h: _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours( contours_only_text_parent_h) for j in range(len(cy_main)): img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12, - int(x_min_main[j]):int(x_max_main[j])] = 1 - co_text_all = contours_only_text_parent + contours_only_text_parent_h + int(x_min_main[j]):int(x_max_main[j])] = 1 + co_text_all_org = contours_only_text_parent + contours_only_text_parent_h + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + co_text_all = contours_only_dilated + contours_only_text_parent_h + else: + co_text_all = contours_only_text_parent + contours_only_text_parent_h else: - co_text_all = contours_only_text_parent + co_text_all_org = contours_only_text_parent + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + co_text_all = contours_only_dilated + else: + co_text_all = contours_only_text_parent if not len(co_text_all): return [], [] @@ -3361,21 +3837,13 @@ class Eynollah: cv2.fillPoly(img, pts=[co_text_all[i]], color=(1,)) labels_con[:,:,i] = img - height1 =672#448 - width1 = 448#224 - - height2 =672#448 - width2= 448#224 - - height3 =672#448 - width3 = 448#224 labels_con = resize_image(labels_con.astype(np.uint8), height1, width1).astype(bool) img_header_and_sep = resize_image(img_header_and_sep, height1, width1) img_poly = resize_image(img_poly, height3, width3) - inference_bs = 3 + input_1 = np.zeros((inference_bs, height1, width1, 3)) ordered = [list(range(len(co_text_all)))] index_update = 0 @@ -3425,221 +3893,25 @@ class Eynollah: break ordered = [i[0] for i in ordered] - region_ids = ['region_%04d' % i for i in range(len(co_text_all))] - return ordered, region_ids - - def return_start_and_end_of_common_text_of_textline_ocr(self, textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.2*width) - - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) - - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - if len(peaks_real)>70: - print(len(peaks_real), 'len(peaks_real)') - - peaks_real = peaks_real[(peaks_realwidth1)] - - arg_sort = np.argsort(sum_smoothed[peaks_real]) - arg_sort4 =arg_sort[::-1][:4] - peaks_sort_4 = peaks_real[arg_sort][::-1][:4] - argsort_sorted = np.argsort(peaks_sort_4) - - first_4_sorted = peaks_sort_4[argsort_sorted] - y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] - #print(first_4_sorted,'first_4_sorted') - - arg_sortnew = np.argsort(y_4_sorted) - peaks_final =np.sort( first_4_sorted[arg_sortnew][2:] ) - - #plt.figure(ind_tot) - #plt.imshow(textline_image) - #plt.plot([peaks_final[0], peaks_final[0]], [0, height-1]) - #plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) - #plt.savefig('./'+str(ind_tot)+'.png') - - return peaks_final[0], peaks_final[1] + + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + org_contours_indexes = [] + for ind in range(len(ordered)): + region_with_curr_order = ordered[ind] + if region_with_curr_order < len(contours_only_dilated): + if np.isscalar(indexes_of_located_cont[region_with_curr_order]): + org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + else: + arg_sort_located_cont = np.argsort(center_y_coordinates_of_located[region_with_curr_order]) + org_contours_indexes = org_contours_indexes + list(np.array(indexes_of_located_cont[region_with_curr_order])[arg_sort_located_cont]) ##org_contours_indexes + list ( + else: + org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + + region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] + return org_contours_indexes, region_ids else: - pass - - def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(self, textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.06*width) - - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) - - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - if len(peaks_real)>70: - #print(len(peaks_real), 'len(peaks_real)') - - peaks_real = peaks_real[(peaks_realwidth1)] - - arg_max = np.argmax(sum_smoothed[peaks_real]) - peaks_final = peaks_real[arg_max] - - #plt.figure(ind_tot) - #plt.imshow(textline_image) - #plt.plot([peaks_final, peaks_final], [0, height-1]) - ##plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) - #plt.savefig('./'+str(ind_tot)+'.png') - - return peaks_final - else: - return None - - def return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - self, peaks_real, sum_smoothed, start_split, end_split): - - peaks_real = peaks_real[(peaks_realstart_split)] - - arg_sort = np.argsort(sum_smoothed[peaks_real]) - arg_sort4 =arg_sort[::-1][:4] - peaks_sort_4 = peaks_real[arg_sort][::-1][:4] - argsort_sorted = np.argsort(peaks_sort_4) - - first_4_sorted = peaks_sort_4[argsort_sorted] - y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] - #print(first_4_sorted,'first_4_sorted') - - arg_sortnew = np.argsort(y_4_sorted) - peaks_final =np.sort( first_4_sorted[arg_sortnew][3:] ) - return peaks_final[0] - - def return_start_and_end_of_common_text_of_textline_ocr_new(self, textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.15*width) - - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) - mid = int(width/2.) - - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - if len(peaks_real)>70: - peak_start = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - peaks_real, sum_smoothed, width1, mid+2) - peak_end = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - peaks_real, sum_smoothed, mid-2, width2) - - #plt.figure(ind_tot) - #plt.imshow(textline_image) - #plt.plot([peak_start, peak_start], [0, height-1]) - #plt.plot([peak_end, peak_end], [0, height-1]) - #plt.savefig('./'+str(ind_tot)+'.png') - - return peak_start, peak_end - else: - pass - - def return_ocr_of_textline_without_common_section( - self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot): - - if h2w_ratio > 0.05: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - else: - #width = np.shape(textline_image)[1] - #height = np.shape(textline_image)[0] - #common_window = int(0.3*width) - #width1 = int ( width/2. - common_window ) - #width2 = int ( width/2. + common_window ) - - split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section( - textline_image, ind_tot) - if split_point: - image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) - - #pixel_values1 = processor(image1, return_tensors="pt").pixel_values - #pixel_values2 = processor(image2, return_tensors="pt").pixel_values - - pixel_values_merged = processor([image1,image2], return_tensors="pt").pixel_values - generated_ids_merged = model_ocr.generate(pixel_values_merged.to(device)) - generated_text_merged = processor.batch_decode(generated_ids_merged, skip_special_tokens=True) - - #print(generated_text_merged,'generated_text_merged') - - #generated_ids1 = model_ocr.generate(pixel_values1.to(device)) - #generated_ids2 = model_ocr.generate(pixel_values2.to(device)) - - #generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] - #generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] - - #generated_text = generated_text1 + ' ' + generated_text2 - generated_text = generated_text_merged[0] + ' ' + generated_text_merged[1] - - #print(generated_text1,'generated_text1') - #print(generated_text2, 'generated_text2') - #print('########################################') - else: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - #print(generated_text,'generated_text') - #print('########################################') - return generated_text - - def return_ocr_of_textline( - self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot): - - if h2w_ratio > 0.05: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - else: - #width = np.shape(textline_image)[1] - #height = np.shape(textline_image)[0] - #common_window = int(0.3*width) - #width1 = int ( width/2. - common_window ) - #width2 = int ( width/2. + common_window ) - - try: - width1, width2 = self.return_start_and_end_of_common_text_of_textline_ocr_new(textline_image, ind_tot) - - image1 = textline_image[:, :width2,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, width1:,:]#image.crop((width1, 0, width, height)) - - pixel_values1 = processor(image1, return_tensors="pt").pixel_values - pixel_values2 = processor(image2, return_tensors="pt").pixel_values - - generated_ids1 = model_ocr.generate(pixel_values1.to(device)) - generated_ids2 = model_ocr.generate(pixel_values2.to(device)) - - generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] - generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] - #print(generated_text1,'generated_text1') - #print(generated_text2, 'generated_text2') - #print('########################################') - - match = sq(None, generated_text1, generated_text2).find_longest_match( - 0, len(generated_text1), 0, len(generated_text2)) - generated_text = generated_text1 + generated_text2[match.b+match.size:] - except: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - return generated_text - - def return_textline_contour_with_added_box_coordinate(self, textline_contour, box_ind): - textline_contour[:,0] = textline_contour[:,0] + box_ind[2] - textline_contour[:,1] = textline_contour[:,1] + box_ind[0] - return textline_contour + region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] + return ordered, region_ids def return_list_of_contours_with_desired_order(self, ls_cons, sorted_indexes): return [ls_cons[sorted_indexes[index]] for index in range(len(sorted_indexes))] @@ -3811,7 +4083,7 @@ class Eynollah: if dilation_m1<6: dilation_m1 = 6 #print(dilation_m1, 'dilation_m1') - dilation_m1 = 6 + dilation_m1 = 4#6 dilation_m2 = int(dilation_m1/2.) +1 for i in range(len(x_differential)): @@ -4068,6 +4340,29 @@ class Eynollah: contours[ind_u_a_trs].pop(ittrd) return contours + + def return_indexes_of_contours_loctaed_inside_another_list_of_contours(self, contours, contours_loc, cx_main_loc, cy_main_loc, indexes_loc): + indexes_of_located_cont = [] + center_x_coordinates_of_located = [] + center_y_coordinates_of_located = [] + #M_main_tot = [cv2.moments(contours_loc[j]) + #for j in range(len(contours_loc))] + #cx_main_loc = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + #cy_main_loc = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + + for ij in range(len(contours)): + results = [cv2.pointPolygonTest(contours[ij], (cx_main_loc[ind], cy_main_loc[ind]), False) + for ind in range(len(cy_main_loc)) ] + results = np.array(results) + indexes_in = np.where((results == 0) | (results == 1)) + indexes = indexes_loc[indexes_in]# [(results == 0) | (results == 1)]#np.where((results == 0) | (results == 1)) + + indexes_of_located_cont.append(indexes) + center_x_coordinates_of_located.append(np.array(cx_main_loc)[indexes_in] ) + center_y_coordinates_of_located.append(np.array(cy_main_loc)[indexes_in] ) + + return indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located + def filter_contours_without_textline_inside( self, contours,text_con_org, contours_textline, contours_only_text_parent_d_ordered, conf_contours_textregions): @@ -4255,26 +4550,106 @@ class Eynollah: return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem, contours_only_text_parent_rem, index_by_text_par_con_rem_sort) + + def separate_marginals_to_left_and_right_and_order_from_top_to_down(self, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width): + cx_marg, cy_marg, _, _, _, _, _ = find_new_features_of_contours( + polygons_of_marginals) + + cx_marg = np.array(cx_marg) + cy_marg = np.array(cy_marg) + + poly_marg_left = list( np.array(polygons_of_marginals)[cx_marg < mid_point_of_page_width] ) + poly_marg_right = list( np.array(polygons_of_marginals)[cx_marg >= mid_point_of_page_width] ) + + all_found_textline_polygons_marginals_left = list( np.array(all_found_textline_polygons_marginals)[cx_marg < mid_point_of_page_width] ) + all_found_textline_polygons_marginals_right = list( np.array(all_found_textline_polygons_marginals)[cx_marg >= mid_point_of_page_width] ) + + all_box_coord_marginals_left = list( np.array(all_box_coord_marginals)[cx_marg < mid_point_of_page_width] ) + all_box_coord_marginals_right = list( np.array(all_box_coord_marginals)[cx_marg >= mid_point_of_page_width] ) + + slopes_marg_left = list( np.array(slopes_marginals)[cx_marg < mid_point_of_page_width] ) + slopes_marg_right = list( np.array(slopes_marginals)[cx_marg >= mid_point_of_page_width] ) + + cy_marg_left = cy_marg[cx_marg < mid_point_of_page_width] + cy_marg_right = cy_marg[cx_marg >= mid_point_of_page_width] + + ordered_left_marginals = [poly for _, poly in sorted(zip(cy_marg_left, poly_marg_left), key=lambda x: x[0])] + ordered_right_marginals = [poly for _, poly in sorted(zip(cy_marg_right, poly_marg_right), key=lambda x: x[0])] + + ordered_left_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_left, all_found_textline_polygons_marginals_left), key=lambda x: x[0])] + ordered_right_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_right, all_found_textline_polygons_marginals_right), key=lambda x: x[0])] + + ordered_left_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_left, all_box_coord_marginals_left), key=lambda x: x[0])] + ordered_right_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_right, all_box_coord_marginals_right), key=lambda x: x[0])] + + ordered_left_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_left, slopes_marg_left), key=lambda x: x[0])] + ordered_right_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_right, slopes_marg_right), key=lambda x: x[0])] + + return ordered_left_marginals, ordered_right_marginals, ordered_left_marginals_textline, ordered_right_marginals_textline, ordered_left_marginals_bbox, ordered_right_marginals_bbox, ordered_left_slopes_marginals, ordered_right_slopes_marginals - def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False): + + def run(self, + overwrite: bool = False, + image_filename: Optional[str] = None, + dir_in: Optional[str] = None, + dir_out: Optional[str] = None, + dir_of_cropped_images: Optional[str] = None, + dir_of_layout: Optional[str] = None, + dir_of_deskewed: Optional[str] = None, + dir_of_all: Optional[str] = None, + dir_save_page: Optional[str] = None, + ): """ Get image and scales, then extract the page of scanned image """ self.logger.debug("enter run") t0_tot = time.time() + # Log enabled features directly + enabled_modes = [] + if self.light_version: + enabled_modes.append("Light version") + if self.textline_light: + enabled_modes.append("Light textline detection") + if self.full_layout: + enabled_modes.append("Full layout analysis") + if self.ocr: + enabled_modes.append("OCR") + if self.tables: + enabled_modes.append("Table detection") + if enabled_modes: + self.logger.info("Enabled modes: " + ", ".join(enabled_modes)) + if self.enable_plotting: + self.logger.info("Saving debug plots") + if dir_of_cropped_images: + self.logger.info(f"Saving cropped images to: {dir_of_cropped_images}") + if dir_of_layout: + self.logger.info(f"Saving layout plots to: {dir_of_layout}") + if dir_of_deskewed: + self.logger.info(f"Saving deskewed images to: {dir_of_deskewed}") + if dir_in: - self.ls_imgs = os.listdir(dir_in) + ls_imgs = [os.path.join(dir_in, image_filename) + for image_filename in filter(is_image_filename, + os.listdir(dir_in))] elif image_filename: - self.ls_imgs = [image_filename] + ls_imgs = [image_filename] else: raise ValueError("run requires either a single image filename or a directory") - for img_filename in self.ls_imgs: + for img_filename in ls_imgs: self.logger.info(img_filename) t0 = time.time() - self.reset_file_name_dir(os.path.join(dir_in or "", img_filename)) + self.reset_file_name_dir(img_filename, dir_out) + if self.enable_plotting: + self.plotter = EynollahPlotter(dir_out=dir_out, + dir_of_all=dir_of_all, + dir_save_page=dir_save_page, + dir_of_deskewed=dir_of_deskewed, + dir_of_cropped_images=dir_of_cropped_images, + dir_of_layout=dir_of_layout, + image_filename_stem=Path(image_filename).stem) #print("text region early -11 in %.1fs", time.time() - t0) if os.path.exists(self.writer.output_filename): if overwrite: @@ -4285,30 +4660,47 @@ class Eynollah: pcgts = self.run_single() self.logger.info("Job done in %.1fs", time.time() - t0) - #print("Job done in %.1fs" % (time.time() - t0)) self.writer.write_pagexml(pcgts) if dir_in: self.logger.info("All jobs done in %.1fs", time.time() - t0_tot) - print("all Job done in %.1fs", time.time() - t0_tot) def run_single(self): t0 = time.time() + + self.logger.info(f"Processing file: {self.writer.image_filename}") + self.logger.info("Step 1/5: Image Enhancement") + img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) - self.logger.info("Enhancing took %.1fs ", time.time() - t0) + + self.logger.info(f"Image: {self.image.shape[1]}x{self.image.shape[0]}, {self.dpi} DPI, {num_col_classifier} columns") + if is_image_enhanced: + self.logger.info("Enhancement applied") + + self.logger.info(f"Enhancement complete ({time.time() - t0:.1f}s)") + + + # Image Extraction Mode if self.extract_only_images: + self.logger.info("Step 2/5: Image Extraction Mode") + text_regions_p_1, erosion_hurts, polygons_lines_xml, polygons_of_images, image_page, page_coord, cont_page = \ self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) - ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], - polygons_of_images, [], [], [], [], [], - cont_page, [], [], ocr_all_textlines, []) + polygons_of_images, [], [], [], [], [], [], [], [], [], + cont_page, [], []) if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) + + self.logger.info("Image extraction complete") return pcgts + # Basic Processing Mode if self.skip_layout_and_reading_order: + self.logger.info("Step 2/5: Basic Processing Mode") + self.logger.info("Skipping layout analysis and reading order detection") + _ ,_, _, textline_mask_tot_ea, img_bin_light, _ = \ self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier, skip_layout_and_reading_order=self.skip_layout_and_reading_order) @@ -4316,12 +4708,20 @@ class Eynollah: page_coord, image_page, textline_mask_tot_ea, img_bin_light, cont_page = \ self.run_graphics_and_columns_without_layout(textline_mask_tot_ea, img_bin_light) - ##all_found_textline_polygons =self.scale_contours_new(textline_mask_tot_ea) cnt_clean_rot_raw, hir_on_cnt_clean_rot = return_contours_of_image(textline_mask_tot_ea) all_found_textline_polygons = filter_contours_area_of_image( textline_mask_tot_ea, cnt_clean_rot_raw, hir_on_cnt_clean_rot, max_area=1, min_area=0.00001) + + M_main_tot = [cv2.moments(all_found_textline_polygons[j]) + for j in range(len(all_found_textline_polygons))] + w_h_textlines = [cv2.boundingRect(all_found_textline_polygons[j])[2:] for j in range(len(all_found_textline_polygons))] + w_h_textlines = [w_h_textlines[j][0] / float(w_h_textlines[j][1]) for j in range(len(w_h_textlines))] + cx_main_tot = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + cy_main_tot = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + + all_found_textline_polygons = self.get_textlines_of_a_textregion_sorted(all_found_textline_polygons, cx_main_tot, cy_main_tot, w_h_textlines)#all_found_textline_polygons[::-1] all_found_textline_polygons=[ all_found_textline_polygons ] @@ -4329,31 +4729,45 @@ class Eynollah: all_found_textline_polygons) all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( all_found_textline_polygons, None, textline_mask_tot_ea, type_contour="textline") - - + + order_text_new = [0] slopes =[0] id_of_texts_tot =['region_0001'] polygons_of_images = [] - slopes_marginals = [] - polygons_of_marginals = [] - all_found_textline_polygons_marginals = [] - all_box_coord_marginals = [] + slopes_marginals_left = [] + slopes_marginals_right = [] + polygons_of_marginals_left = [] + polygons_of_marginals_right = [] + all_found_textline_polygons_marginals_left = [] + all_found_textline_polygons_marginals_right = [] + all_box_coord_marginals_left = [] + all_box_coord_marginals_right = [] polygons_lines_xml = [] contours_tables = [] - ocr_all_textlines = None - conf_contours_textregions =None + conf_contours_textregions =[0] + + if self.ocr and not self.tr: + gc.collect() + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, textline_light=True) + else: + ocr_all_textlines = None + pcgts = self.writer.build_pagexml_no_full_layout( cont_page, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions) + all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines=ocr_all_textlines, conf_contours_textregion=conf_contours_textregions, skip_layout_reading_order=self.skip_layout_and_reading_order) + self.logger.info("Basic processing complete") return pcgts #print("text region early -1 in %.1fs", time.time() - t0) t1 = time.time() + self.logger.info("Step 2/5: Layout Analysis") + if self.light_version: + self.logger.info("Using light version processing") text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light, confidence_matrix = \ self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier) #print("text region early -2 in %.1fs", time.time() - t0) @@ -4384,34 +4798,37 @@ class Eynollah: text_regions_p_1 ,erosion_hurts, polygons_lines_xml = \ self.get_regions_from_xy_2models(img_res, is_image_enhanced, num_col_classifier) - self.logger.info("Textregion detection took %.1fs ", time.time() - t1) + self.logger.info(f"Textregion detection took {time.time() - t1:.1f}s") confidence_matrix = np.zeros((text_regions_p_1.shape[:2])) t1 = time.time() num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ text_regions_p_1, cont_page, table_prediction = \ self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, erosion_hurts) - self.logger.info("Graphics detection took %.1fs ", time.time() - t1) + self.logger.info(f"Graphics detection took {time.time() - t1:.1f}s") #self.logger.info('cont_page %s', cont_page) #plt.imshow(table_prediction) #plt.show() + self.logger.info(f"Layout analysis complete ({time.time() - t1:.1f}s)") if not num_col: - self.logger.info("No columns detected, outputting an empty PAGE-XML") - ocr_all_textlines = None + self.logger.info("No columns detected - generating empty PAGE-XML") + pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], [], [], [], [], [], [], - cont_page, [], [], ocr_all_textlines, []) + [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], [], + cont_page, [], []) return pcgts #print("text region early in %.1fs", time.time() - t0) t1 = time.time() if not self.light_version: textline_mask_tot_ea = self.run_textline(image_page) - self.logger.info("textline detection took %.1fs", time.time() - t1) + self.logger.info(f"Textline detection took {time.time() - t1:.1f}s") t1 = time.time() slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) - self.logger.info("deskewing took %.1fs", time.time() - t1) + if np.abs(slope_deskew) > 0.01: # Only log if there is significant skew + self.logger.info(f"Applied deskew correction: {slope_deskew:.2f} degrees") + self.logger.info(f"Deskewing took {time.time() - t1:.1f}s") elif num_col_classifier in (1,2): org_h_l_m = textline_mask_tot_ea.shape[0] org_w_l_m = textline_mask_tot_ea.shape[1] @@ -4431,6 +4848,13 @@ class Eynollah: textline_mask_tot, text_regions_p, image_page_rotated = \ self.run_marginals(image_page, textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) + + self.logger.info("Step 3/5: Text Line Detection") + + if self.curved_line: + self.logger.info("Mode: Curved line detection") + elif self.textline_light: + self.logger.info("Mode: Light detection") if self.light_version and num_col_classifier in (1,2): image_page = resize_image(image_page,org_h_l_m, org_w_l_m ) @@ -4441,8 +4865,7 @@ class Eynollah: table_prediction = resize_image(table_prediction,org_h_l_m, org_w_l_m ) image_page_rotated = resize_image(image_page_rotated,org_h_l_m, org_w_l_m ) - self.logger.info("detection of marginals took %.1fs", time.time() - t1) - #print("text region early 2 marginal in %.1fs", time.time() - t0) + self.logger.info(f"Detection of marginals took {time.time() - t1:.1f}s") ## birdan sora chock chakir t1 = time.time() if not self.full_layout: @@ -4540,7 +4963,7 @@ class Eynollah: cx_bigest_d_big[0] = cx_bigest_d[ind_largest] cy_biggest_d_big[0] = cy_biggest_d[ind_largest] except Exception as why: - self.logger.error(why) + self.logger.error(str(why)) (h, w) = text_only.shape[:2] center = (w // 2.0, h // 2.0) @@ -4572,6 +4995,7 @@ class Eynollah: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] #contours_only_text_parent = [] + if not len(contours_only_text_parent): # stop early empty_marginals = [[]] * len(polygons_of_marginals) @@ -4579,14 +5003,14 @@ class Eynollah: pcgts = self.writer.build_pagexml_full_layout( [], [], page_coord, [], [], [], [], [], [], polygons_of_images, contours_tables, [], - polygons_of_marginals, empty_marginals, empty_marginals, [], [], [], - cont_page, polygons_lines_xml, [], [], []) + polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, empty_marginals, empty_marginals, [], [], [], [], + cont_page, polygons_lines_xml) else: pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], polygons_of_images, - polygons_of_marginals, empty_marginals, empty_marginals, [], [], - cont_page, polygons_lines_xml, contours_tables, [], []) + polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, empty_marginals, empty_marginals, [], [], [], + cont_page, polygons_lines_xml, contours_tables) return pcgts @@ -4678,8 +5102,11 @@ class Eynollah: num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2( all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - - #print("text region early 6 in %.1fs", time.time() - t0) + + mid_point_of_page_width = text_regions_p.shape[1] / 2. + polygons_of_marginals_left, polygons_of_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes_marginals_left, slopes_marginals_right = self.separate_marginals_to_left_and_right_and_order_from_top_to_down(polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width) + + #print(len(polygons_of_marginals), len(ordered_left_marginals), len(ordered_right_marginals), 'marginals ordred') if self.full_layout: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( @@ -4710,10 +5137,10 @@ class Eynollah: pixel_img = 4 polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( - text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, - all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, - kernel=KERNEL, curved_line=self.curved_line, textline_light=self.textline_light) + ##all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( + ##text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, + ##all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, + ##kernel=KERNEL, curved_line=self.curved_line, textline_light=self.textline_light) if not self.reading_order_machine_based: pixel_seps = 6 @@ -4758,7 +5185,17 @@ class Eynollah: t_order = time.time() if self.full_layout: + self.logger.info("Step 4/5: Reading Order Detection") + if self.reading_order_machine_based: + self.logger.info("Using machine-based detection") + if self.right2left: + self.logger.info("Right-to-left mode enabled") + if self.headers_off: + self.logger.info("Headers ignored in reading order") + + if self.reading_order_machine_based: + tror = time.time() order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( contours_only_text_parent, contours_only_text_parent_h, text_regions_p) else: @@ -4768,21 +5205,69 @@ class Eynollah: else: order_text_new, id_of_texts_tot = self.do_order_of_regions( contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - self.logger.info("detection of reading order took %.1fs", time.time() - t_order) + self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") - if self.ocr: - ocr_all_textlines = [] + if self.ocr and not self.tr: + self.logger.info("Step 4.5/5: OCR Processing") + + if torch.cuda.is_available(): + self.logger.info("Using GPU acceleration") + else: + self.logger.info("Using CPU processing") + + gc.collect() + if len(all_found_textline_polygons)>0: + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines = None + + if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: + ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_left, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_marginals_left = None + + if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: + ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_right, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_marginals_right = None + + if all_found_textline_polygons_h and len(all_found_textline_polygons)>0: + ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_h, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_h = None + + if polygons_of_drop_capitals and len(polygons_of_drop_capitals)>0: + ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines(image_page, polygons_of_drop_capitals, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_drop = None else: ocr_all_textlines = None + ocr_all_textlines_marginals_left = None + ocr_all_textlines_marginals_right = None + ocr_all_textlines_h = None + ocr_all_textlines_drop = None + + self.logger.info("Step 5/5: Output Generation") + pcgts = self.writer.build_pagexml_full_layout( contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, - polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, - cont_page, polygons_lines_xml, ocr_all_textlines, conf_contours_textregions, conf_contours_textregions_h) + polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_lines_xml, ocr_all_textlines, ocr_all_textlines_h, ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, ocr_all_textlines_drop, conf_contours_textregions, conf_contours_textregions_h) + return pcgts contours_only_text_parent_h = None + self.logger.info("Step 4/5: Reading Order Detection") + + if self.reading_order_machine_based: + self.logger.info("Using machine-based detection") + if self.right2left: + self.logger.info("Right-to-left mode enabled") + if self.headers_off: + self.logger.info("Headers ignored in reading order") + if self.reading_order_machine_based: order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( contours_only_text_parent, contours_only_text_parent_h, text_regions_p) @@ -4802,7 +5287,22 @@ class Eynollah: order_text_new, id_of_texts_tot = self.do_order_of_regions( contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - if self.ocr: + if self.ocr and self.tr: + self.logger.info("Step 4.5/5: OCR Processing") + + if torch.cuda.is_available(): + self.logger.info("Using GPU acceleration") + else: + self.logger.info("Using CPU processing") + + if self.light_version: + self.logger.info("Using light version OCR") + + if self.textline_light: + self.logger.info("Using light text line detection for OCR") + + self.logger.info("Processing text lines...") + device = cuda.get_current_device() device.reset() gc.collect() @@ -4849,16 +5349,35 @@ class Eynollah: ocr_textline_in_textregion.append(text_ocr) ind_tot = ind_tot +1 ocr_all_textlines.append(ocr_textline_in_textregion) + + elif self.ocr and not self.tr: + gc.collect() + if len(all_found_textline_polygons)>0: + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + + if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: + ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_left, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + + if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: + ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_right, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines = None - #print(ocr_all_textlines) - self.logger.info("detection of reading order took %.1fs", time.time() - t_order) + ocr_all_textlines_marginals_left = None + ocr_all_textlines_marginals_right = None + self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") + + self.logger.info("Step 5/5: Output Generation") + self.logger.info("Generating PAGE-XML output") + pcgts = self.writer.build_pagexml_no_full_layout( txt_con_org, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions) + all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, conf_contours_textregions) + + self.logger.info(f"Output file: {self.writer.output_filename}") + return pcgts @@ -4866,221 +5385,108 @@ class Eynollah_ocr: def __init__( self, dir_models, + model_name=None, dir_xmls=None, - dir_in=None, - dir_in_bin=None, - dir_out=None, - dir_out_image_text=None, tr_ocr=False, + batch_size=None, export_textline_images_and_text=False, do_not_mask_with_textline_contour=False, - draw_texts_on_image=False, - prediction_with_both_of_rgb_and_bin=False, + pref_of_dataset=None, + min_conf_value_of_textline_text : Optional[float]=None, logger=None, ): - self.dir_in = dir_in - self.dir_in_bin = dir_in_bin - self.dir_out = dir_out - self.dir_xmls = dir_xmls self.dir_models = dir_models + self.model_name = model_name self.tr_ocr = tr_ocr self.export_textline_images_and_text = export_textline_images_and_text self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour - self.draw_texts_on_image = draw_texts_on_image - self.dir_out_image_text = dir_out_image_text - self.prediction_with_both_of_rgb_and_bin = prediction_with_both_of_rgb_and_bin - if tr_ocr: - self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" - self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) - self.model_ocr.to(self.device) - - else: - self.model_ocr_dir = dir_models + "/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" - model_ocr = load_model(self.model_ocr_dir , compile=False) - - self.prediction_model = tf.keras.models.Model( - model_ocr.get_layer(name = "image").input, - model_ocr.get_layer(name = "dense2").output) - - - with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: - characters = json.load(config_file) - - - AUTOTUNE = tf.data.AUTOTUNE - - # Mapping characters to integers. - char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) - - # Mapping integers back to original characters. - self.num_to_char = StringLookup( - vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True - ) + self.pref_of_dataset = pref_of_dataset + self.logger = logger if logger else getLogger('eynollah') - def decode_batch_predictions(self, pred, max_len = 128): - # input_len is the product of the batch size and the - # number of time steps. - input_len = np.ones(pred.shape[0]) * pred.shape[1] - - # Decode CTC predictions using greedy search. - # decoded is a tuple with 2 elements. - decoded = tf.keras.backend.ctc_decode(pred, - input_length = input_len, - beam_width = 100) - # The outputs are in the first element of the tuple. - # Additionally, the first element is actually a list, - # therefore we take the first element of that list as well. - #print(decoded,'decoded') - decoded = decoded[0][0][:, :max_len] - - #print(decoded, decoded.shape,'decoded') - - output = [] - for d in decoded: - # Convert the predicted indices to the corresponding chars. - d = tf.strings.reduce_join(self.num_to_char(d)) - d = d.numpy().decode("utf-8") - output.append(d) - return output - - - def distortion_free_resize(self, image, img_size): - w, h = img_size - image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True) - - # Check tha amount of padding needed to be done. - pad_height = h - tf.shape(image)[0] - pad_width = w - tf.shape(image)[1] - - # Only necessary if you want to do same amount of padding on both sides. - if pad_height % 2 != 0: - height = pad_height // 2 - pad_height_top = height + 1 - pad_height_bottom = height - else: - pad_height_top = pad_height_bottom = pad_height // 2 - - if pad_width % 2 != 0: - width = pad_width // 2 - pad_width_left = width + 1 - pad_width_right = width - else: - pad_width_left = pad_width_right = pad_width // 2 - - image = tf.pad( - image, - paddings=[ - [pad_height_top, pad_height_bottom], - [pad_width_left, pad_width_right], - [0, 0], - ], - ) - - image = tf.transpose(image, (1, 0, 2)) - image = tf.image.flip_left_right(image) - return image - - def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(self, textline_image): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.22*width) - - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) - - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - - if len(peaks_real)>35: - - #peaks_real = peaks_real[(peaks_realwidth1)] - argsort = np.argsort(sum_smoothed[peaks_real])[::-1] - peaks_real_top_six = peaks_real[argsort[:6]] - midpoint = textline_image.shape[1] / 2. - arg_closest = np.argmin(np.abs(peaks_real_top_six - midpoint)) - - #arg_max = np.argmax(sum_smoothed[peaks_real]) - - peaks_final = peaks_real_top_six[arg_closest]#peaks_real[arg_max] - - return peaks_final - else: - return None - - # Function to fit text inside the given area - def fit_text_single_line(self, draw, text, font_path, max_width, max_height): - initial_font_size = 50 - font_size = initial_font_size - while font_size > 10: # Minimum font size - font = ImageFont.truetype(font_path, font_size) - text_bbox = draw.textbbox((0, 0), text, font=font) # Get text bounding box - text_width = text_bbox[2] - text_bbox[0] - text_height = text_bbox[3] - text_bbox[1] - - if text_width <= max_width and text_height <= max_height: - return font # Return the best-fitting font - - font_size -= 2 # Reduce font size and retry - - return ImageFont.truetype(font_path, 10) # Smallest font fallback - - def return_textlines_split_if_needed(self, textline_image, textline_image_bin): - - split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image) - if split_point: - image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) - if self.prediction_with_both_of_rgb_and_bin: - image1_bin = textline_image_bin[:, :split_point,:]# image.crop((0, 0, width2, height)) - image2_bin = textline_image_bin[:, split_point:,:]#image.crop((width1, 0, width, height)) - return [image1, image2], [image1_bin, image2_bin] + if not export_textline_images_and_text: + if min_conf_value_of_textline_text: + self.min_conf_value_of_textline_text = float(min_conf_value_of_textline_text) else: - return [image1, image2], None + self.min_conf_value_of_textline_text = 0.3 + if tr_ocr: + self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + if self.model_name: + self.model_ocr_dir = self.model_name + else: + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_trocr_20250919" + self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) + self.model_ocr.to(self.device) + if not batch_size: + self.b_s = 2 + else: + self.b_s = int(batch_size) + + else: + if self.model_name: + self.model_ocr_dir = self.model_name + else: + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250904" + model_ocr = load_model(self.model_ocr_dir , compile=False) + + self.prediction_model = tf.keras.models.Model( + model_ocr.get_layer(name = "image").input, + model_ocr.get_layer(name = "dense2").output) + if not batch_size: + self.b_s = 8 + else: + self.b_s = int(batch_size) + + with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: + characters = json.load(config_file) + + AUTOTUNE = tf.data.AUTOTUNE + + # Mapping characters to integers. + char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + + # Mapping integers back to original characters. + self.num_to_char = StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True + ) + self.end_character = len(characters) + 2 + + def run(self, overwrite: bool = False, + dir_in: Optional[str] = None, + dir_in_bin: Optional[str] = None, + image_filename: Optional[str] = None, + dir_xmls: Optional[str] = None, + dir_out_image_text: Optional[str] = None, + dir_out: Optional[str] = None, + ): + if dir_in: + ls_imgs = [os.path.join(dir_in, image_filename) + for image_filename in filter(is_image_filename, + os.listdir(dir_in))] else: - return None, None - def preprocess_and_resize_image_for_ocrcnn_model(self, img, image_height, image_width): - ratio = image_height /float(img.shape[0]) - w_ratio = int(ratio * img.shape[1]) - - if w_ratio <= image_width: - width_new = w_ratio - else: - width_new = image_width - - if width_new == 0: - width_new = img.shape[1] - - ##if width_new+32 >= image_width: - ##width_new = width_new - 32 - - ###patch_zero = np.zeros((32, 32, 3))#+255 - ###patch_zero[9:19,8:18,:] = 0 - - - img = resize_image(img, image_height, width_new) - img_fin = np.ones((image_height, image_width, 3))*255 - ###img_fin[:,:32,:] = patch_zero[:,:,:] - ###img_fin[:,32:32+width_new,:] = img[:,:,:] - img_fin[:,:width_new,:] = img[:,:,:] - img_fin = img_fin / 255. - return img_fin - - def run(self): - ls_imgs = os.listdir(self.dir_in) - + ls_imgs = [image_filename] + if self.tr_ocr: - b_s = 2 - for ind_img in ls_imgs: - t0 = time.time() - file_name = ind_img.split('.')[0] - dir_img = os.path.join(self.dir_in, ind_img) - dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') - out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') + tr_ocr_input_height_and_width = 384 + for dir_img in ls_imgs: + file_name = Path(dir_img).stem + dir_xml = os.path.join(dir_xmls, file_name+'.xml') + out_file_ocr = os.path.join(dir_out, file_name+'.xml') + + if os.path.exists(out_file_ocr): + if overwrite: + self.logger.warning("will overwrite existing output file '%s'", out_file_ocr) + else: + self.logger.warning("will skip input for existing output file '%s'", out_file_ocr) + continue + img = cv2.imread(dir_img) + + if dir_out_image_text: + out_image_with_text = os.path.join(dir_out_image_text, file_name+'.png') + image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white") + draw = ImageDraw.Draw(image_text) + total_bb_coordinates = [] ##file_name = Path(dir_xmls).stem tree1 = ET.parse(dir_xml, parser = ET.XMLParser(encoding="utf-8")) @@ -5098,8 +5504,12 @@ class Eynollah_ocr: cropped_lines = [] cropped_lines_region_indexer = [] cropped_lines_meging_indexing = [] + + extracted_texts = [] indexer_text_region = 0 + indexer_b_s = 0 + for nn in root1.iter(region_tags): for child_textregion in nn: if child_textregion.tag.endswith("TextLine"): @@ -5111,6 +5521,9 @@ class Eynollah_ocr: textline_coords = np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) x,y,w,h = cv2.boundingRect(textline_coords) + if dir_out_image_text: + total_bb_coordinates.append([x,y,w,h]) + h2w_ratio = h/float(w) img_poly_on_img = np.copy(img) @@ -5121,108 +5534,265 @@ class Eynollah_ocr: img_crop = img_poly_on_img[y:y+h, x:x+w, :] img_crop[mask_poly==0] = 255 + self.logger.debug("processing %d lines for '%s'", len(cropped_lines), nn.attrib['id']) if h2w_ratio > 0.1: - cropped_lines.append(img_crop) + cropped_lines.append(resize_image(img_crop, tr_ocr_input_height_and_width, tr_ocr_input_height_and_width) ) cropped_lines_meging_indexing.append(0) + indexer_b_s+=1 + if indexer_b_s==self.b_s: + imgs = cropped_lines[:] + cropped_lines = [] + indexer_b_s = 0 + + pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values + generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) + generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + + extracted_texts = extracted_texts + generated_text_merged + else: - splited_images, _ = self.return_textlines_split_if_needed(img_crop, None) + splited_images, _ = return_textlines_split_if_needed(img_crop, None) #print(splited_images) if splited_images: - cropped_lines.append(splited_images[0]) + cropped_lines.append(resize_image(splited_images[0], tr_ocr_input_height_and_width, tr_ocr_input_height_and_width)) cropped_lines_meging_indexing.append(1) - cropped_lines.append(splited_images[1]) + indexer_b_s+=1 + + if indexer_b_s==self.b_s: + imgs = cropped_lines[:] + cropped_lines = [] + indexer_b_s = 0 + + pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values + generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) + generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + + extracted_texts = extracted_texts + generated_text_merged + + + cropped_lines.append(resize_image(splited_images[1], tr_ocr_input_height_and_width, tr_ocr_input_height_and_width)) cropped_lines_meging_indexing.append(-1) + indexer_b_s+=1 + + if indexer_b_s==self.b_s: + imgs = cropped_lines[:] + cropped_lines = [] + indexer_b_s = 0 + + pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values + generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) + generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + + extracted_texts = extracted_texts + generated_text_merged + else: cropped_lines.append(img_crop) cropped_lines_meging_indexing.append(0) + indexer_b_s+=1 + + if indexer_b_s==self.b_s: + imgs = cropped_lines[:] + cropped_lines = [] + indexer_b_s = 0 + + pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values + generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) + generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + + extracted_texts = extracted_texts + generated_text_merged + + + indexer_text_region = indexer_text_region +1 - - extracted_texts = [] - n_iterations = math.ceil(len(cropped_lines) / b_s) - - for i in range(n_iterations): - if i==(n_iterations-1): - n_start = i*b_s - imgs = cropped_lines[n_start:] - else: - n_start = i*b_s - n_end = (i+1)*b_s - imgs = cropped_lines[n_start:n_end] + if indexer_b_s!=0: + imgs = cropped_lines[:] + cropped_lines = [] + indexer_b_s = 0 + pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged + + ####extracted_texts = [] + ####n_iterations = math.ceil(len(cropped_lines) / self.b_s) - extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + ####for i in range(n_iterations): + ####if i==(n_iterations-1): + ####n_start = i*self.b_s + ####imgs = cropped_lines[n_start:] + ####else: + ####n_start = i*self.b_s + ####n_end = (i+1)*self.b_s + ####imgs = cropped_lines[n_start:n_end] + ####pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values + ####generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) + ####generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + + ####extracted_texts = extracted_texts + generated_text_merged + + del cropped_lines + gc.collect() + + extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] #print(extracted_texts_merged, len(extracted_texts_merged)) unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) + + if dir_out_image_text: + + #font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists! + font = importlib_resources.files(__package__) / "Charis-Regular.ttf" + with importlib_resources.as_file(font) as font: + font = ImageFont.truetype(font=font, size=40) + + for indexer_text, bb_ind in enumerate(total_bb_coordinates): + + + x_bb = bb_ind[0] + y_bb = bb_ind[1] + w_bb = bb_ind[2] + h_bb = bb_ind[3] + + font = fit_text_single_line(draw, extracted_texts_merged[indexer_text], font.path, w_bb, int(h_bb*0.4) ) + + ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2) + + text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font) + text_width = text_bbox[2] - text_bbox[0] + text_height = text_bbox[3] - text_bbox[1] + + text_x = x_bb + (w_bb - text_width) // 2 # Center horizontally + text_y = y_bb + (h_bb - text_height) // 2 # Center vertically + + # Draw the text + draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font) + image_text.save(out_image_with_text) #print(len(unique_cropped_lines_region_indexer), 'unique_cropped_lines_region_indexer') + #######text_by_textregion = [] + #######for ind in unique_cropped_lines_region_indexer: + #######extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] + + #######text_by_textregion.append(" ".join(extracted_texts_merged_un)) + text_by_textregion = [] for ind in unique_cropped_lines_region_indexer: extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] - - text_by_textregion.append(" ".join(extracted_texts_merged_un)) - - #print(len(text_by_textregion) , indexer_text_region, "text_by_textregion") - - - #print(time.time() - t0 ,'elapsed time') - - + if len(extracted_texts_merged_un)>1: + text_by_textregion_ind = "" + next_glue = "" + for indt in range(len(extracted_texts_merged_un)): + if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-') or extracted_texts_merged_un[indt].endswith('¬'): + text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1] + next_glue = "" + else: + text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt] + next_glue = " " + text_by_textregion.append(text_by_textregion_ind) + + else: + text_by_textregion.append(" ".join(extracted_texts_merged_un)) + + indexer = 0 indexer_textregion = 0 for nn in root1.iter(region_tags): - text_subelement_textregion = ET.SubElement(nn, 'TextEquiv') - unicode_textregion = ET.SubElement(text_subelement_textregion, 'Unicode') + #id_textregion = nn.attrib['id'] + #id_textregions.append(id_textregion) + #textregions_by_existing_ids.append(text_by_textregion[indexer_textregion]) + + is_textregion_text = False + for childtest in nn: + if childtest.tag.endswith("TextEquiv"): + is_textregion_text = True + + if not is_textregion_text: + text_subelement_textregion = ET.SubElement(nn, 'TextEquiv') + unicode_textregion = ET.SubElement(text_subelement_textregion, 'Unicode') has_textline = False for child_textregion in nn: if child_textregion.tag.endswith("TextLine"): - text_subelement = ET.SubElement(child_textregion, 'TextEquiv') - unicode_textline = ET.SubElement(text_subelement, 'Unicode') - unicode_textline.text = extracted_texts_merged[indexer] + + is_textline_text = False + for childtest2 in child_textregion: + if childtest2.tag.endswith("TextEquiv"): + is_textline_text = True + + + if not is_textline_text: + text_subelement = ET.SubElement(child_textregion, 'TextEquiv') + ##text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") + unicode_textline = ET.SubElement(text_subelement, 'Unicode') + unicode_textline.text = extracted_texts_merged[indexer] + else: + for childtest3 in child_textregion: + if childtest3.tag.endswith("TextEquiv"): + for child_uc in childtest3: + if child_uc.tag.endswith("Unicode"): + ##childtest3.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") + child_uc.text = extracted_texts_merged[indexer] + indexer = indexer + 1 has_textline = True if has_textline: - unicode_textregion.text = text_by_textregion[indexer_textregion] + if is_textregion_text: + for child4 in nn: + if child4.tag.endswith("TextEquiv"): + for childtr_uc in child4: + if childtr_uc.tag.endswith("Unicode"): + childtr_uc.text = text_by_textregion[indexer_textregion] + else: + unicode_textregion.text = text_by_textregion[indexer_textregion] indexer_textregion = indexer_textregion + 1 - - + ###sample_order = [(id_to_order[tid], text) for tid, text in zip(id_textregions, textregions_by_existing_ids) if tid in id_to_order] + + ##ordered_texts_sample = [text for _, text in sorted(sample_order)] + ##tot_page_text = ' '.join(ordered_texts_sample) + + ##for page_element in root1.iter(link+'Page'): + ##text_page = ET.SubElement(page_element, 'TextEquiv') + ##unicode_textpage = ET.SubElement(text_page, 'Unicode') + ##unicode_textpage.text = tot_page_text + ET.register_namespace("",name_space) tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) - #print("Job done in %.1fs", time.time() - t0) else: - max_len = 512 - padding_token = 299 + ###max_len = 280#512#280#512 + ###padding_token = 1500#299#1500#299 image_width = 512#max_len * 4 image_height = 32 - b_s = 8 img_size=(image_width, image_height) - for ind_img in ls_imgs: - t0 = time.time() - file_name = ind_img.split('.')[0] - dir_img = os.path.join(self.dir_in, ind_img) - dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') - out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') + for dir_img in ls_imgs: + file_name = Path(dir_img).stem + dir_xml = os.path.join(dir_xmls, file_name+'.xml') + out_file_ocr = os.path.join(dir_out, file_name+'.xml') + + if os.path.exists(out_file_ocr): + if overwrite: + self.logger.warning("will overwrite existing output file '%s'", out_file_ocr) + else: + self.logger.warning("will skip input for existing output file '%s'", out_file_ocr) + continue + img = cv2.imread(dir_img) - if self.prediction_with_both_of_rgb_and_bin: + if dir_in_bin is not None: cropped_lines_bin = [] - dir_img_bin = os.path.join(self.dir_in_bin, file_name+'.png') + dir_img_bin = os.path.join(dir_in_bin, file_name+'.png') img_bin = cv2.imread(dir_img_bin) - if self.draw_texts_on_image: - out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png') + if dir_out_image_text: + out_image_with_text = os.path.join(dir_out_image_text, file_name+'.png') image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white") draw = ImageDraw.Draw(image_text) total_bb_coordinates = [] @@ -5238,6 +5808,7 @@ class Eynollah_ocr: region_tags=np.unique([x for x in alltags if x.endswith('TextRegion')]) cropped_lines = [] + cropped_lines_ver_index = [] cropped_lines_region_indexer = [] cropped_lines_meging_indexing = [] @@ -5245,6 +5816,10 @@ class Eynollah_ocr: indexer_text_region = 0 indexer_textlines = 0 for nn in root1.iter(region_tags): + try: + type_textregion = nn.attrib['type'] + except: + type_textregion = 'paragraph' for child_textregion in nn: if child_textregion.tag.endswith("TextLine"): for child_textlines in child_textregion: @@ -5255,126 +5830,324 @@ class Eynollah_ocr: x,y,w,h = cv2.boundingRect(textline_coords) - if self.draw_texts_on_image: - total_bb_coordinates.append([x,y,w,h]) + angle_radians = math.atan2(h, w) + # Convert to degrees + angle_degrees = math.degrees(angle_radians) + if type_textregion=='drop-capital': + angle_degrees = 0 + if dir_out_image_text: + total_bb_coordinates.append([x,y,w,h]) + w_scaled = w * image_height/float(h) img_poly_on_img = np.copy(img) - if self.prediction_with_both_of_rgb_and_bin: + if dir_in_bin is not None: img_poly_on_img_bin = np.copy(img_bin) img_crop_bin = img_poly_on_img_bin[y:y+h, x:x+w, :] mask_poly = np.zeros(img.shape) mask_poly = cv2.fillPoly(mask_poly, pts=[textline_coords], color=(1, 1, 1)) + mask_poly = mask_poly[y:y+h, x:x+w, :] img_crop = img_poly_on_img[y:y+h, x:x+w, :] - if not self.do_not_mask_with_textline_contour: - img_crop[mask_poly==0] = 255 - if self.prediction_with_both_of_rgb_and_bin: - img_crop_bin[mask_poly==0] = 255 + + if self.export_textline_images_and_text: + if not self.do_not_mask_with_textline_contour: + img_crop[mask_poly==0] = 255 + + else: + #print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') + + if angle_degrees > 3: + better_des_slope = get_orientation_moments(textline_coords) + + img_crop = rotate_image_with_padding(img_crop, better_des_slope ) + + if dir_in_bin is not None: + img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope ) + + mask_poly = rotate_image_with_padding(mask_poly, better_des_slope ) + mask_poly = mask_poly.astype('uint8') + + #new bounding box + x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0]) + + mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] + + if not self.do_not_mask_with_textline_contour: + img_crop[mask_poly==0] = 255 + + if dir_in_bin is not None: + img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :] + if not self.do_not_mask_with_textline_contour: + img_crop_bin[mask_poly==0] = 255 + + if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90: + if dir_in_bin is not None: + img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) + else: + img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + + + else: + better_des_slope = 0 + if not self.do_not_mask_with_textline_contour: + img_crop[mask_poly==0] = 255 + if dir_in_bin is not None: + if not self.do_not_mask_with_textline_contour: + img_crop_bin[mask_poly==0] = 255 + if type_textregion=='drop-capital': + pass + else: + if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90: + if dir_in_bin is not None: + img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) + else: + img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) if not self.export_textline_images_and_text: - if w_scaled < 1.5*image_width: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + if w_scaled < 750:#1.5*image_width: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) + if abs(better_des_slope) > 45: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + cropped_lines_meging_indexing.append(0) - if self.prediction_with_both_of_rgb_and_bin: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) + if dir_in_bin is not None: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) cropped_lines_bin.append(img_fin) else: - if self.prediction_with_both_of_rgb_and_bin: - splited_images, splited_images_bin = self.return_textlines_split_if_needed(img_crop, img_crop_bin) - else: - splited_images, splited_images_bin = self.return_textlines_split_if_needed(img_crop, None) + splited_images, splited_images_bin = return_textlines_split_if_needed( + img_crop, img_crop_bin if dir_in_bin is not None else None) if splited_images: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(1) - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) + + if abs(better_des_slope) > 45: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(-1) - if self.prediction_with_both_of_rgb_and_bin: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[0], image_height, image_width) + if abs(better_des_slope) > 45: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + + if dir_in_bin is not None: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[0], image_height, image_width) cropped_lines_bin.append(img_fin) - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[1], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[1], image_height, image_width) cropped_lines_bin.append(img_fin) else: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(0) - if self.prediction_with_both_of_rgb_and_bin: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) + if abs(better_des_slope) > 45: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + + if dir_in_bin is not None: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) cropped_lines_bin.append(img_fin) if self.export_textline_images_and_text: - if child_textlines.tag.endswith("TextEquiv"): - for cheild_text in child_textlines: - if cheild_text.tag.endswith("Unicode"): - textline_text = cheild_text.text - if textline_text: - with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file: - text_file.write(textline_text) - - cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop ) - - indexer_textlines+=1 + if img_crop.shape[0]==0 or img_crop.shape[1]==0: + pass + else: + if child_textlines.tag.endswith("TextEquiv"): + for cheild_text in child_textlines: + if cheild_text.tag.endswith("Unicode"): + textline_text = cheild_text.text + if textline_text: + base_name = os.path.join(dir_out, file_name + '_line_' + str(indexer_textlines)) + if self.pref_of_dataset: + base_name += '_' + self.pref_of_dataset + if not self.do_not_mask_with_textline_contour: + base_name += '_masked' + + with open(base_name + '.txt', 'w') as text_file: + text_file.write(textline_text) + cv2.imwrite(base_name + '.png', img_crop) + indexer_textlines+=1 if not self.export_textline_images_and_text: indexer_text_region = indexer_text_region +1 if not self.export_textline_images_and_text: extracted_texts = [] + extracted_conf_value = [] - n_iterations = math.ceil(len(cropped_lines) / b_s) + n_iterations = math.ceil(len(cropped_lines) / self.b_s) for i in range(n_iterations): if i==(n_iterations-1): - n_start = i*b_s + n_start = i*self.b_s imgs = cropped_lines[n_start:] imgs = np.array(imgs) imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3) - if self.prediction_with_both_of_rgb_and_bin: + + ver_imgs = np.array( cropped_lines_ver_index[n_start:] ) + indices_ver = np.where(ver_imgs == 1)[0] + + #print(indices_ver, 'indices_ver') + if len(indices_ver)>0: + imgs_ver_flipped = imgs[indices_ver, : ,: ,:] + imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + + else: + imgs_ver_flipped = None + + if dir_in_bin is not None: imgs_bin = cropped_lines_bin[n_start:] imgs_bin = np.array(imgs_bin) imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3) + + if len(indices_ver)>0: + imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:] + imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + + else: + imgs_bin_ver_flipped = None else: - n_start = i*b_s - n_end = (i+1)*b_s + n_start = i*self.b_s + n_end = (i+1)*self.b_s imgs = cropped_lines[n_start:n_end] - imgs = np.array(imgs).reshape(b_s, image_height, image_width, 3) + imgs = np.array(imgs).reshape(self.b_s, image_height, image_width, 3) - if self.prediction_with_both_of_rgb_and_bin: + ver_imgs = np.array( cropped_lines_ver_index[n_start:n_end] ) + indices_ver = np.where(ver_imgs == 1)[0] + #print(indices_ver, 'indices_ver') + + if len(indices_ver)>0: + imgs_ver_flipped = imgs[indices_ver, : ,: ,:] + imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + else: + imgs_ver_flipped = None + + + if dir_in_bin is not None: imgs_bin = cropped_lines_bin[n_start:n_end] - imgs_bin = np.array(imgs_bin).reshape(b_s, image_height, image_width, 3) + imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3) + + + if len(indices_ver)>0: + imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:] + imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + else: + imgs_bin_ver_flipped = None + self.logger.debug("processing next %d lines", len(imgs)) preds = self.prediction_model.predict(imgs, verbose=0) - if self.prediction_with_both_of_rgb_and_bin: + + if len(indices_ver)>0: + preds_flipped = self.prediction_model.predict(imgs_ver_flipped, verbose=0) + preds_max_fliped = np.max(preds_flipped, axis=2 ) + preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) + pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character + masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) + masked_means_flipped[np.isnan(masked_means_flipped)] = 0 + + preds_max = np.max(preds, axis=2 ) + preds_max_args = np.argmax(preds, axis=2 ) + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character + + masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) + masked_means[np.isnan(masked_means)] = 0 + + masked_means_ver = masked_means[indices_ver] + #print(masked_means_ver, 'pred_max_not_unk') + + indices_where_flipped_conf_value_is_higher = np.where(masked_means_flipped > masked_means_ver)[0] + + #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher') + if len(indices_where_flipped_conf_value_is_higher)>0: + indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher] + preds[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] + if dir_in_bin is not None: preds_bin = self.prediction_model.predict(imgs_bin, verbose=0) + + if len(indices_ver)>0: + preds_flipped = self.prediction_model.predict(imgs_bin_ver_flipped, verbose=0) + preds_max_fliped = np.max(preds_flipped, axis=2 ) + preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) + pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character + masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) + masked_means_flipped[np.isnan(masked_means_flipped)] = 0 + + preds_max = np.max(preds, axis=2 ) + preds_max_args = np.argmax(preds, axis=2 ) + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character + + masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) + masked_means[np.isnan(masked_means)] = 0 + + masked_means_ver = masked_means[indices_ver] + #print(masked_means_ver, 'pred_max_not_unk') + + indices_where_flipped_conf_value_is_higher = np.where(masked_means_flipped > masked_means_ver)[0] + + #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher') + if len(indices_where_flipped_conf_value_is_higher)>0: + indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher] + preds_bin[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] + preds = (preds + preds_bin) / 2. + - pred_texts = self.decode_batch_predictions(preds) + pred_texts = decode_batch_predictions(preds, self.num_to_char) + + preds_max = np.max(preds, axis=2 ) + preds_max_args = np.argmax(preds, axis=2 ) + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character + masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) for ib in range(imgs.shape[0]): pred_texts_ib = pred_texts[ib].replace("[UNK]", "") - extracted_texts.append(pred_texts_ib) - + if masked_means[ib] >= self.min_conf_value_of_textline_text: + extracted_texts.append(pred_texts_ib) + extracted_conf_value.append(masked_means[ib]) + else: + extracted_texts.append("") + extracted_conf_value.append(0) + del cropped_lines + if dir_in_bin is not None: + del cropped_lines_bin + gc.collect() + extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + + extracted_conf_value_merged = [extracted_conf_value[ind] if cropped_lines_meging_indexing[ind]==0 else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2. if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + extracted_conf_value_merged = [extracted_conf_value_merged[ind_cfm] for ind_cfm in range(len(extracted_texts_merged)) if extracted_texts_merged[ind_cfm] is not None] extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) - if self.draw_texts_on_image: + if dir_out_image_text: - font_path = "NotoSans-Regular.ttf" # Make sure this file exists! - font = ImageFont.truetype(font_path, 40) + #font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists! + font = importlib_resources.files(__package__) / "Charis-Regular.ttf" + with importlib_resources.as_file(font) as font: + font = ImageFont.truetype(font=font, size=40) for indexer_text, bb_ind in enumerate(total_bb_coordinates): @@ -5384,7 +6157,7 @@ class Eynollah_ocr: w_bb = bb_ind[2] h_bb = bb_ind[3] - font = self.fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) ) + font = fit_text_single_line(draw, extracted_texts_merged[indexer_text], font.path, w_bb, int(h_bb*0.4) ) ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2) @@ -5402,11 +6175,40 @@ class Eynollah_ocr: text_by_textregion = [] for ind in unique_cropped_lines_region_indexer: extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] - text_by_textregion.append("".join(extracted_texts_merged_un)) + if len(extracted_texts_merged_un)>1: + text_by_textregion_ind = "" + next_glue = "" + for indt in range(len(extracted_texts_merged_un)): + if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-') or extracted_texts_merged_un[indt].endswith('¬'): + text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1] + next_glue = "" + else: + text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt] + next_glue = " " + text_by_textregion.append(text_by_textregion_ind) + + else: + text_by_textregion.append(" ".join(extracted_texts_merged_un)) + #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion') + + ###index_tot_regions = [] + ###tot_region_ref = [] + + ###for jj in root1.iter(link+'RegionRefIndexed'): + ###index_tot_regions.append(jj.attrib['index']) + ###tot_region_ref.append(jj.attrib['regionRef']) + + ###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)} + + #id_textregions = [] + #textregions_by_existing_ids = [] indexer = 0 indexer_textregion = 0 for nn in root1.iter(region_tags): + #id_textregion = nn.attrib['id'] + #id_textregions.append(id_textregion) + #textregions_by_existing_ids.append(text_by_textregion[indexer_textregion]) is_textregion_text = False for childtest in nn: @@ -5430,6 +6232,7 @@ class Eynollah_ocr: if not is_textline_text: text_subelement = ET.SubElement(child_textregion, 'TextEquiv') + text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") unicode_textline = ET.SubElement(text_subelement, 'Unicode') unicode_textline.text = extracted_texts_merged[indexer] else: @@ -5437,6 +6240,7 @@ class Eynollah_ocr: if childtest3.tag.endswith("TextEquiv"): for child_uc in childtest3: if child_uc.tag.endswith("Unicode"): + childtest3.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") child_uc.text = extracted_texts_merged[indexer] indexer = indexer + 1 @@ -5451,7 +6255,17 @@ class Eynollah_ocr: else: unicode_textregion.text = text_by_textregion[indexer_textregion] indexer_textregion = indexer_textregion + 1 - + + ###sample_order = [(id_to_order[tid], text) for tid, text in zip(id_textregions, textregions_by_existing_ids) if tid in id_to_order] + + ##ordered_texts_sample = [text for _, text in sorted(sample_order)] + ##tot_page_text = ' '.join(ordered_texts_sample) + + ##for page_element in root1.iter(link+'Page'): + ##text_page = ET.SubElement(page_element, 'TextEquiv') + ##unicode_textpage = ET.SubElement(text_page, 'Unicode') + ##unicode_textpage.text = tot_page_text + ET.register_namespace("",name_space) tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) #print("Job done in %.1fs", time.time() - t0) diff --git a/src/eynollah/image_enhancer.py b/src/eynollah/image_enhancer.py new file mode 100644 index 0000000..89dde16 --- /dev/null +++ b/src/eynollah/image_enhancer.py @@ -0,0 +1,731 @@ +""" +Image enhancer. The output can be written as same scale of input or in new predicted scale. +""" + +from logging import Logger +import os +import time +from typing import Optional +import atexit +from functools import partial +from pathlib import Path +from multiprocessing import cpu_count +import gc +import cv2 +import numpy as np +from ocrd_utils import getLogger, tf_disable_interactive_logs +import tensorflow as tf +from skimage.morphology import skeletonize +from tensorflow.keras.models import load_model +from .utils.resize import resize_image +from .utils.pil_cv2 import pil2cv +from .utils import ( + is_image_filename, + crop_image_inside_box +) + +DPI_THRESHOLD = 298 +KERNEL = np.ones((5, 5), np.uint8) + + +class Enhancer: + def __init__( + self, + dir_models : str, + num_col_upper : Optional[int] = None, + num_col_lower : Optional[int] = None, + save_org_scale : bool = False, + logger : Optional[Logger] = None, + ): + self.input_binary = False + self.light_version = False + self.save_org_scale = save_org_scale + if num_col_upper: + self.num_col_upper = int(num_col_upper) + else: + self.num_col_upper = num_col_upper + if num_col_lower: + self.num_col_lower = int(num_col_lower) + else: + self.num_col_lower = num_col_lower + + self.logger = logger if logger else getLogger('enhancement') + self.dir_models = dir_models + self.model_dir_of_binarization = dir_models + "/eynollah-binarization_20210425" + self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" + self.model_dir_of_col_classifier = dir_models + "/eynollah-column-classifier_20210425" + self.model_page_dir = dir_models + "/model_eynollah_page_extraction_20250915" + + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + self.logger.warning("no GPU device available") + + self.model_page = self.our_load_model(self.model_page_dir) + self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier) + self.model_enhancement = self.our_load_model(self.model_dir_of_enhancement) + self.model_bin = self.our_load_model(self.model_dir_of_binarization) + + def cache_images(self, image_filename=None, image_pil=None, dpi=None): + ret = {} + if image_filename: + ret['img'] = cv2.imread(image_filename) + if self.light_version: + self.dpi = 100 + else: + self.dpi = 0#check_dpi(image_filename) + else: + ret['img'] = pil2cv(image_pil) + if self.light_version: + self.dpi = 100 + else: + self.dpi = 0#check_dpi(image_pil) + ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY) + for prefix in ('', '_grayscale'): + ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8) + self._imgs = ret + if dpi is not None: + self.dpi = dpi + + def reset_file_name_dir(self, image_filename, dir_out): + self.cache_images(image_filename=image_filename) + self.output_filename = os.path.join(dir_out, Path(image_filename).stem +'.png') + + def imread(self, grayscale=False, uint8=True): + key = 'img' + if grayscale: + key += '_grayscale' + if uint8: + key += '_uint8' + return self._imgs[key].copy() + + def isNaN(self, num): + return num != num + + @staticmethod + def our_load_model(model_file): + if model_file.endswith('.h5') and Path(model_file[:-3]).exists(): + # prefer SavedModel over HDF5 format if it exists + model_file = model_file[:-3] + try: + model = load_model(model_file, compile=False) + except: + model = load_model(model_file, compile=False, custom_objects={ + "PatchEncoder": PatchEncoder, "Patches": Patches}) + return model + + def predict_enhancement(self, img): + self.logger.debug("enter predict_enhancement") + + img_height_model = self.model_enhancement.layers[-1].output_shape[1] + img_width_model = self.model_enhancement.layers[-1].output_shape[2] + if img.shape[0] < img_height_model: + img = cv2.resize(img, (img.shape[1], img_width_model), interpolation=cv2.INTER_NEAREST) + if img.shape[1] < img_width_model: + img = cv2.resize(img, (img_height_model, img.shape[0]), interpolation=cv2.INTER_NEAREST) + margin = int(0.1 * img_width_model) + width_mid = img_width_model - 2 * margin + height_mid = img_height_model - 2 * margin + img = img / 255. + img_h = img.shape[0] + img_w = img.shape[1] + + prediction_true = np.zeros((img_h, img_w, 3)) + nxf = img_w / float(width_mid) + nyf = img_h / float(height_mid) + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + + for i in range(nxf): + for j in range(nyf): + if i == 0: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + else: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + if j == 0: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + else: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + + if index_x_u > img_w: + index_x_u = img_w + index_x_d = img_w - img_width_model + if index_y_u > img_h: + index_y_u = img_h + index_y_d = img_h - img_height_model + + img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :] + label_p_pred = self.model_enhancement.predict(img_patch, verbose=0) + seg = label_p_pred[0, :, :, :] * 255 + + if i == 0 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + 0:index_x_u - margin] = \ + seg[0:-margin or None, + 0:-margin or None] + elif i == nxf - 1 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + margin:index_x_u - 0] = \ + seg[margin:, + margin:] + elif i == 0 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + 0:index_x_u - margin] = \ + seg[margin:, + 0:-margin or None] + elif i == nxf - 1 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + margin:index_x_u - 0] = \ + seg[0:-margin or None, + margin:] + elif i == 0 and j != 0 and j != nyf - 1: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + 0:index_x_u - margin] = \ + seg[margin:-margin or None, + 0:-margin or None] + elif i == nxf - 1 and j != 0 and j != nyf - 1: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + margin:index_x_u - 0] = \ + seg[margin:-margin or None, + margin:] + elif i != 0 and i != nxf - 1 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + margin:index_x_u - margin] = \ + seg[0:-margin or None, + margin:-margin or None] + elif i != 0 and i != nxf - 1 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + margin:index_x_u - margin] = \ + seg[margin:, + margin:-margin or None] + else: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + margin:index_x_u - margin] = \ + seg[margin:-margin or None, + margin:-margin or None] + + prediction_true = prediction_true.astype(int) + return prediction_true + + def calculate_width_height_by_columns(self, img, num_col, width_early, label_p_pred): + self.logger.debug("enter calculate_width_height_by_columns") + if num_col == 1: + img_w_new = 2000 + elif num_col == 2: + img_w_new = 2400 + elif num_col == 3: + img_w_new = 3000 + elif num_col == 4: + img_w_new = 4000 + elif num_col == 5: + img_w_new = 5000 + elif num_col == 6: + img_w_new = 6500 + else: + img_w_new = width_early + img_h_new = img_w_new * img.shape[0] // img.shape[1] + + if img_h_new >= 8000: + img_new = np.copy(img) + num_column_is_classified = False + else: + img_new = resize_image(img, img_h_new, img_w_new) + num_column_is_classified = True + + return img_new, num_column_is_classified + + def early_page_for_num_of_column_classification(self,img_bin): + self.logger.debug("enter early_page_for_num_of_column_classification") + if self.input_binary: + img = np.copy(img_bin).astype(np.uint8) + else: + img = self.imread() + img = cv2.GaussianBlur(img, (5, 5), 0) + img_page_prediction = self.do_prediction(False, img, self.model_page) + + imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) + thresh = cv2.dilate(thresh, KERNEL, iterations=3) + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + if len(contours)>0: + cnt_size = np.array([cv2.contourArea(contours[j]) + for j in range(len(contours))]) + cnt = contours[np.argmax(cnt_size)] + box = cv2.boundingRect(cnt) + else: + box = [0, 0, img.shape[1], img.shape[0]] + cropped_page, page_coord = crop_image_inside_box(box, img) + + self.logger.debug("exit early_page_for_num_of_column_classification") + return cropped_page, page_coord + + def calculate_width_height_by_columns_1_2(self, img, num_col, width_early, label_p_pred): + self.logger.debug("enter calculate_width_height_by_columns") + if num_col == 1: + img_w_new = 1000 + else: + img_w_new = 1300 + img_h_new = img_w_new * img.shape[0] // img.shape[1] + + if label_p_pred[0][int(num_col - 1)] < 0.9 and img_w_new < width_early: + img_new = np.copy(img) + num_column_is_classified = False + #elif label_p_pred[0][int(num_col - 1)] < 0.8 and img_h_new >= 8000: + elif img_h_new >= 8000: + img_new = np.copy(img) + num_column_is_classified = False + else: + img_new = resize_image(img, img_h_new, img_w_new) + num_column_is_classified = True + + return img_new, num_column_is_classified + + def resize_and_enhance_image_with_column_classifier(self, light_version): + self.logger.debug("enter resize_and_enhance_image_with_column_classifier") + dpi = 0#self.dpi + self.logger.info("Detected %s DPI", dpi) + if self.input_binary: + img = self.imread() + prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5) + prediction_bin = 255 * (prediction_bin[:,:,0]==0) + prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8) + img= np.copy(prediction_bin) + img_bin = prediction_bin + else: + img = self.imread() + self.h_org, self.w_org = img.shape[:2] + img_bin = None + + width_early = img.shape[1] + t1 = time.time() + _, page_coord = self.early_page_for_num_of_column_classification(img_bin) + + self.image_page_org_size = img[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :] + self.page_coord = page_coord + + if self.num_col_upper and not self.num_col_lower: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + elif self.num_col_lower and not self.num_col_upper: + num_col = self.num_col_lower + label_p_pred = [np.ones(6)] + elif not self.num_col_upper and not self.num_col_lower: + if self.input_binary: + img_in = np.copy(img) + img_in = img_in / 255.0 + img_in = cv2.resize(img_in, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = img_in.reshape(1, 448, 448, 3) + else: + img_1ch = self.imread(grayscale=True) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = self.model_classifier.predict(img_in, verbose=0) + num_col = np.argmax(label_p_pred[0]) + 1 + elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower): + if self.input_binary: + img_in = np.copy(img) + img_in = img_in / 255.0 + img_in = cv2.resize(img_in, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = img_in.reshape(1, 448, 448, 3) + else: + img_1ch = self.imread(grayscale=True) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = self.model_classifier.predict(img_in, verbose=0) + num_col = np.argmax(label_p_pred[0]) + 1 + + if num_col > self.num_col_upper: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + if num_col < self.num_col_lower: + num_col = self.num_col_lower + label_p_pred = [np.ones(6)] + else: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + + self.logger.info("Found %d columns (%s)", num_col, np.around(label_p_pred, decimals=5)) + + if dpi < DPI_THRESHOLD: + if light_version and num_col in (1,2): + img_new, num_column_is_classified = self.calculate_width_height_by_columns_1_2( + img, num_col, width_early, label_p_pred) + else: + img_new, num_column_is_classified = self.calculate_width_height_by_columns( + img, num_col, width_early, label_p_pred) + if light_version: + image_res = np.copy(img_new) + else: + image_res = self.predict_enhancement(img_new) + is_image_enhanced = True + + else: + num_column_is_classified = True + image_res = np.copy(img) + is_image_enhanced = False + + self.logger.debug("exit resize_and_enhance_image_with_column_classifier") + return is_image_enhanced, img, image_res, num_col, num_column_is_classified, img_bin + def do_prediction( + self, patches, img, model, + n_batch_inference=1, marginal_of_patch_percent=0.1, + thresholding_for_some_classes_in_light_version=False, + thresholding_for_artificial_class_in_light_version=False, thresholding_for_fl_light_version=False, threshold_art_class_textline=0.1): + + self.logger.debug("enter do_prediction") + img_height_model = model.layers[-1].output_shape[1] + img_width_model = model.layers[-1].output_shape[2] + + if not patches: + img_h_page = img.shape[0] + img_w_page = img.shape[1] + img = img / float(255.0) + img = resize_image(img, img_height_model, img_width_model) + + label_p_pred = model.predict(img[np.newaxis], verbose=0) + seg = np.argmax(label_p_pred, axis=3)[0] + + if thresholding_for_artificial_class_in_light_version: + seg_art = label_p_pred[0,:,:,2] + + seg_art[seg_art0] =1 + + skeleton_art = skeletonize(seg_art) + skeleton_art = skeleton_art*1 + + seg[skeleton_art==1]=2 + + if thresholding_for_fl_light_version: + seg_header = label_p_pred[0,:,:,2] + + seg_header[seg_header<0.2] = 0 + seg_header[seg_header>0] =1 + + seg[seg_header==1]=2 + + seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) + prediction_true = resize_image(seg_color, img_h_page, img_w_page).astype(np.uint8) + return prediction_true + + if img.shape[0] < img_height_model: + img = resize_image(img, img_height_model, img.shape[1]) + if img.shape[1] < img_width_model: + img = resize_image(img, img.shape[0], img_width_model) + + self.logger.debug("Patch size: %sx%s", img_height_model, img_width_model) + margin = int(marginal_of_patch_percent * img_height_model) + width_mid = img_width_model - 2 * margin + height_mid = img_height_model - 2 * margin + img = img / 255. + #img = img.astype(np.float16) + img_h = img.shape[0] + img_w = img.shape[1] + prediction_true = np.zeros((img_h, img_w, 3)) + mask_true = np.zeros((img_h, img_w)) + nxf = img_w / float(width_mid) + nyf = img_h / float(height_mid) + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + + list_i_s = [] + list_j_s = [] + list_x_u = [] + list_x_d = [] + list_y_u = [] + list_y_d = [] + + batch_indexer = 0 + img_patch = np.zeros((n_batch_inference, img_height_model, img_width_model, 3)) + for i in range(nxf): + for j in range(nyf): + if i == 0: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + else: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + if j == 0: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + else: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + if index_x_u > img_w: + index_x_u = img_w + index_x_d = img_w - img_width_model + if index_y_u > img_h: + index_y_u = img_h + index_y_d = img_h - img_height_model + + list_i_s.append(i) + list_j_s.append(j) + list_x_u.append(index_x_u) + list_x_d.append(index_x_d) + list_y_d.append(index_y_d) + list_y_u.append(index_y_u) + + img_patch[batch_indexer,:,:,:] = img[index_y_d:index_y_u, index_x_d:index_x_u, :] + batch_indexer += 1 + + if (batch_indexer == n_batch_inference or + # last batch + i == nxf - 1 and j == nyf - 1): + self.logger.debug("predicting patches on %s", str(img_patch.shape)) + label_p_pred = model.predict(img_patch, verbose=0) + seg = np.argmax(label_p_pred, axis=3) + + if thresholding_for_some_classes_in_light_version: + seg_not_base = label_p_pred[:,:,:,4] + seg_not_base[seg_not_base>0.03] =1 + seg_not_base[seg_not_base<1] =0 + + seg_line = label_p_pred[:,:,:,3] + seg_line[seg_line>0.1] =1 + seg_line[seg_line<1] =0 + + seg_background = label_p_pred[:,:,:,0] + seg_background[seg_background>0.25] =1 + seg_background[seg_background<1] =0 + + seg[seg_not_base==1]=4 + seg[seg_background==1]=0 + seg[(seg_line==1) & (seg==0)]=3 + if thresholding_for_artificial_class_in_light_version: + seg_art = label_p_pred[:,:,:,2] + + seg_art[seg_art0] =1 + + ##seg[seg_art==1]=2 + + indexer_inside_batch = 0 + for i_batch, j_batch in zip(list_i_s, list_j_s): + seg_in = seg[indexer_inside_batch] + + if thresholding_for_artificial_class_in_light_version: + seg_in_art = seg_art[indexer_inside_batch] + + index_y_u_in = list_y_u[indexer_inside_batch] + index_y_d_in = list_y_d[indexer_inside_batch] + + index_x_u_in = list_x_u[indexer_inside_batch] + index_x_d_in = list_x_d[indexer_inside_batch] + + if i_batch == 0 and j_batch == 0: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin] = \ + seg_in[0:-margin or None, + 0:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + 0:-margin or None] + + elif i_batch == nxf - 1 and j_batch == nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0] = \ + seg_in[margin:, + margin:, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:, + margin:] + + elif i_batch == 0 and j_batch == nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin] = \ + seg_in[margin:, + 0:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + 0:-margin or None] + + elif i_batch == nxf - 1 and j_batch == 0: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0] = \ + seg_in[0:-margin or None, + margin:, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[0:-margin or None, + margin:] + + elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin] = \ + seg_in[margin:-margin or None, + 0:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + 0:-margin or None] + + elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0] = \ + seg_in[margin:-margin or None, + margin:, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:-margin or None, + margin:] + + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin] = \ + seg_in[0:-margin or None, + margin:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + margin:-margin or None] + + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin] = \ + seg_in[margin:, + margin:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + margin:-margin or None] + + else: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin] = \ + seg_in[margin:-margin or None, + margin:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + margin:-margin or None] + indexer_inside_batch += 1 + + + list_i_s = [] + list_j_s = [] + list_x_u = [] + list_x_d = [] + list_y_u = [] + list_y_d = [] + + batch_indexer = 0 + img_patch[:] = 0 + + prediction_true = prediction_true.astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + prediction_true[:,:,0][prediction_true[:,:,0]==2] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=2 + #del model + gc.collect() + return prediction_true + + def run_enhancement(self, light_version): + t_in = time.time() + self.logger.info("Resizing and enhancing image...") + is_image_enhanced, img_org, img_res, num_col_classifier, num_column_is_classified, img_bin = \ + self.resize_and_enhance_image_with_column_classifier(light_version) + + self.logger.info("Image was %senhanced.", '' if is_image_enhanced else 'not ') + return img_res, is_image_enhanced, num_col_classifier, num_column_is_classified + + + def run_single(self): + t0 = time.time() + img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(light_version=False) + + return img_res + + + def run(self, + overwrite: bool = False, + image_filename: Optional[str] = None, + dir_in: Optional[str] = None, + dir_out: Optional[str] = None, + ): + """ + Get image and scales, then extract the page of scanned image + """ + self.logger.debug("enter run") + t0_tot = time.time() + + if dir_in: + ls_imgs = [os.path.join(dir_in, image_filename) + for image_filename in filter(is_image_filename, + os.listdir(dir_in))] + elif image_filename: + ls_imgs = [image_filename] + else: + raise ValueError("run requires either a single image filename or a directory") + + for img_filename in ls_imgs: + self.logger.info(img_filename) + t0 = time.time() + + self.reset_file_name_dir(img_filename, dir_out) + #print("text region early -11 in %.1fs", time.time() - t0) + + if os.path.exists(self.output_filename): + if overwrite: + self.logger.warning("will overwrite existing output file '%s'", self.output_filename) + else: + self.logger.warning("will skip input for existing output file '%s'", self.output_filename) + continue + + image_enhanced = self.run_single() + if self.save_org_scale: + image_enhanced = resize_image(image_enhanced, self.h_org, self.w_org) + + cv2.imwrite(self.output_filename, image_enhanced) + diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py new file mode 100644 index 0000000..45db8e4 --- /dev/null +++ b/src/eynollah/mb_ro_on_layout.py @@ -0,0 +1,813 @@ +""" +Image enhancer. The output can be written as same scale of input or in new predicted scale. +""" + +from logging import Logger +import os +import time +from typing import Optional +import atexit +from functools import partial +from pathlib import Path +from multiprocessing import cpu_count +import xml.etree.ElementTree as ET +import cv2 +import numpy as np +from ocrd_utils import getLogger +import statistics +import tensorflow as tf +from tensorflow.keras.models import load_model +from .utils.resize import resize_image + +from .utils.contour import ( + find_new_features_of_contours, + return_contours_of_image, + return_parent_contours, +) +from .utils import is_xml_filename + +DPI_THRESHOLD = 298 +KERNEL = np.ones((5, 5), np.uint8) + + +class machine_based_reading_order_on_layout: + def __init__( + self, + dir_models : str, + logger : Optional[Logger] = None, + ): + self.logger = logger if logger else getLogger('mbreorder') + self.dir_models = dir_models + self.model_reading_order_dir = dir_models + "/model_eynollah_reading_order_20250824" + + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + self.logger.warning("no GPU device available") + + self.model_reading_order = self.our_load_model(self.model_reading_order_dir) + self.light_version = True + + @staticmethod + def our_load_model(model_file): + if model_file.endswith('.h5') and Path(model_file[:-3]).exists(): + # prefer SavedModel over HDF5 format if it exists + model_file = model_file[:-3] + try: + model = load_model(model_file, compile=False) + except: + model = load_model(model_file, compile=False, custom_objects={ + "PatchEncoder": PatchEncoder, "Patches": Patches}) + return model + + def read_xml(self, xml_file): + tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8')) + root1=tree1.getroot() + alltags=[elem.tag for elem in root1.iter()] + link=alltags[0].split('}')[0]+'}' + + index_tot_regions = [] + tot_region_ref = [] + + for jj in root1.iter(link+'Page'): + y_len=int(jj.attrib['imageHeight']) + x_len=int(jj.attrib['imageWidth']) + + for jj in root1.iter(link+'RegionRefIndexed'): + index_tot_regions.append(jj.attrib['index']) + tot_region_ref.append(jj.attrib['regionRef']) + + if (link+'PrintSpace' in alltags) or (link+'Border' in alltags): + co_printspace = [] + if link+'PrintSpace' in alltags: + region_tags_printspace = np.unique([x for x in alltags if x.endswith('PrintSpace')]) + elif link+'Border' in alltags: + region_tags_printspace = np.unique([x for x in alltags if x.endswith('Border')]) + + for tag in region_tags_printspace: + if link+'PrintSpace' in alltags: + tag_endings_printspace = ['}PrintSpace','}printspace'] + elif link+'Border' in alltags: + tag_endings_printspace = ['}Border','}border'] + + if tag.endswith(tag_endings_printspace[0]) or tag.endswith(tag_endings_printspace[1]): + for nn in root1.iter(tag): + c_t_in = [] + sumi = 0 + for vv in nn.iter(): + # check the format of coords + if vv.tag == link + 'Coords': + coords = bool(vv.attrib) + if coords: + p_h = vv.attrib['points'].split(' ') + c_t_in.append( + np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) + break + else: + pass + + if vv.tag == link + 'Point': + c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))]) + sumi += 1 + elif vv.tag != link + 'Point' and sumi >= 1: + break + co_printspace.append(np.array(c_t_in)) + img_printspace = np.zeros( (y_len,x_len,3) ) + img_printspace=cv2.fillPoly(img_printspace, pts =co_printspace, color=(1,1,1)) + img_printspace = img_printspace.astype(np.uint8) + + imgray = cv2.cvtColor(img_printspace, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) + cnt = contours[np.argmax(cnt_size)] + x, y, w, h = cv2.boundingRect(cnt) + + bb_coord_printspace = [x, y, w, h] + + else: + bb_coord_printspace = None + + + region_tags=np.unique([x for x in alltags if x.endswith('Region')]) + co_text_paragraph=[] + co_text_drop=[] + co_text_heading=[] + co_text_header=[] + co_text_marginalia=[] + co_text_catch=[] + co_text_page_number=[] + co_text_signature_mark=[] + co_sep=[] + co_img=[] + co_table=[] + co_graphic=[] + co_graphic_text_annotation=[] + co_graphic_decoration=[] + co_noise=[] + + co_text_paragraph_text=[] + co_text_drop_text=[] + co_text_heading_text=[] + co_text_header_text=[] + co_text_marginalia_text=[] + co_text_catch_text=[] + co_text_page_number_text=[] + co_text_signature_mark_text=[] + co_sep_text=[] + co_img_text=[] + co_table_text=[] + co_graphic_text=[] + co_graphic_text_annotation_text=[] + co_graphic_decoration_text=[] + co_noise_text=[] + + id_paragraph = [] + id_header = [] + id_heading = [] + id_marginalia = [] + + for tag in region_tags: + if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): + for nn in root1.iter(tag): + for child2 in nn: + tag2 = child2.tag + if tag2.endswith('}TextEquiv') or tag2.endswith('}TextEquiv'): + for childtext2 in child2: + if childtext2.tag.endswith('}Unicode') or childtext2.tag.endswith('}Unicode'): + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + co_text_drop_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='heading': + co_text_heading_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + co_text_signature_mark_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='header': + co_text_header_text.append(childtext2.text) + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###co_text_catch_text.append(childtext2.text) + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': + ###co_text_page_number_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + co_text_marginalia_text.append(childtext2.text) + else: + co_text_paragraph_text.append(childtext2.text) + c_t_in_drop=[] + c_t_in_paragraph=[] + c_t_in_heading=[] + c_t_in_header=[] + c_t_in_page_number=[] + c_t_in_signature_mark=[] + c_t_in_catch=[] + c_t_in_marginalia=[] + + + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + + coords=bool(vv.attrib) + if coords: + #print('birda1') + p_h=vv.attrib['points'].split(' ') + + + + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + + c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='heading': + ##id_heading.append(nn.attrib['id']) + c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + + c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + elif "type" in nn.attrib and nn.attrib['type']=='header': + #id_header.append(nn.attrib['id']) + c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': + + ###c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + #id_marginalia.append(nn.attrib['id']) + + c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + else: + #id_paragraph.append(nn.attrib['id']) + + c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + break + else: + pass + + + if vv.tag==link+'Point': + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + + c_t_in_drop.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='heading': + #id_heading.append(nn.attrib['id']) + c_t_in_heading.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + + c_t_in_signature_mark.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + elif "type" in nn.attrib and nn.attrib['type']=='header': + #id_header.append(nn.attrib['id']) + c_t_in_header.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###c_t_in_catch.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + ###sumi+=1 + + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': + + ###c_t_in_page_number.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + ###sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + #id_marginalia.append(nn.attrib['id']) + + c_t_in_marginalia.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + else: + #id_paragraph.append(nn.attrib['id']) + c_t_in_paragraph.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + + if len(c_t_in_drop)>0: + co_text_drop.append(np.array(c_t_in_drop)) + if len(c_t_in_paragraph)>0: + co_text_paragraph.append(np.array(c_t_in_paragraph)) + id_paragraph.append(nn.attrib['id']) + if len(c_t_in_heading)>0: + co_text_heading.append(np.array(c_t_in_heading)) + id_heading.append(nn.attrib['id']) + + if len(c_t_in_header)>0: + co_text_header.append(np.array(c_t_in_header)) + id_header.append(nn.attrib['id']) + if len(c_t_in_page_number)>0: + co_text_page_number.append(np.array(c_t_in_page_number)) + if len(c_t_in_catch)>0: + co_text_catch.append(np.array(c_t_in_catch)) + + if len(c_t_in_signature_mark)>0: + co_text_signature_mark.append(np.array(c_t_in_signature_mark)) + + if len(c_t_in_marginalia)>0: + co_text_marginalia.append(np.array(c_t_in_marginalia)) + id_marginalia.append(nn.attrib['id']) + + + elif tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): + for nn in root1.iter(tag): + c_t_in=[] + c_t_in_text_annotation=[] + c_t_in_decoration=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='decoration': + c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + else: + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + break + else: + pass + + + if vv.tag==link+'Point': + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + c_t_in_text_annotation.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='decoration': + c_t_in_decoration.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + else: + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + if len(c_t_in_text_annotation)>0: + co_graphic_text_annotation.append(np.array(c_t_in_text_annotation)) + if len(c_t_in_decoration)>0: + co_graphic_decoration.append(np.array(c_t_in_decoration)) + if len(c_t_in)>0: + co_graphic.append(np.array(c_t_in)) + + + + elif tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + elif vv.tag!=link+'Point' and sumi>=1: + break + co_img.append(np.array(c_t_in)) + co_img_text.append(' ') + + + elif tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + elif vv.tag!=link+'Point' and sumi>=1: + break + co_sep.append(np.array(c_t_in)) + + + + elif tag.endswith('}TableRegion') or tag.endswith('}tableregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + co_table.append(np.array(c_t_in)) + co_table_text.append(' ') + + elif tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + co_noise.append(np.array(c_t_in)) + co_noise_text.append(' ') + + img = np.zeros( (y_len,x_len,3) ) + img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(1,1,1)) + + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(2,2,2)) + img_poly=cv2.fillPoly(img, pts =co_text_header, color=(2,2,2)) + img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(3,3,3)) + img_poly=cv2.fillPoly(img, pts =co_img, color=(4,4,4)) + img_poly=cv2.fillPoly(img, pts =co_sep, color=(5,5,5)) + + return tree1, root1, bb_coord_printspace, id_paragraph, id_header+id_heading, co_text_paragraph, co_text_header+co_text_heading,\ + tot_region_ref,x_len, y_len,index_tot_regions, img_poly + + def return_indexes_of_contours_loctaed_inside_another_list_of_contours(self, contours, contours_loc, cx_main_loc, cy_main_loc, indexes_loc): + indexes_of_located_cont = [] + center_x_coordinates_of_located = [] + center_y_coordinates_of_located = [] + #M_main_tot = [cv2.moments(contours_loc[j]) + #for j in range(len(contours_loc))] + #cx_main_loc = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + #cy_main_loc = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + + for ij in range(len(contours)): + results = [cv2.pointPolygonTest(contours[ij], (cx_main_loc[ind], cy_main_loc[ind]), False) + for ind in range(len(cy_main_loc)) ] + results = np.array(results) + indexes_in = np.where((results == 0) | (results == 1)) + indexes = indexes_loc[indexes_in]# [(results == 0) | (results == 1)]#np.where((results == 0) | (results == 1)) + + indexes_of_located_cont.append(indexes) + center_x_coordinates_of_located.append(np.array(cx_main_loc)[indexes_in] ) + center_y_coordinates_of_located.append(np.array(cy_main_loc)[indexes_in] ) + + return indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located + + def do_order_of_regions_with_model(self, contours_only_text_parent, contours_only_text_parent_h, text_regions_p): + height1 =672#448 + width1 = 448#224 + + height2 =672#448 + width2= 448#224 + + height3 =672#448 + width3 = 448#224 + + inference_bs = 3 + + ver_kernel = np.ones((5, 1), dtype=np.uint8) + hor_kernel = np.ones((1, 5), dtype=np.uint8) + + + min_cont_size_to_be_dilated = 10 + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + cx_conts, cy_conts, x_min_conts, x_max_conts, y_min_conts, y_max_conts, _ = find_new_features_of_contours(contours_only_text_parent) + args_cont_located = np.array(range(len(contours_only_text_parent))) + + diff_y_conts = np.abs(y_max_conts[:]-y_min_conts) + diff_x_conts = np.abs(x_max_conts[:]-x_min_conts) + + mean_x = statistics.mean(diff_x_conts) + median_x = statistics.median(diff_x_conts) + + + diff_x_ratio= diff_x_conts/mean_x + + args_cont_located_excluded = args_cont_located[diff_x_ratio>=1.3] + args_cont_located_included = args_cont_located[diff_x_ratio<1.3] + + contours_only_text_parent_excluded = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]>=1.3]#contours_only_text_parent[diff_x_ratio>=1.3] + contours_only_text_parent_included = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]<1.3]#contours_only_text_parent[diff_x_ratio<1.3] + + + cx_conts_excluded = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]>=1.3]#cx_conts[diff_x_ratio>=1.3] + cx_conts_included = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]<1.3]#cx_conts[diff_x_ratio<1.3] + + cy_conts_excluded = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]>=1.3]#cy_conts[diff_x_ratio>=1.3] + cy_conts_included = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]<1.3]#cy_conts[diff_x_ratio<1.3] + + #print(diff_x_ratio, 'ratio') + text_regions_p = text_regions_p.astype('uint8') + + if len(contours_only_text_parent_excluded)>0: + textregion_par = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1])).astype('uint8') + textregion_par = cv2.fillPoly(textregion_par, pts=contours_only_text_parent_included, color=(1,1)) + else: + textregion_par = (text_regions_p[:,:]==1)*1 + textregion_par = textregion_par.astype('uint8') + + text_regions_p_textregions_dilated = cv2.erode(textregion_par , hor_kernel, iterations=2) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=4) + text_regions_p_textregions_dilated = cv2.erode(text_regions_p_textregions_dilated , hor_kernel, iterations=1) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=5) + text_regions_p_textregions_dilated[text_regions_p[:,:]>1] = 0 + + + contours_only_dilated, hir_on_text_dilated = return_contours_of_image(text_regions_p_textregions_dilated) + contours_only_dilated = return_parent_contours(contours_only_dilated, hir_on_text_dilated) + + indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located = self.return_indexes_of_contours_loctaed_inside_another_list_of_contours(contours_only_dilated, contours_only_text_parent_included, cx_conts_included, cy_conts_included, args_cont_located_included) + + + if len(args_cont_located_excluded)>0: + for ind in args_cont_located_excluded: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + #print(len( np.unique(flattened_array)), 'indexes_of_located_cont uniques') + + missing_textregions = list( set(np.array(range(len(contours_only_text_parent))) ) - set(np.unique(flattened_array)) ) + #print(missing_textregions, 'missing_textregions') + + for ind in missing_textregions: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + + if contours_only_text_parent_h: + for vi in range(len(contours_only_text_parent_h)): + indexes_of_located_cont.append(int(vi+len(contours_only_text_parent))) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + + y_len = text_regions_p.shape[0] + x_len = text_regions_p.shape[1] + + img_poly = np.zeros((y_len,x_len), dtype='uint8') + ###img_poly[text_regions_p[:,:]==1] = 1 + ###img_poly[text_regions_p[:,:]==2] = 2 + ###img_poly[text_regions_p[:,:]==3] = 4 + ###img_poly[text_regions_p[:,:]==6] = 5 + + ##img_poly[text_regions_p[:,:]==1] = 1 + ##img_poly[text_regions_p[:,:]==2] = 2 + ##img_poly[text_regions_p[:,:]==3] = 3 + ##img_poly[text_regions_p[:,:]==4] = 4 + ##img_poly[text_regions_p[:,:]==5] = 5 + + img_poly = np.copy(text_regions_p) + + img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') + if contours_only_text_parent_h: + _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours( + contours_only_text_parent_h) + for j in range(len(cy_main)): + img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12, + int(x_min_main[j]):int(x_max_main[j])] = 1 + co_text_all_org = contours_only_text_parent + contours_only_text_parent_h + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + co_text_all = contours_only_dilated + contours_only_text_parent_h + else: + co_text_all = contours_only_text_parent + contours_only_text_parent_h + else: + co_text_all_org = contours_only_text_parent + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + co_text_all = contours_only_dilated + else: + co_text_all = contours_only_text_parent + + if not len(co_text_all): + return [], [] + + labels_con = np.zeros((int(y_len /6.), int(x_len/6.), len(co_text_all)), dtype=bool) + + co_text_all = [(i/6).astype(int) for i in co_text_all] + for i in range(len(co_text_all)): + img = labels_con[:,:,i].astype(np.uint8) + + #img = cv2.resize(img, (int(img.shape[1]/6), int(img.shape[0]/6)), interpolation=cv2.INTER_NEAREST) + + cv2.fillPoly(img, pts=[co_text_all[i]], color=(1,)) + labels_con[:,:,i] = img + + + labels_con = resize_image(labels_con.astype(np.uint8), height1, width1).astype(bool) + img_header_and_sep = resize_image(img_header_and_sep, height1, width1) + img_poly = resize_image(img_poly, height3, width3) + + + + input_1 = np.zeros((inference_bs, height1, width1, 3)) + ordered = [list(range(len(co_text_all)))] + index_update = 0 + #print(labels_con.shape[2],"number of regions for reading order") + while index_update>=0: + ij_list = ordered.pop(index_update) + i = ij_list.pop(0) + + ante_list = [] + post_list = [] + tot_counter = 0 + batch = [] + for j in ij_list: + img1 = labels_con[:,:,i].astype(float) + img2 = labels_con[:,:,j].astype(float) + img1[img_poly==5] = 2 + img2[img_poly==5] = 2 + img1[img_header_and_sep==1] = 3 + img2[img_header_and_sep==1] = 3 + + input_1[len(batch), :, :, 0] = img1 / 3. + input_1[len(batch), :, :, 2] = img2 / 3. + input_1[len(batch), :, :, 1] = img_poly / 5. + + tot_counter += 1 + batch.append(j) + if tot_counter % inference_bs == 0 or tot_counter == len(ij_list): + y_pr = self.model_reading_order.predict(input_1 , verbose=0) + for jb, j in enumerate(batch): + if y_pr[jb][0]>=0.5: + post_list.append(j) + else: + ante_list.append(j) + batch = [] + + if len(ante_list): + ordered.insert(index_update, ante_list) + index_update += 1 + ordered.insert(index_update, [i]) + if len(post_list): + ordered.insert(index_update + 1, post_list) + + index_update = -1 + for index_next, ij_list in enumerate(ordered): + if len(ij_list) > 1: + index_update = index_next + break + + ordered = [i[0] for i in ordered] + + ##id_all_text = np.array(id_all_text)[index_sort] + + + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + org_contours_indexes = [] + for ind in range(len(ordered)): + region_with_curr_order = ordered[ind] + if region_with_curr_order < len(contours_only_dilated): + if np.isscalar(indexes_of_located_cont[region_with_curr_order]): + org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + else: + arg_sort_located_cont = np.argsort(center_y_coordinates_of_located[region_with_curr_order]) + org_contours_indexes = org_contours_indexes + list(np.array(indexes_of_located_cont[region_with_curr_order])[arg_sort_located_cont]) ##org_contours_indexes + list ( + else: + org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + + region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] + return org_contours_indexes, region_ids + else: + region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] + return ordered, region_ids + + + + + def run(self, + overwrite: bool = False, + xml_filename: Optional[str] = None, + dir_in: Optional[str] = None, + dir_out: Optional[str] = None, + ): + """ + Get image and scales, then extract the page of scanned image + """ + self.logger.debug("enter run") + t0_tot = time.time() + + if dir_in: + ls_xmls = [os.path.join(dir_in, xml_filename) + for xml_filename in filter(is_xml_filename, + os.listdir(dir_in))] + elif xml_filename: + ls_xmls = [xml_filename] + else: + raise ValueError("run requires either a single image filename or a directory") + + for xml_filename in ls_xmls: + self.logger.info(xml_filename) + t0 = time.time() + + file_name = Path(xml_filename).stem + (tree_xml, root_xml, bb_coord_printspace, id_paragraph, id_header, + co_text_paragraph, co_text_header, tot_region_ref, + x_len, y_len, index_tot_regions, img_poly) = self.read_xml(xml_filename) + + id_all_text = id_paragraph + id_header + + order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model(co_text_paragraph, co_text_header, img_poly[:,:,0]) + + id_all_text = np.array(id_all_text)[order_text_new] + + alltags=[elem.tag for elem in root_xml.iter()] + + + + link=alltags[0].split('}')[0]+'}' + name_space = alltags[0].split('}')[0] + name_space = name_space.split('{')[1] + + page_element = root_xml.find(link+'Page') + + + old_ro = root_xml.find(".//{*}ReadingOrder") + + if old_ro is not None: + page_element.remove(old_ro) + + #print(old_ro, 'old_ro') + ro_subelement = ET.Element('ReadingOrder') + + ro_subelement2 = ET.SubElement(ro_subelement, 'OrderedGroup') + ro_subelement2.set('id', "ro357564684568544579089") + + for index, id_text in enumerate(id_all_text): + new_element_2 = ET.SubElement(ro_subelement2, 'RegionRefIndexed') + new_element_2.set('regionRef', id_all_text[index]) + new_element_2.set('index', str(index)) + + if (link+'PrintSpace' in alltags) or (link+'Border' in alltags): + page_element.insert(1, ro_subelement) + else: + page_element.insert(0, ro_subelement) + + alltags=[elem.tag for elem in root_xml.iter()] + + ET.register_namespace("",name_space) + tree_xml.write(os.path.join(dir_out, file_name+'.xml'), + xml_declaration=True, + method='xml', + encoding="utf8", + default_namespace=None) + + #sys.exit() + diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index af5e03f..fbc6c1a 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -82,13 +82,23 @@ } }, "resources": [ + { + "url": "https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1", + "name": "models_layout_v0_5_0", + "type": "archive", + "path_in_archive": "models_layout_v0_5_0", + "size": 3525684179, + "description": "Models for layout detection, reading order detection, textline detection, page extraction, column classification, table detection, binarization, image enhancement", + "version_range": ">= v0.5.0" + }, { "description": "models for eynollah (TensorFlow SavedModel format)", "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz", "name": "default", "size": 1894627041, "type": "archive", - "path_in_archive": "models_eynollah" + "path_in_archive": "models_eynollah", + "version_range": ">= v0.3.0, < v0.5.0" } ] }, diff --git a/src/eynollah/processor.py b/src/eynollah/processor.py index c2922c1..12c7356 100644 --- a/src/eynollah/processor.py +++ b/src/eynollah/processor.py @@ -1,6 +1,7 @@ +from functools import cached_property from typing import Optional from ocrd_models import OcrdPage -from ocrd import Processor, OcrdPageResult +from ocrd import OcrdPageResultImage, Processor, OcrdPageResult from .eynollah import Eynollah, EynollahXmlWriter @@ -9,8 +10,8 @@ class EynollahProcessor(Processor): # already employs GPU (without singleton process atm) max_workers = 1 - @property - def executable(self): + @cached_property + def executable(self) -> str: return 'ocrd-eynollah-segment' def setup(self) -> None: @@ -20,7 +21,6 @@ class EynollahProcessor(Processor): "and parameter 'light_version' (faster+simpler method for main region detection and deskewing)") self.eynollah = Eynollah( self.resolve_resource(self.parameter['models']), - logger=self.logger, allow_enhancement=self.parameter['allow_enhancement'], curved_line=self.parameter['curved_line'], right2left=self.parameter['right_to_left'], @@ -33,6 +33,7 @@ class EynollahProcessor(Processor): headers_off=self.parameter['headers_off'], tables=self.parameter['tables'], ) + self.eynollah.logger = self.logger self.eynollah.plotter = None def shutdown(self): diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index f43b6ba..3716987 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -16,6 +16,7 @@ import tensorflow as tf from tensorflow.keras.models import load_model from tensorflow.python.keras import backend as tensorflow_backend +from .utils import is_image_filename def resize_image(img_in, input_height, input_width): return cv2.resize(img_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST) @@ -314,8 +315,8 @@ class SbbBinarizer: prediction_true = prediction_true.astype(np.uint8) return prediction_true[:,:,0] - def run(self, image=None, image_path=None, save=None, use_patches=False, dir_in=None, dir_out=None): - print(dir_in,'dir_in') + def run(self, image=None, image_path=None, output=None, use_patches=False, dir_in=None): + # print(dir_in,'dir_in') if not dir_in: if (image is not None and image_path is not None) or \ (image is None and image_path is None): @@ -343,11 +344,11 @@ class SbbBinarizer: kernel = np.ones((5, 5), np.uint8) img_last[:, :][img_last[:, :] > 0] = 255 img_last = (img_last[:, :] == 0) * 255 - if save: - cv2.imwrite(save, img_last) + if output: + cv2.imwrite(output, img_last) return img_last else: - ls_imgs = os.listdir(dir_in) + ls_imgs = list(filter(is_image_filename, os.listdir(dir_in))) for image_name in ls_imgs: image_stem = image_name.split('.')[0] print(image_name,'image_name') @@ -374,4 +375,4 @@ class SbbBinarizer: img_last[:, :][img_last[:, :] > 0] = 255 img_last = (img_last[:, :] == 0) * 255 - cv2.imwrite(os.path.join(dir_out,image_stem+'.png'), img_last) + cv2.imwrite(os.path.join(output, image_stem + '.png'), img_last) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index c5962f8..6eeabd0 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -992,7 +992,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( (regions_model_full[:,:,0]==2)).sum() pixels_main = all_pixels - pixels_header - if (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ): + if ( (pixels_header/float(pixels_main)>=0.6) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ) and ( (length_con[ii]/float(height_con[ii]) )<=3 )) or ( (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=3 ) ): regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2 contours_only_text_parent_head.append(con) if contours_only_text_parent_d_ordered is not None: @@ -1801,8 +1801,8 @@ def return_boxes_of_images_by_order_of_reading_new( #print(y_type_2_up,x_starting_up,x_ending_up,'didid') nodes_in = [] for ij in range(len(x_starting_up)): - nodes_in = nodes_in + list(range(x_starting_up[ij], - x_ending_up[ij])) + nodes_in = nodes_in + list(range(int(x_starting_up[ij]), + int(x_ending_up[ij]))) nodes_in = np.unique(nodes_in) #print(nodes_in,'nodes_in') @@ -1825,8 +1825,8 @@ def return_boxes_of_images_by_order_of_reading_new( elif len(y_diff_main_separator_up)==0: nodes_in = [] for ij in range(len(x_starting_up)): - nodes_in = nodes_in + list(range(x_starting_up[ij], - x_ending_up[ij])) + nodes_in = nodes_in + list(range(int(x_starting_up[ij]), + int(x_ending_up[ij]))) nodes_in = np.unique(nodes_in) #print(nodes_in,'nodes_in2') #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') @@ -1866,8 +1866,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_mothers = [] for dj in range(len(x_start_without_mother)): columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_start_without_mother[dj], - x_end_without_mother[dj])) + list(range(int(x_start_without_mother[dj]), + int(x_end_without_mother[dj]))) columns_covered_by_mothers = list(set(columns_covered_by_mothers)) all_columns=np.arange(len(peaks_neg_tot)-1) @@ -1909,8 +1909,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_mothers = [] for dj in range(len(x_start_without_mother)): columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_start_without_mother[dj], - x_end_without_mother[dj])) + list(range(int(x_start_without_mother[dj]), + int(x_end_without_mother[dj]))) columns_covered_by_mothers = list(set(columns_covered_by_mothers)) all_columns=np.arange(len(peaks_neg_tot)-1) @@ -1926,8 +1926,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_with_child_no_mothers = [] for dj in range(len(x_end_with_child_without_mother)): columns_covered_by_with_child_no_mothers = columns_covered_by_with_child_no_mothers + \ - list(range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) + list(range(int(x_start_with_child_without_mother[dj]), + int(x_end_with_child_without_mother[dj]))) columns_covered_by_with_child_no_mothers = list(set(columns_covered_by_with_child_no_mothers)) all_columns = np.arange(len(peaks_neg_tot)-1) @@ -1970,8 +1970,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_mothers = [] for dj in range(len(x_starting_all_between_nm_wc)): columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) + list(range(int(x_starting_all_between_nm_wc[dj]), + int(x_ending_all_between_nm_wc[dj]))) columns_covered_by_mothers = list(set(columns_covered_by_mothers)) all_columns=np.arange(i_s_nc, x_end_biggest_column) @@ -1979,8 +1979,8 @@ def return_boxes_of_images_by_order_of_reading_new( should_longest_line_be_extended=0 if (len(x_diff_all_between_nm_wc) > 0 and - set(list(range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])) + + set(list(range(int(x_starting_all_between_nm_wc[biggest]), + int(x_ending_all_between_nm_wc[biggest]))) + list(columns_not_covered)) != set(all_columns)): should_longest_line_be_extended=1 index_lines_so_close_to_top_separator = \ @@ -2012,7 +2012,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered) + 1) ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(i_s_nc, x_end_biggest_column): + for column in range(int(i_s_nc), int(x_end_biggest_column)): ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') @@ -2064,7 +2064,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_end_itself=x_end_copy.pop(il) #print(y_copy,'y_copy2') - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') y_in_cols=[] for yic in range(len(y_copy)): @@ -2095,11 +2095,11 @@ def return_boxes_of_images_by_order_of_reading_new( all_columns = np.arange(len(peaks_neg_tot)-1) columns_covered_by_lines_covered_more_than_2col = [] for dj in range(len(x_starting)): - if set(list(range(x_starting[dj],x_ending[dj]))) == set(all_columns): + if set(list(range(int(x_starting[dj]),int(x_ending[dj]) ))) == set(all_columns): pass else: columns_covered_by_lines_covered_more_than_2col = columns_covered_by_lines_covered_more_than_2col + \ - list(range(x_starting[dj],x_ending[dj])) + list(range(int(x_starting[dj]),int(x_ending[dj]) )) columns_covered_by_lines_covered_more_than_2col = list(set(columns_covered_by_lines_covered_more_than_2col)) columns_not_covered = list(set(all_columns) - set(columns_covered_by_lines_covered_more_than_2col)) @@ -2124,7 +2124,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) ind_args=np.array(range(len(y_type_2))) - #ind_args=np.array(ind_args) + for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] @@ -2155,8 +2155,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_start_itself=x_start_copy.pop(il) x_end_itself=x_end_copy.pop(il) - #print(y_copy,'y_copy2') - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') y_in_cols=[] for yic in range(len(y_copy)): @@ -2195,3 +2194,14 @@ def return_boxes_of_images_by_order_of_reading_new( return boxes, peaks_neg_tot_tables_new else: return boxes, peaks_neg_tot_tables + +def is_image_filename(fname: str) -> bool: + return fname.lower().endswith(('.jpg', + '.jpeg', + '.png', + '.tif', + '.tiff', + )) + +def is_xml_filename(fname: str) -> bool: + return fname.lower().endswith('.xml') diff --git a/src/eynollah/utils/marginals.py b/src/eynollah/utils/marginals.py index a29e50d..ac8dc1d 100644 --- a/src/eynollah/utils/marginals.py +++ b/src/eynollah/utils/marginals.py @@ -10,7 +10,6 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve mask_marginals=np.zeros((text_with_lines.shape[0],text_with_lines.shape[1])) mask_marginals=mask_marginals.astype(np.uint8) - text_with_lines=text_with_lines.astype(np.uint8) ##text_with_lines=cv2.erode(text_with_lines,self.kernel,iterations=3) @@ -26,8 +25,12 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve text_with_lines=resize_image(text_with_lines,int(text_with_lines.shape[0]*1.8),text_with_lines.shape[1]) text_with_lines=cv2.erode(text_with_lines,kernel,iterations=7) text_with_lines=resize_image(text_with_lines,text_with_lines_eroded.shape[0],text_with_lines_eroded.shape[1]) - - + + + if light_version: + kernel_hor = np.ones((1, 5), dtype=np.uint8) + text_with_lines = cv2.erode(text_with_lines,kernel_hor,iterations=6) + text_with_lines_y=text_with_lines.sum(axis=0) text_with_lines_y_eroded=text_with_lines_eroded.sum(axis=0) @@ -40,8 +43,10 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve elif thickness_along_y_percent>=30 and thickness_along_y_percent<50: min_textline_thickness=20 else: - min_textline_thickness=40 - + if light_version: + min_textline_thickness=45 + else: + min_textline_thickness=40 if thickness_along_y_percent>=14: diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 0322579..ead5cfb 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -5,6 +5,8 @@ import numpy as np import cv2 from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d +from multiprocessing import Process, Queue, cpu_count +from multiprocessing import Pool from .rotate import rotate_image from .resize import resize_image from .contour import ( @@ -1466,7 +1468,7 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, main_page=False, logger=None, plotter=None, map=map): if main_page and plotter: plotter.save_plot_of_textline_density(img_patch_org) - + img_int=np.zeros((img_patch_org.shape[0],img_patch_org.shape[1])) img_int[:,:]=img_patch_org[:,:]#img_patch_org[:,:,0] @@ -1487,7 +1489,7 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, angles = np.linspace(angle - 22.5, angle + 22.5, n_tot_angles) angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) elif main_page: - angles = np.linspace(-12, 12, n_tot_angles)#np.array([0 , 45 , 90 , -45]) + angles = np.array (list(np.linspace(-12, -7, int(n_tot_angles/4))) + list(np.linspace(-6, 6, n_tot_angles- 2* int(n_tot_angles/4))) + list(np.linspace(7, 12, int(n_tot_angles/4))))#np.linspace(-12, 12, n_tot_angles)#np.array([0 , 45 , 90 , -45]) angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) early_slope_edge=11 @@ -1526,6 +1528,107 @@ def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map angle = 0 return angle + +def return_deskew_slop_old_mp(img_patch_org, sigma_des,n_tot_angles=100, + main_page=False, logger=None, plotter=None): + if main_page and plotter: + plotter.save_plot_of_textline_density(img_patch_org) + + img_int=np.zeros((img_patch_org.shape[0],img_patch_org.shape[1])) + img_int[:,:]=img_patch_org[:,:]#img_patch_org[:,:,0] + + max_shape=np.max(img_int.shape) + img_resized=np.zeros((int( max_shape*(1.1) ) , int( max_shape*(1.1) ) )) + + onset_x=int((img_resized.shape[1]-img_int.shape[1])/2.) + onset_y=int((img_resized.shape[0]-img_int.shape[0])/2.) + + img_resized[ onset_y:onset_y+img_int.shape[0] , onset_x:onset_x+img_int.shape[1] ]=img_int[:,:] + + if main_page and img_patch_org.shape[1] > img_patch_org.shape[0]: + angles = np.array([-45, 0, 45, 90,]) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + + angles = np.linspace(angle - 22.5, angle + 22.5, n_tot_angles) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + elif main_page: + angles = np.linspace(-12, 12, n_tot_angles)#np.array([0 , 45 , 90 , -45]) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + + early_slope_edge=11 + if abs(angle) > early_slope_edge: + if angle < 0: + angles = np.linspace(-90, -12, n_tot_angles) + else: + angles = np.linspace(90, 12, n_tot_angles) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + else: + angles = np.linspace(-25, 25, int(0.5 * n_tot_angles) + 10) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + + early_slope_edge=22 + if abs(angle) > early_slope_edge: + if angle < 0: + angles = np.linspace(-90, -25, int(0.5 * n_tot_angles) + 10) + else: + angles = np.linspace(90, 25, int(0.5 * n_tot_angles) + 10) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + + return angle + +def do_image_rotation_omp(queue_of_all_params,angles_per_process, img_resized, sigma_des): + vars_per_each_subprocess = [] + angles_per_each_subprocess = [] + for mv in range(len(angles_per_process)): + img_rot=rotate_image(img_resized,angles_per_process[mv]) + img_rot[img_rot!=0]=1 + try: + var_spectrum=find_num_col_deskew(img_rot,sigma_des,20.3 ) + except: + var_spectrum=0 + vars_per_each_subprocess.append(var_spectrum) + angles_per_each_subprocess.append(angles_per_process[mv]) + + queue_of_all_params.put([vars_per_each_subprocess, angles_per_each_subprocess]) + +def get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=None): + num_cores = cpu_count() + + queue_of_all_params = Queue() + processes = [] + nh = np.linspace(0, len(angles), num_cores + 1) + + for i in range(num_cores): + angles_per_process = angles[int(nh[i]) : int(nh[i + 1])] + processes.append(Process(target=do_image_rotation_omp, args=(queue_of_all_params, angles_per_process, img_resized, sigma_des))) + + for i in range(num_cores): + processes[i].start() + + var_res=[] + all_angles = [] + for i in range(num_cores): + list_all_par = queue_of_all_params.get(True) + vars_for_subprocess = list_all_par[0] + angles_sub_process = list_all_par[1] + for j in range(len(vars_for_subprocess)): + var_res.append(vars_for_subprocess[j]) + all_angles.append(angles_sub_process[j]) + + for i in range(num_cores): + processes[i].join() + + if plotter: + plotter.save_plot_of_rotation_angle(all_angles, var_res) + + + try: + var_res=np.array(var_res) + ang_int=all_angles[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] + except: + ang_int=0 + return ang_int + def do_work_of_slopes_new( box_text, contour, contour_par, index_r_con, textline_mask_tot_ea, image_page_rotated, slope_deskew, diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py new file mode 100644 index 0000000..4fa99f7 --- /dev/null +++ b/src/eynollah/utils/utils_ocr.py @@ -0,0 +1,488 @@ +import numpy as np +import cv2 +import tensorflow as tf +from scipy.signal import find_peaks +from scipy.ndimage import gaussian_filter1d +import math +from PIL import Image, ImageDraw, ImageFont +from Bio import pairwise2 +from .resize import resize_image + +def decode_batch_predictions(pred, num_to_char, max_len = 128): + # input_len is the product of the batch size and the + # number of time steps. + input_len = np.ones(pred.shape[0]) * pred.shape[1] + + # Decode CTC predictions using greedy search. + # decoded is a tuple with 2 elements. + decoded = tf.keras.backend.ctc_decode(pred, + input_length = input_len, + beam_width = 100) + # The outputs are in the first element of the tuple. + # Additionally, the first element is actually a list, + # therefore we take the first element of that list as well. + #print(decoded,'decoded') + decoded = decoded[0][0][:, :max_len] + + #print(decoded, decoded.shape,'decoded') + + output = [] + for d in decoded: + # Convert the predicted indices to the corresponding chars. + d = tf.strings.reduce_join(num_to_char(d)) + d = d.numpy().decode("utf-8") + output.append(d) + return output + + +def distortion_free_resize(image, img_size): + w, h = img_size + image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True) + + # Check tha amount of padding needed to be done. + pad_height = h - tf.shape(image)[0] + pad_width = w - tf.shape(image)[1] + + # Only necessary if you want to do same amount of padding on both sides. + if pad_height % 2 != 0: + height = pad_height // 2 + pad_height_top = height + 1 + pad_height_bottom = height + else: + pad_height_top = pad_height_bottom = pad_height // 2 + + if pad_width % 2 != 0: + width = pad_width // 2 + pad_width_left = width + 1 + pad_width_right = width + else: + pad_width_left = pad_width_right = pad_width // 2 + + image = tf.pad( + image, + paddings=[ + [pad_height_top, pad_height_bottom], + [pad_width_left, pad_width_right], + [0, 0], + ], + ) + + image = tf.transpose(image, (1, 0, 2)) + image = tf.image.flip_left_right(image) + return image + +def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image): + width = np.shape(textline_image)[1] + height = np.shape(textline_image)[0] + common_window = int(0.06*width) + + width1 = int ( width/2. - common_window ) + width2 = int ( width/2. + common_window ) + + img_sum = np.sum(textline_image[:,:,0], axis=0) + sum_smoothed = gaussian_filter1d(img_sum, 3) + + peaks_real, _ = find_peaks(sum_smoothed, height=0) + if len(peaks_real)>70: + + peaks_real = peaks_real[(peaks_realwidth1)] + + arg_max = np.argmax(sum_smoothed[peaks_real]) + peaks_final = peaks_real[arg_max] + return peaks_final + else: + return None +# Function to fit text inside the given area +def fit_text_single_line(draw, text, font_path, max_width, max_height): + initial_font_size = 50 + font_size = initial_font_size + while font_size > 10: # Minimum font size + font = ImageFont.truetype(font_path, font_size) + text_bbox = draw.textbbox((0, 0), text, font=font) # Get text bounding box + text_width = text_bbox[2] - text_bbox[0] + text_height = text_bbox[3] - text_bbox[1] + + if text_width <= max_width and text_height <= max_height: + return font # Return the best-fitting font + + font_size -= 2 # Reduce font size and retry + + return ImageFont.truetype(font_path, 10) # Smallest font fallback + +def return_textlines_split_if_needed(textline_image, textline_image_bin=None): + + split_point = return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image) + if split_point: + image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) + image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) + if textline_image_bin is not None: + image1_bin = textline_image_bin[:, :split_point,:]# image.crop((0, 0, width2, height)) + image2_bin = textline_image_bin[:, split_point:,:]#image.crop((width1, 0, width, height)) + return [image1, image2], [image1_bin, image2_bin] + else: + return [image1, image2], None + else: + return None, None +def preprocess_and_resize_image_for_ocrcnn_model(img, image_height, image_width): + if img.shape[0]==0 or img.shape[1]==0: + img_fin = np.ones((image_height, image_width, 3)) + else: + ratio = image_height /float(img.shape[0]) + w_ratio = int(ratio * img.shape[1]) + + if w_ratio <= image_width: + width_new = w_ratio + else: + width_new = image_width + + if width_new == 0: + width_new = img.shape[1] + + + img = resize_image(img, image_height, width_new) + img_fin = np.ones((image_height, image_width, 3))*255 + + img_fin[:,:width_new,:] = img[:,:,:] + img_fin = img_fin / 255. + return img_fin + +def get_deskewed_contour_and_bb_and_image(contour, image, deskew_angle): + (h_in, w_in) = image.shape[:2] + center = (w_in // 2, h_in // 2) + + rotation_matrix = cv2.getRotationMatrix2D(center, deskew_angle, 1.0) + + cos_angle = abs(rotation_matrix[0, 0]) + sin_angle = abs(rotation_matrix[0, 1]) + new_w = int((h_in * sin_angle) + (w_in * cos_angle)) + new_h = int((h_in * cos_angle) + (w_in * sin_angle)) + + rotation_matrix[0, 2] += (new_w / 2) - center[0] + rotation_matrix[1, 2] += (new_h / 2) - center[1] + + deskewed_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h)) + + contour_points = np.array(contour, dtype=np.float32) + transformed_points = cv2.transform(np.array([contour_points]), rotation_matrix)[0] + + x, y, w, h = cv2.boundingRect(np.array(transformed_points, dtype=np.int32)) + cropped_textline = deskewed_image[y:y+h, x:x+w] + + return cropped_textline + +def rotate_image_with_padding(image, angle, border_value=(0,0,0)): + # Get image dimensions + (h, w) = image.shape[:2] + + # Calculate the center of the image + center = (w // 2, h // 2) + + # Get the rotation matrix + rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0) + + # Compute the new bounding dimensions + cos = abs(rotation_matrix[0, 0]) + sin = abs(rotation_matrix[0, 1]) + new_w = int((h * sin) + (w * cos)) + new_h = int((h * cos) + (w * sin)) + + # Adjust the rotation matrix to account for translation + rotation_matrix[0, 2] += (new_w / 2) - center[0] + rotation_matrix[1, 2] += (new_h / 2) - center[1] + + # Perform the rotation + try: + rotated_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h), borderValue=border_value) + except: + rotated_image = np.copy(image) + + return rotated_image + +def get_orientation_moments(contour): + moments = cv2.moments(contour) + if moments["mu20"] - moments["mu02"] == 0: # Avoid division by zero + return 90 if moments["mu11"] > 0 else -90 + else: + angle = 0.5 * np.arctan2(2 * moments["mu11"], moments["mu20"] - moments["mu02"]) + return np.degrees(angle) # Convert radians to degrees + + +def get_orientation_moments_of_mask(mask): + mask=mask.astype('uint8') + contours, _ = cv2.findContours(mask[:,:,0], cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + largest_contour = max(contours, key=cv2.contourArea) if contours else None + + moments = cv2.moments(largest_contour) + if moments["mu20"] - moments["mu02"] == 0: # Avoid division by zero + return 90 if moments["mu11"] > 0 else -90 + else: + angle = 0.5 * np.arctan2(2 * moments["mu11"], moments["mu20"] - moments["mu02"]) + return np.degrees(angle) # Convert radians to degrees + +def get_contours_and_bounding_boxes(mask): + # Find contours in the binary mask + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + largest_contour = max(contours, key=cv2.contourArea) if contours else None + + # Get the bounding rectangle for the contour + x, y, w, h = cv2.boundingRect(largest_contour) + #bounding_boxes.append((x, y, w, h)) + + return x, y, w, h + +def return_splitting_point_of_image(image_to_spliited): + width = np.shape(image_to_spliited)[1] + height = np.shape(image_to_spliited)[0] + common_window = int(0.03*width) + + width1 = int ( common_window) + width2 = int ( width - common_window ) + + img_sum = np.sum(image_to_spliited[:,:,0], axis=0) + sum_smoothed = gaussian_filter1d(img_sum, 1) + + peaks_real, _ = find_peaks(sum_smoothed, height=0) + peaks_real = peaks_real[(peaks_realwidth1)] + + arg_sort = np.argsort(sum_smoothed[peaks_real]) + peaks_sort_4 = peaks_real[arg_sort][::-1][:3] + + return np.sort(peaks_sort_4) + +def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, img_bin_curved=None): + peaks_4 = return_splitting_point_of_image(img_curved) + if len(peaks_4)>0: + imgs_tot = [] + + for ind in range(len(peaks_4)+1): + if ind==0: + img = img_curved[:, :peaks_4[ind], :] + if img_bin_curved is not None: + img_bin = img_bin_curved[:, :peaks_4[ind], :] + mask = mask_curved[:, :peaks_4[ind], :] + elif ind==len(peaks_4): + img = img_curved[:, peaks_4[ind-1]:, :] + if img_bin_curved is not None: + img_bin = img_bin_curved[:, peaks_4[ind-1]:, :] + mask = mask_curved[:, peaks_4[ind-1]:, :] + else: + img = img_curved[:, peaks_4[ind-1]:peaks_4[ind], :] + if img_bin_curved is not None: + img_bin = img_bin_curved[:, peaks_4[ind-1]:peaks_4[ind], :] + mask = mask_curved[:, peaks_4[ind-1]:peaks_4[ind], :] + + or_ma = get_orientation_moments_of_mask(mask) + + if img_bin_curved is not None: + imgs_tot.append([img, mask, or_ma, img_bin] ) + else: + imgs_tot.append([img, mask, or_ma] ) + + + w_tot_des_list = [] + w_tot_des = 0 + imgs_deskewed_list = [] + imgs_bin_deskewed_list = [] + + for ind in range(len(imgs_tot)): + img_in = imgs_tot[ind][0] + mask_in = imgs_tot[ind][1] + ori_in = imgs_tot[ind][2] + if img_bin_curved is not None: + img_bin_in = imgs_tot[ind][3] + + if abs(ori_in)<45: + img_in_des = rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) ) + if img_bin_curved is not None: + img_bin_in_des = rotate_image_with_padding(img_bin_in, ori_in, border_value=(255,255,255) ) + mask_in_des = rotate_image_with_padding(mask_in, ori_in) + mask_in_des = mask_in_des.astype('uint8') + + #new bounding box + x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_in_des[:,:,0]) + + if w_n==0 or h_n==0: + img_in_des = np.copy(img_in) + if img_bin_curved is not None: + img_bin_in_des = np.copy(img_bin_in) + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] + img_in_des = resize_image(img_in_des, 32, w_relative) + if img_bin_curved is not None: + img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) + else: + mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + if img_bin_curved is not None: + img_bin_in_des = img_bin_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] + img_in_des = resize_image(img_in_des, 32, w_relative) + if img_bin_curved is not None: + img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) + + + else: + img_in_des = np.copy(img_in) + if img_bin_curved is not None: + img_bin_in_des = np.copy(img_bin_in) + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] + img_in_des = resize_image(img_in_des, 32, w_relative) + if img_bin_curved is not None: + img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) + + w_tot_des+=img_in_des.shape[1] + w_tot_des_list.append(img_in_des.shape[1]) + imgs_deskewed_list.append(img_in_des) + if img_bin_curved is not None: + imgs_bin_deskewed_list.append(img_bin_in_des) + + + + + img_final_deskewed = np.zeros((32, w_tot_des, 3))+255 + if img_bin_curved is not None: + img_bin_final_deskewed = np.zeros((32, w_tot_des, 3))+255 + else: + img_bin_final_deskewed = None + + w_indexer = 0 + for ind in range(len(w_tot_des_list)): + img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:] + if img_bin_curved is not None: + img_bin_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_bin_deskewed_list[ind][:,:,:] + w_indexer = w_indexer+w_tot_des_list[ind] + return img_final_deskewed, img_bin_final_deskewed + else: + return img_curved, img_bin_curved + +def return_textline_contour_with_added_box_coordinate(textline_contour, box_ind): + textline_contour[:,0] = textline_contour[:,0] + box_ind[2] + textline_contour[:,1] = textline_contour[:,1] + box_ind[0] + return textline_contour + + +def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, prediction_model, b_s_ocr, num_to_char, textline_light=False, curved_line=False): + max_len = 512 + padding_token = 299 + image_width = 512#max_len * 4 + image_height = 32 + ind_tot = 0 + #cv2.imwrite('./img_out.png', image_page) + ocr_all_textlines = [] + cropped_lines_region_indexer = [] + cropped_lines_meging_indexing = [] + cropped_lines = [] + indexer_text_region = 0 + + for indexing, ind_poly_first in enumerate(all_found_textline_polygons): + #ocr_textline_in_textregion = [] + if len(ind_poly_first)==0: + cropped_lines_region_indexer.append(indexer_text_region) + cropped_lines_meging_indexing.append(0) + img_fin = np.ones((image_height, image_width, 3))*1 + cropped_lines.append(img_fin) + + else: + for indexing2, ind_poly in enumerate(ind_poly_first): + cropped_lines_region_indexer.append(indexer_text_region) + if not (textline_light or curved_line): + ind_poly = copy.deepcopy(ind_poly) + box_ind = all_box_coord[indexing] + + ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) + #print(ind_poly_copy) + ind_poly[ind_poly<0] = 0 + x, y, w, h = cv2.boundingRect(ind_poly) + + w_scaled = w * image_height/float(h) + + mask_poly = np.zeros(image.shape) + + img_poly_on_img = np.copy(image) + + mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) + + + + mask_poly = mask_poly[y:y+h, x:x+w, :] + img_crop = img_poly_on_img[y:y+h, x:x+w, :] + + img_crop[mask_poly==0] = 255 + + if w_scaled < 640:#1.5*image_width: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(0) + else: + splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None) + + if splited_images: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(1) + + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) + + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(-1) + + else: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(0) + + indexer_text_region+=1 + + extracted_texts = [] + + n_iterations = math.ceil(len(cropped_lines) / b_s_ocr) + + for i in range(n_iterations): + if i==(n_iterations-1): + n_start = i*b_s_ocr + imgs = cropped_lines[n_start:] + imgs = np.array(imgs) + imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3) + + + else: + n_start = i*b_s_ocr + n_end = (i+1)*b_s_ocr + imgs = cropped_lines[n_start:n_end] + imgs = np.array(imgs).reshape(b_s_ocr, image_height, image_width, 3) + + + preds = prediction_model.predict(imgs, verbose=0) + + pred_texts = decode_batch_predictions(preds, num_to_char) + + for ib in range(imgs.shape[0]): + pred_texts_ib = pred_texts[ib].replace("[UNK]", "") + extracted_texts.append(pred_texts_ib) + + extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + + extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] + unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) + + ocr_all_textlines = [] + for ind in unique_cropped_lines_region_indexer: + ocr_textline_in_textregion = [] + extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] + for it_ind, text_textline in enumerate(extracted_texts_merged_un): + ocr_textline_in_textregion.append(text_textline) + ocr_all_textlines.append(ocr_textline_in_textregion) + return ocr_all_textlines + +def biopython_align(str1, str2): + alignments = pairwise2.align.globalms(str1, str2, 2, -1, -2, -2) + best_alignment = alignments[0] # Get the best alignment + return best_alignment.seqA, best_alignment.seqB diff --git a/src/eynollah/utils/xml.py b/src/eynollah/utils/xml.py index bd95702..13420df 100644 --- a/src/eynollah/utils/xml.py +++ b/src/eynollah/utils/xml.py @@ -46,16 +46,22 @@ def create_page_xml(imageFilename, height, width): )) return pcgts -def xml_reading_order(page, order_of_texts, id_of_marginalia): +def xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right): region_order = ReadingOrderType() og = OrderedGroupType(id="ro357564684568544579089") page.set_ReadingOrder(region_order) region_order.set_OrderedGroup(og) region_counter = EynollahIdCounter() + + for id_marginal in id_of_marginalia_left: + og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) + region_counter.inc('region') + for idx_textregion, _ in enumerate(order_of_texts): og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=region_counter.region_id(order_of_texts[idx_textregion] + 1))) region_counter.inc('region') - for id_marginal in id_of_marginalia: + + for id_marginal in id_of_marginalia_right: og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) region_counter.inc('region') diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 92e353f..2f9caf3 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -56,10 +56,12 @@ class EynollahXmlWriter(): points_page_print = points_page_print + ' ' return points_page_print[:-1] - def serialize_lines_in_marginal(self, marginal_region, all_found_textline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter): + def serialize_lines_in_marginal(self, marginal_region, all_found_textline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter, ocr_all_textlines_textregion): for j in range(len(all_found_textline_polygons_marginals[marginal_idx])): coords = CoordsType() textline = TextLineType(id=counter.next_line_id, Coords=coords) + if ocr_all_textlines_textregion: + textline.set_TextEquiv( [ TextEquivType(Unicode=ocr_all_textlines_textregion[j]) ] ) marginal_region.add_TextLine(textline) marginal_region.set_orientation(-slopes_marginals[marginal_idx]) points_co = '' @@ -119,7 +121,7 @@ class EynollahXmlWriter(): points_co += ',' points_co += str(textline_y_coord) - if (self.curved_line or self.textline_light) and np.abs(slopes[region_idx]) <= 45: + if self.textline_light or (self.curved_line and np.abs(slopes[region_idx]) <= 45): if len(contour_textline) == 2: points_co += str(int((contour_textline[0] + page_coord[2]) / self.scale_x)) points_co += ',' @@ -128,7 +130,7 @@ class EynollahXmlWriter(): points_co += str(int((contour_textline[0][0] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((contour_textline[0][1] + page_coord[0])/self.scale_y)) - elif (self.curved_line or self.textline_light) and np.abs(slopes[region_idx]) > 45: + elif self.curved_line and np.abs(slopes[region_idx]) > 45: if len(contour_textline)==2: points_co += str(int((contour_textline[0] + region_bboxes[2] + page_coord[2])/self.scale_x)) points_co += ',' @@ -168,7 +170,7 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines, conf_contours_textregion): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -179,12 +181,13 @@ class EynollahXmlWriter(): counter = EynollahIdCounter() if len(found_polygons_text_region) > 0: _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia = [_counter_marginals.next_region_id for _ in found_polygons_marginals] - xml_reading_order(page, order_of_texts, id_of_marginalia) + id_of_marginalia_left = [_counter_marginals.next_region_id for _ in found_polygons_marginals_left] + id_of_marginalia_right = [_counter_marginals.next_region_id for _ in found_polygons_marginals_right] + xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) for mm in range(len(found_polygons_text_region)): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord), conf=conf_contours_textregion[mm]), + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord, skip_layout_reading_order), conf=conf_contours_textregion[mm]), ) #textregion.set_conf(conf_contours_textregion[mm]) page.add_TextRegion(textregion) @@ -193,12 +196,29 @@ class EynollahXmlWriter(): else: ocr_textlines = None self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter, ocr_textlines) - - for mm in range(len(found_polygons_marginals)): + + for mm in range(len(found_polygons_marginals_left)): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_left[mm], page_coord))) page.add_TextRegion(marginal) - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) + if ocr_all_textlines_marginals_left: + ocr_textlines = ocr_all_textlines_marginals_left[mm] + else: + ocr_textlines = None + + #print(ocr_textlines, mm, len(all_found_textline_polygons_marginals_left[mm]) ) + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) + + for mm in range(len(found_polygons_marginals_right)): + marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_right[mm], page_coord))) + page.add_TextRegion(marginal) + if ocr_all_textlines_marginals_right: + ocr_textlines = ocr_all_textlines_marginals_right[mm] + else: + ocr_textlines = None + + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) for mm in range(len(found_polygons_text_region_img)): img_region = ImageRegionType(id=counter.next_region_id, Coords=CoordsType()) @@ -242,7 +262,7 @@ class EynollahXmlWriter(): return pcgts - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, ocr_all_textlines, conf_contours_textregion, conf_contours_textregion_h): + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_to_be_written_in_xml, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): self.logger.debug('enter build_pagexml_full_layout') # create the file structure @@ -252,8 +272,9 @@ class EynollahXmlWriter(): counter = EynollahIdCounter() _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia = [_counter_marginals.next_region_id for _ in found_polygons_marginals] - xml_reading_order(page, order_of_texts, id_of_marginalia) + id_of_marginalia_left = [_counter_marginals.next_region_id for _ in found_polygons_marginals_left] + id_of_marginalia_right = [_counter_marginals.next_region_id for _ in found_polygons_marginals_right] + xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) for mm in range(len(found_polygons_text_region)): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', @@ -272,25 +293,43 @@ class EynollahXmlWriter(): Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) page.add_TextRegion(textregion) - if ocr_all_textlines: - ocr_textlines = ocr_all_textlines[mm] + if ocr_all_textlines_h: + ocr_textlines = ocr_all_textlines_h[mm] else: ocr_textlines = None self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter, ocr_textlines) - for mm in range(len(found_polygons_marginals)): + for mm in range(len(found_polygons_marginals_left)): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_left[mm], page_coord))) page.add_TextRegion(marginal) - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) - + if ocr_all_textlines_marginals_left: + ocr_textlines = ocr_all_textlines_marginals_left[mm] + else: + ocr_textlines = None + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) + + for mm in range(len(found_polygons_marginals_right)): + marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_right[mm], page_coord))) + page.add_TextRegion(marginal) + if ocr_all_textlines_marginals_right: + ocr_textlines = ocr_all_textlines_marginals_right[mm] + else: + ocr_textlines = None + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) + for mm in range(len(found_polygons_drop_capitals)): dropcapital = TextRegionType(id=counter.next_region_id, type_='drop-capital', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))) page.add_TextRegion(dropcapital) - ###all_box_coord_drop = None - ###slopes_drop = None - ###self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=None) + all_box_coord_drop = None + slopes_drop = None + if ocr_all_textlines_drop: + ocr_textlines = ocr_all_textlines_drop[mm] + else: + ocr_textlines = None + self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=ocr_textlines) for mm in range(len(found_polygons_text_region_img)): page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)))) @@ -303,18 +342,28 @@ class EynollahXmlWriter(): return pcgts - def calculate_polygon_coords(self, contour, page_coord): + def calculate_polygon_coords(self, contour, page_coord, skip_layout_reading_order=False): self.logger.debug('enter calculate_polygon_coords') coords = '' for value_bbox in contour: - if len(value_bbox) == 2: - coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) + if skip_layout_reading_order: + if len(value_bbox) == 2: + coords += str(int((value_bbox[0]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[1]) / self.scale_y)) + else: + coords += str(int((value_bbox[0][0]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[0][1]) / self.scale_y)) else: - coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) + if len(value_bbox) == 2: + coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) + else: + coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) coords=coords + ' ' return coords[:-1] diff --git a/tests/resources/euler_rechenkunst01_1738_0025.xml b/tests/resources/euler_rechenkunst01_1738_0025.xml new file mode 100644 index 0000000..1a92f73 --- /dev/null +++ b/tests/resources/euler_rechenkunst01_1738_0025.xml @@ -0,0 +1,1626 @@ + + + OCR-D + 2016-09-29T14:32:09 + 2018-04-25T08:56:33 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 9 + + + 9 + + + 9 + + + + + + + + + der + + + + + rechten + + + + + gegen + + + + + der + + + + + lincken + + + + + Hand + + + + + bedeutet + + + der rechten gegen der lincken Hand bedeutet + + + + + + + + wie + + + + + folget: + + + wie folget: + + + der rechten gegen der lincken Hand bedeutet +wie folget: + + + + + + + + + I. + + + I. + + + I. + + + + + + + + + 0 + + + + + - + + + + + nichts + + + 0 - nichts + + + + + + + + 1 + + + + + - + + + + + eins + + + 1 - eins + + + + + + + + 2 + + + + + - + + + + + zwey + + + 2 - zwey + + + + + + + + 3 + + + + + - + + + + + drey + + + 3 - drey + + + + + + + + 4 + + + + + - + + + + + vier + + + 4 - vier + + + 0 - nichts +1 - eins +2 - zwey +3 - drey +4 - vier + + + + + + + + + 5 + + + + + - + + + + + fuͤnf + + + 5 - fuͤnf + + + + + + + + 6 + + + + + - + + + + + ſechs + + + 6 - ſechs + + + + + + + 7 + + + + + - + + + + + ſieben + + + 7 - ſieben + + + + + + + + 8 + + + + + - + + + + + acht + + + 8 - acht + + + + + + + + 9 + + + + + - + + + + + neun + + + 9 - neun + + + 5 - fuͤnf +6 - ſechs +7 - ſieben +8 - acht +9 - neun + + + + + + + + + Auf + + + + + der + + + + + zweyten + + + + + Stelle + + + + + aber + + + + + bedeutet. + + + Auf der zweyten Stelle aber bedeutet. + + + Auf der zweyten Stelle aber bedeutet. + + + + + + + + + II. + + + II. + + + II. + + + + + + + + + 0 + + + + + - + + + + + nichts + + + 0 - nichts + + + + + + + + 1 + + + + + - + + + + + zehen + + + 1 - zehen + + + + + + + + 2 + + + + + - + + + + + zwanzig + + + 2 - zwanzig + + + + + + + 3 + + + + + - + + + + + dreyßig + + + 3 - dreyßig + + + + + + + 4 + + + + + - + + + + + vierzig + + + 4 - vierzig + + 0 - nichts +1 - zehen +2 - zwanzig +3 - dreyßig +4 - vierzig + + + + + + + + + 5 + + + + + - + + + + + fuͤnfzig + + + 5 - fuͤnfzig + + + + + + + + 6 + + + + + - + + + + + ſechzig + + + 6 - ſechzig + + + + + + + 7 + + + + + - + + + + + ſiebenzig + + + 7 - ſiebenzig + + + + + + + 8 + + + + + - + + + + + achtzig + + + 8 - achtzig + + + + + + + 9 + + + + + - + + + + + neunzig + + + 9 - neunzig + + 5 - fuͤnfzig +6 - ſechzig +7 - ſiebenzig +8 - achtzig +9 - neunzig + + + + + + + + + Auf + + + + + der + + + + + dritten + + + + + Stelle + + + + + bedeutet. + + + Auf der dritten Stelle bedeutet. + + + Auf der dritten Stelle bedeutet. + + + + + + + + + III. + + + III. + + + III. + + + + + + + + + 0 + + + + + - + + + + + nichts + + + 0 - nichts + + + + + + + + 1 + + + + + - + + + + + hundert + + + 1 - hundert + + + + + + + + 2 + + + + + - + + + + + zwey + + + + + hundert + + + 2 - zwey hundert + + + + + + + + 3 + + + + + - + + + + + drey + + + + + hundert + + + 3 - drey hundert + + + + + + + + 4 + + + + + - + + + + + vier + + + + + hundert + + + 4 - vier hundert + + + 0 - nichts +1 - hundert +2 - zwey hundert +3 - drey hundert +4 - vier hundert + + + + + + + + + 5 + + + + + - + + + + + fuͤnf + + + + + hundert + + + 5 - fuͤnf hundert + + + + + + + + 6 + + + + + - + + + + + ſechs + + + + + hundert + + + 6 - ſechs hundert + + + + + + + 7 + + + + + - + + + + + ſieben + + + + + hundert + + + 7 - ſieben hundert + + + + + + + + 8 + + + + + - + + + + + acht + + + + + hundert + + + 8 - acht hundert + + + + + + + 9 + + + + + - + + + + + neun + + + + + hundert + + + 9 - neun hundert + + + 5 - fuͤnf hundert +6 - ſechs hundert +7 - ſieben hundert +8 - acht hundert +9 - neun hundert + + + + + + + + + Auf + + + + + der + + + + + vierten + + + + + Stelle + + + + + bedeutet. + + + Auf der vierten Stelle bedeutet. + + + Auf der vierten Stelle bedeutet. + + + + + + + + + IV. + + + IV. + + + IV. + + + + + + + + + 0 + + + + + - + + + + + nichts + + + 0 - nichts + + + + + + + + 1 + + + + + - + + + + + tauſend + + + 1 - tauſend + + + + + + + + 2 + + + + + - + + + + + zwey + + + + + tauſend + + + 2 - zwey tauſend + + + + + + + + 3 + + + + + - + + + + + drey + + + + + tauſend + + + 3 - drey tauſend + + + + + + + + 4 + + + + + - + + + + + vier + + + + + tauſend + + + 4 - vier tauſend + + + 0 - nichts +1 - tauſend +2 - zwey tauſend +3 - drey tauſend +4 - vier tauſend + + + + + + + + + 5 + + + + + - + + + + + fuͤnf + + + + + tauſend + + + 5 - fuͤnf tauſend + + + + + + + + 6 + + + + + - + + + + + ſechs + + + + + tauſend + + + 6 - ſechs tauſend + + + + + + + 7 + + + + + - + + + + + ſieben + + + + + tauſend + + + 7 - ſieben tauſend + + + + + + + + 8 + + + + + - + + + + + acht + + + + + tauſend + + + 8 - acht tauſend + + + + + + + 9 + + + + + - + + + + + neun + + + + + tauſend + + + 9 - neun tauſend + + 5 - fuͤnf tauſend +6 - ſechs tauſend +7 - ſieben tauſend +8 - acht tauſend +9 - neun tauſend + + + + + + + + + Auf + + + + + der + + + + + fuͤnften + + + + + Stelle + + + + + bedeutet. + + + Auf der fuͤnften Stelle bedeutet. + + + Auf der fuͤnften Stelle bedeutet. + + + + + + + + + V. + + + V. + + + V. + + + + + + + + + 0 + + + + + - + + + + + nichts + + + 0 - nichts + + + + + + + + 1 + + + + + - + + + + + zehen + + + + + tauſend + + + 1 - zehen tauſend + + + + + + + + 2 + + + + + - + + + + + zwanzig + + + + + tauſend + + + 2 - zwanzig tauſend + + + + + + + + 3 + + + + + - + + + + + dreyßig + + + + + tauſend + + + 3 - dreyßig tauſend + + + + + + + + 4 + + + + + - + + + + + vierzig + + + + + tauſend + + + 4 - vierzig tauſend + + + 0 - nichts +1 - zehen tauſend +2 - zwanzig tauſend +3 - dreyßig tauſend +4 - vierzig tauſend + + + + + + + + + 5 + + + + + - + + + + + fuͤnfzig + + + + + tauſend + + + 5 - fuͤnfzig tauſend + + + + + + + + 6 + + + + + - + + + + + ſechzig + + + + + tauſend + + + 6 - ſechzig tauſend + + + + + + + 7 + + + + + - + + + + + ſiebenzig + + + + + tauſend + + + 7 - ſiebenzig tauſend + + + + + + + + 8 + + + + + - + + + + + achtzig + + + + + tauſend + + + 8 - achtzig tauſend + + + + + + + 9 + + + + + - + + + + + neunzig + + + + + tauſend + + + 9 - neunzig tauſend + + + 5 - fuͤnfzig tauſend +6 - ſechzig tauſend +7 - ſiebenzig tauſend +8 - achtzig tauſend +9 - neunzig tauſend + + + + + + + + A + + + + + 5 + + + A 5 + + A 5 + + + + + + + + + Anf + + + Anf + + Anf + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/resources/kant_aufklaerung_1784_0020.xml b/tests/resources/kant_aufklaerung_1784_0020.xml new file mode 100644 index 0000000..47484cd --- /dev/null +++ b/tests/resources/kant_aufklaerung_1784_0020.xml @@ -0,0 +1,2129 @@ + + + OCR-D + 2016-09-20T11:09:27.431+02:00 + 2018-04-24T17:44:49.605+01:00 + + + + + + + + + + + + + + + + + + + + + + + ( + + + + + + + 484 + + + + + + + ) + + + + + ( 484 ) + + + + ( 484 ) + + + + + + + + + + + gewiegelt + + + + + + + worden + + + + + + + ; + + + + + + + ſo + + + + + + + ſchaͤdlich + + + + + + + iſt + + + + + + + es + + + + + + + Vorurtheile + + + + + + + zu + + + + + gewiegelt worden; ſo ſchaͤdlich iſt es Vorurtheile zu + + + + + + + + + + pflanzen + + + + + + + , + + + + + + + weil + + + + + + + ſie + + + + + + + ſich + + + + + + + zuletzt + + + + + + + an + + + + + + + denen + + + + + + + ſelbſt + + + + + + + raͤchen + + + + + + + , + + + + + pflanzen, weil ſie ſich zuletzt an denen ſelbſt raͤchen, + + + + + + + + + + die + + + + + + + , + + + + + + + oder + + + + + + + deren + + + + + + + Vorgaͤnger + + + + + + + , + + + + + + + ihre + + + + + + + Urheber + + + + + + + geweſen + + + + + die, oder deren Vorgaͤnger, ihre Urheber geweſen + + + + + + + + + + ſind + + + + + + + . + + + + + + + Daher + + + + + + + kann + + + + + + + ein + + + + + + + Publikum + + + + + + + nur + + + + + + + langſam + + + + + + + zur + + + + + ſind. Daher kann ein Publikum nur langſam zur + + + + + + + + + + Aufklaͤrung + + + + + + + gelangen + + + + + + + . + + + + + + + Durch + + + + + + + eine + + + + + + + Revolution + + + + + + + wird + + + + + Aufklaͤrung gelangen. Durch eine Revolution wird + + + + + + + + + + vielleicht + + + + + + + wohl + + + + + + + ein + + + + + + + Abfall + + + + + + + von + + + + + + + perſoͤnlichem + + + + + + + Despo- + + + + + vielleicht wohl ein Abfall von perſoͤnlichem Despo- + + + + + + + + + + tism + + + + + + + und + + + + + + + gewinnſuͤchtiger + + + + + + + oder + + + + + + + herrſchſüchtiger + + + + + + + Be + + + + + + + - + + + + + tism und gewinnſuͤchtiger oder herrſchſüchtiger Be- + + + + + + + + + + druͤkkung + + + + + + + , + + + + + + + aber + + + + + + + niemals + + + + + + + wahre + + + + + + + Reform + + + + + + + der + + + + + + + Den + + + + + + + - + + + + + druͤkkung, aber niemals wahre Reform der Den- + + + + + + + + + + kungsart + + + + + + + zu + + + + + + + Stande + + + + + + + kommen + + + + + + + ; + + + + + + + ſondern + + + + + + + neue + + + + + + + Vor + + + + + + + - + + + + + kungsart zu Stande kommen; ſondern neue Vor- + + + + + + + + + + urtheile + + + + + + + werden + + + + + + + , + + + + + + + eben + + + + + + + ſowohl + + + + + + + als + + + + + + + die + + + + + + + alten + + + + + + + , + + + + + + + zum + + + + + urtheile werden, eben ſowohl als die alten, zum + + + + + + + + + + Leitbande + + + + + + + des + + + + + + + gedankenloſen + + + + + + + großen + + + + + + + Haufens + + + + + Leitbande des gedankenloſen großen Haufens + + + + + + + + + + dienen + + + + + + + . + + + + + dienen. + + + + gewiegelt worden; ſo ſchaͤdlich iſt es Vorurtheile zu +pflanzen, weil ſie ſich zuletzt an denen ſelbſt raͤchen, +die, oder deren Vorgaͤnger, ihre Urheber geweſen +ſind. Daher kann ein Publikum nur langſam zur +Aufklaͤrung gelangen. Durch eine Revolution wird +vielleicht wohl ein Abfall von perſoͤnlichem Despo- +tism und gewinnſuͤchtiger oder herrſchſüchtiger Be- +druͤkkung, aber niemals wahre Reform der Den- +kungsart zu Stande kommen; ſondern neue Vor- +urtheile werden, eben ſowohl als die alten, zum +Leitbande des gedankenloſen großen Haufens +dienen. + + + + + + + + + + + Zu + + + + + + + dieſer + + + + + + + Aufklaͤrung + + + + + + + aber + + + + + + + wird + + + + + + + nichts + + + + + + + erfordert + + + + + Zu dieſer Aufklaͤrung aber wird nichts erfordert + + + + + + + + + + als + + + + + + + Freiheit + + + + + + + ; + + + + + + + und + + + + + + + zwar + + + + + + + die + + + + + + + unſchaͤdlichſte + + + + + + + unter + + + + + als Freiheit; und zwar die unſchaͤdlichſte unter + + + + + + + + + allem + + + + + + + , + + + + + + + was + + + + + + + nur + + + + + + + Freiheit + + + + + + + heißen + + + + + + + mag + + + + + + + , + + + + + + + naͤmlich + + + + + + + die + + + + + + + : + + + + + allem, was nur Freiheit heißen mag, naͤmlich die: + + + + + + + + + + von + + + + + + + ſeiner + + + + + + + Vernunft + + + + + + + in + + + + + + + allen + + + + + + + Stuͤkken + + + + + + + oͤffentlichen + + + + + von ſeiner Vernunft in allen Stuͤkken oͤffentlichen + + + + + + + + + + Gebrauch + + + + + + + zu + + + + + + + machen + + + + + + + . + + + + + + + Nun + + + + + + + hoͤre + + + + + + + ich + + + + + + + aber + + + + + + + von + + + + + + + al + + + + + + + - + + + + + Gebrauch zu machen. Nun hoͤre ich aber von al- + + + + + + + + + + len + + + + + + + Seiten + + + + + + + rufen + + + + + + + : + + + + + + + raͤſonnirt + + + + + + + nicht + + + + + + + ! + + + + + + + Der + + + + + + + Offi + + + + + + + - + + + + + len Seiten rufen: raͤſonnirt nicht! Der Offi- + + + + + + + + + + zier + + + + + + + ſagt + + + + + + + : + + + + + + + raͤſonnirt + + + + + + + nicht + + + + + + + , + + + + + + + ſondern + + + + + + + exercirt + + + + + + + ! + + + + + + + Der + + + + + zier ſagt: raͤſonnirt nicht, ſondern exercirt! Der + + + + + + + + + + Finanzrath + + + + + + + : + + + + + + + raͤſonnirt + + + + + + + nicht + + + + + + + , + + + + + + + ſondern + + + + + + + bezahlt + + + + + + + ! + + + + + + + Der + + + + + Finanzrath: raͤſonnirt nicht, ſondern bezahlt! Der + + + + + + + + + + Geiſtliche + + + + + + + : + + + + + + + raͤſonnirt + + + + + + + nicht + + + + + + + , + + + + + + + ſondern + + + + + + + glaubt + + + + + + + ! + + + + + + + ( + + + + + + + Nur + + + + + Geiſtliche: raͤſonnirt nicht, ſondern glaubt! (Nur + + + + + + + + + + ein + + + + + + + einziger + + + + + + + Herr + + + + + + + in + + + + + + + der + + + + + + + Welt + + + + + + + ſagt + + + + + + + : + + + + + + + raͤſonnirt + + + + + + + , + + + + + + + ſo + + + + + ein einziger Herr in der Welt ſagt: raͤſonnirt, ſo + + + + + + + + + + viel + + + + + + + ihr + + + + + + + wollt + + + + + + + , + + + + + + + und + + + + + + + woruͤber + + + + + + + ihr + + + + + + + wollt + + + + + + + ; + + + + + + + aber + + + + + + + ge + + + + + + + - + + + + + viel ihr wollt, und woruͤber ihr wollt; aber ge- + + + + + + + + + + horcht + + + + + + + ! + + + + + + + ) + + + + + + + Hier + + + + + + + iſt + + + + + + + uͤberall + + + + + + + Einſchraͤnkung + + + + + + + der + + + + + + + Frei + + + + + + + - + + + + + horcht!) Hier iſt uͤberall Einſchraͤnkung der Frei- + + + + + + + + + + heit + + + + + + + . + + + + + + + Welche + + + + + + + Einſchraͤnkung + + + + + + + aber + + + + + + + iſt + + + + + + + der + + + + + + + Aufklaͤ + + + + + + + - + + + + + heit. Welche Einſchraͤnkung aber iſt der Aufklaͤ- + + + + + + + + + + rung + + + + + + + hinderlich + + + + + + + ? + + + + + + + welche + + + + + + + nicht + + + + + + + , + + + + + + + ſondern + + + + + + + ihr + + + + + + + wohl + + + + + + + gar + + + + + rung hinderlich? welche nicht, ſondern ihr wohl gar + + + + + + + + + + befoͤrderlich + + + + + + + ? + + + + + + + + + + + + + + Ich + + + + + + + antworte + + + + + + + : + + + + + + + der + + + + + + + oͤffentliche + + + + + befoͤrderlich? — Ich antworte: der oͤffentliche + + + + + + + + + + Gebrauch + + + + + + + ſeiner + + + + + + + Vernunft + + + + + + + muß + + + + + + + jederzeit + + + + + + + frei + + + + + + + ſein + + + + + + + , + + + + + Gebrauch ſeiner Vernunft muß jederzeit frei ſein, + + + + + + + + + + und + + + + + + + der + + + + + + + allein + + + + + + + kann + + + + + + + Aufklaͤrung + + + + + + + unter + + + + + + + Menſchen + + + + + + + zu + + + + + und der allein kann Aufklaͤrung unter Menſchen zu + + + + + Zu dieſer Aufklaͤrung aber wird nichts erfordert +als Freiheit; und zwar die unſchaͤdlichſte unter +allem, was nur Freiheit heißen mag, naͤmlich die: +von ſeiner Vernunft in allen Stuͤkken oͤffentlichen +Gebrauch zu machen. Nun hoͤre ich aber von al- +len Seiten rufen: raͤſonnirt nicht! Der Offi- +zier ſagt: raͤſonnirt nicht, ſondern exercirt! Der +Finanzrath: raͤſonnirt nicht, ſondern bezahlt! Der +Geiſtliche: raͤſonnirt nicht, ſondern glaubt! (Nur +ein einziger Herr in der Welt ſagt: raͤſonnirt, ſo +viel ihr wollt, und woruͤber ihr wollt; aber ge- +horcht!) Hier iſt uͤberall Einſchraͤnkung der Frei- +heit. Welche Einſchraͤnkung aber iſt der Aufklaͤ- +rung hinderlich? welche nicht, ſondern ihr wohl gar +befoͤrderlich? — Ich antworte: der oͤffentliche +Gebrauch ſeiner Vernunft muß jederzeit frei ſein, +und der allein kann Aufklaͤrung unter Menſchen zu + + + + + + + + + + + Stan + + + + + + + - + + + + + Stan- + + + + + Stan- + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_run.py b/tests/test_run.py index 607140e..be928a0 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,22 +1,30 @@ from os import environ from pathlib import Path +import pytest import logging from PIL import Image -from eynollah.cli import layout as layout_cli, binarization as binarization_cli +from eynollah.cli import ( + layout as layout_cli, + binarization as binarization_cli, + enhancement as enhancement_cli, + machine_based_reading_order as mbreorder_cli, + ocr as ocr_cli, +) from click.testing import CliRunner from ocrd_modelfactory import page_from_file from ocrd_models.constants import NAMESPACES as NS testdir = Path(__file__).parent.resolve() -EYNOLLAH_MODELS = environ.get('EYNOLLAH_MODELS', str(testdir.joinpath('..', 'models_eynollah').resolve())) -SBBBIN_MODELS = environ.get('SBBBIN_MODELS', str(testdir.joinpath('..', 'default-2021-03-09').resolve())) +MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_5_0').resolve())) +MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_5_0').resolve())) +MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve())) def test_run_eynollah_layout_filename(tmp_path, subtests, pytestconfig, caplog): infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml' args = [ - '-m', EYNOLLAH_MODELS, + '-m', MODELS_LAYOUT, '-i', str(infile), '-o', str(outfile.parent), # subtests write to same location @@ -44,8 +52,7 @@ def test_run_eynollah_layout_filename(tmp_path, subtests, pytestconfig, caplog): options=options): with caplog.filtering(only_eynollah): result = runner.invoke(layout_cli, args + options, catch_exceptions=False) - print(result) - assert result.exit_code == 0 + assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] assert str(infile) in logmsgs assert outfile.exists() @@ -61,7 +68,7 @@ def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path args = [ - '-m', EYNOLLAH_MODELS, + '-m', MODELS_LAYOUT, '-di', str(indir), '-o', str(outdir), ] @@ -72,9 +79,8 @@ def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog): return logrec.name == 'eynollah' runner = CliRunner() with caplog.filtering(only_eynollah): - result = runner.invoke(layout_cli, args) - print(result) - assert result.exit_code == 0 + result = runner.invoke(layout_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2 assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in')) @@ -84,10 +90,12 @@ def test_run_eynollah_binarization_filename(tmp_path, subtests, pytestconfig, ca infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') args = [ - '-m', SBBBIN_MODELS, - str(infile), - str(outfile), + '-m', MODELS_BIN, + '-i', str(infile), + '-o', str(outfile), ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'SbbBinarizer' @@ -99,9 +107,8 @@ def test_run_eynollah_binarization_filename(tmp_path, subtests, pytestconfig, ca with subtests.test(#msg="test CLI", options=options): with caplog.filtering(only_eynollah): - result = runner.invoke(binarization_cli, args + options) - print(result) - assert result.exit_code == 0 + result = runner.invoke(binarization_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting')) assert outfile.exists() @@ -115,18 +122,193 @@ def test_run_eynollah_binarization_directory(tmp_path, subtests, pytestconfig, c indir = testdir.joinpath('resources') outdir = tmp_path args = [ - '-m', SBBBIN_MODELS, + '-m', MODELS_BIN, '-di', str(indir), - '-do', str(outdir), + '-o', str(outdir), ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'SbbBinarizer' runner = CliRunner() with caplog.filtering(only_eynollah): - result = runner.invoke(binarization_cli, args) - print(result) - assert result.exit_code == 0 + result = runner.invoke(binarization_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2 assert len(list(outdir.iterdir())) == 2 + +def test_run_eynollah_enhancement_filename(tmp_path, subtests, pytestconfig, caplog): + infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') + args = [ + '-m', MODELS_LAYOUT, + '-i', str(infile), + '-o', str(outfile.parent), + # subtests write to same location + '--overwrite', + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + def only_eynollah(logrec): + return logrec.name == 'enhancement' + runner = CliRunner() + for options in [ + [], # defaults + ["-sos"], + ]: + with subtests.test(#msg="test CLI", + options=options): + with caplog.filtering(only_eynollah): + result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs + assert outfile.exists() + with Image.open(infile) as original_img: + original_size = original_img.size + with Image.open(outfile) as enhanced_img: + enhanced_size = enhanced_img.size + assert (original_size == enhanced_size) == ("-sos" in options) + +def test_run_eynollah_enhancement_directory(tmp_path, subtests, pytestconfig, caplog): + indir = testdir.joinpath('resources') + outdir = tmp_path + args = [ + '-m', MODELS_LAYOUT, + '-di', str(indir), + '-o', str(outdir), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + def only_eynollah(logrec): + return logrec.name == 'enhancement' + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(enhancement_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2 + assert len(list(outdir.iterdir())) == 2 + +def test_run_eynollah_mbreorder_filename(tmp_path, subtests, pytestconfig, caplog): + infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml') + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') + args = [ + '-m', MODELS_LAYOUT, + '-i', str(infile), + '-o', str(outfile.parent), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + def only_eynollah(logrec): + return logrec.name == 'mbreorder' + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(mbreorder_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + # FIXME: mbreorder has no logging! + #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs + assert outfile.exists() + #in_tree = page_from_file(str(infile)).etree + #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) + out_tree = page_from_file(str(outfile)).etree + out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) + #assert len(out_order) >= 2, "result is inaccurate" + #assert in_order != out_order + assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3'] + +def test_run_eynollah_mbreorder_directory(tmp_path, subtests, pytestconfig, caplog): + indir = testdir.joinpath('resources') + outdir = tmp_path + args = [ + '-m', MODELS_LAYOUT, + '-di', str(indir), + '-o', str(outdir), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + def only_eynollah(logrec): + return logrec.name == 'mbreorder' + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(mbreorder_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + # FIXME: mbreorder has no logging! + #assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2 + assert len(list(outdir.iterdir())) == 2 + +def test_run_eynollah_ocr_filename(tmp_path, subtests, pytestconfig, caplog): + infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') + outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png') + outrenderfile.parent.mkdir() + args = [ + '-m', MODELS_OCR, + '-i', str(infile), + '-dx', str(infile.parent), + '-o', str(outfile.parent), + # subtests write to same location + '--overwrite', + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.DEBUG) + def only_eynollah(logrec): + return logrec.name == 'eynollah' + runner = CliRunner() + for options in [ + # kba Fri Sep 26 12:53:49 CEST 2025 + # Disabled until NHWC/NCHW error in https://github.com/qurator-spk/eynollah/actions/runs/18019655200/job/51273541895 debugged + # [], # defaults + # ["-doit", str(outrenderfile.parent)], + ["-trocr"], + ]: + with subtests.test(#msg="test CLI", + options=options): + with caplog.filtering(only_eynollah): + result = runner.invoke(ocr_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + # FIXME: ocr has no logging! + #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs + assert outfile.exists() + if "-doit" in options: + assert outrenderfile.exists() + #in_tree = page_from_file(str(infile)).etree + #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) + out_tree = page_from_file(str(outfile)).etree + out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS) + assert len(out_texts) >= 2, ("result is inaccurate", out_texts) + assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts) + +@pytest.mark.skip("Disabled until NHWC/NCHW error in https://github.com/qurator-spk/eynollah/actions/runs/18019655200/job/51273541895 debugged") +def test_run_eynollah_ocr_directory(tmp_path, subtests, pytestconfig, caplog): + indir = testdir.joinpath('resources') + outdir = tmp_path + args = [ + '-m', MODELS_OCR, + '-di', str(indir), + '-dx', str(indir), + '-o', str(outdir), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + def only_eynollah(logrec): + return logrec.name == 'eynollah' + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(ocr_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + # FIXME: ocr has no logging! + #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs + assert len(list(outdir.iterdir())) == 2