diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index d2b7057..0000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,51 +0,0 @@ -version: 2 - -jobs: - - build-python37: - machine: - - image: ubuntu-2004:2023.02.1 - - steps: - - checkout - - restore_cache: - keys: - - model-cache - - run: make models - - save_cache: - key: model-cache - paths: - models_eynollah.tar.gz - models_eynollah - - run: - name: "Set Python Version" - command: pyenv install -s 3.7.16 && pyenv global 3.7.16 - - run: make install - - run: make smoke-test - - build-python38: - machine: - - image: ubuntu-2004:2023.02.1 - steps: - - checkout - - restore_cache: - keys: - - model-cache - - run: make models - - save_cache: - key: model-cache - paths: - models_eynollah.tar.gz - models_eynollah - - run: - name: "Set Python Version" - command: pyenv install -s 3.8.16 && pyenv global 3.8.16 - - run: make install - - run: make smoke-test - -workflows: - version: 2 - build: - jobs: - # - build-python37 - - build-python38 diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 30c9729..98ddc06 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -1,7 +1,7 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Python package +name: Test on: [push] @@ -14,8 +14,8 @@ jobs: python-version: ['3.8', '3.9', '3.10', '3.11'] steps: - - uses: actions/checkout@v2 - - uses: actions/cache@v2 + - uses: actions/checkout@v4 + - uses: actions/cache@v4 id: model_cache with: path: models_eynollah @@ -24,7 +24,7 @@ jobs: if: steps.model_cache.outputs.cache-hit != 'true' run: make models - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/CHANGELOG.md b/CHANGELOG.md index da2e1c0..cf6263d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [0.3.1] - 2024-08-27 + +Fixed: + + * regression in OCR-D processor, #106 + * Expected Ptrcv::UMat for argument 'contour', #110 + * Memory usage explosion with very narrow images (e.g. book spine), #67 + ## [0.3.0] - 2023-05-13 Changed: @@ -117,6 +125,8 @@ Fixed: Initial release +[0.3.1]: ../../compare/v0.3.1...v0.3.0 +[0.3.0]: ../../compare/v0.3.0...v0.2.0 [0.2.0]: ../../compare/v0.2.0...v0.1.0 [0.1.0]: ../../compare/v0.1.0...v0.0.11 [0.0.11]: ../../compare/v0.0.11...v0.0.10 diff --git a/Makefile b/Makefile index 525e6c3..4b43564 100644 --- a/Makefile +++ b/Makefile @@ -24,12 +24,15 @@ models: models_eynollah models_eynollah: models_eynollah.tar.gz # tar xf models_eynollah_renamed.tar.gz --transform 's/models_eynollah_renamed/models_eynollah/' # tar xf models_eynollah_renamed.tar.gz - tar xf 2022-04-05.SavedModel.tar.gz --transform 's/models_eynollah_renamed/models_eynollah/' + # tar xf models_eynollah_renamed_savedmodel.tar.gz --transform 's/models_eynollah_renamed_savedmodel/models_eynollah/' + tar xf models_eynollah.tar.gz models_eynollah.tar.gz: # wget 'https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz' # wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz' - wget 'https://ocr-d.kba.cloud/2022-04-05.SavedModel.tar.gz' + # wget 'https://ocr-d.kba.cloud/2022-04-05.SavedModel.tar.gz' + # wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed_savedmodel.tar.gz' + wget https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz # Install with pip install: diff --git a/README.md b/README.md index b095edb..1720f7f 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Eynollah -> Document Layout Analysis (segmentation) using pre-trained models and heuristics +> Document Layout Analysis with Deep Learning and Heuristics [![PyPI Version](https://img.shields.io/pypi/v/eynollah)](https://pypi.org/project/eynollah/) -[![CircleCI Build Status](https://circleci.com/gh/qurator-spk/eynollah.svg?style=shield)](https://circleci.com/gh/qurator-spk/eynollah) [![GH Actions Test](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml/badge.svg)](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml) [![License: ASL](https://img.shields.io/github/license/qurator-spk/eynollah)](https://opensource.org/license/apache-2-0/) +[![DOI](https://img.shields.io/badge/DOI-10.1145%2F3604951.3605513-red)](https://doi.org/10.1145/3604951.3605513) ![](https://user-images.githubusercontent.com/952378/102350683-8a74db80-3fa5-11eb-8c7e-f743f7d6eae2.jpg) @@ -14,17 +14,18 @@ * Support for various image optimization operations: * cropping (border detection), binarization, deskewing, dewarping, scaling, enhancing, resizing * Text line segmentation to bounding boxes or polygons (contours) including for curved lines and vertical text -* Detection of reading order +* Detection of reading order (left-to-right or right-to-left) * Output in [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML) * [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface +:warning: Development is currently focused on achieving the best possible quality of results for a wide variety of historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome. + ## Installation -Python versions `3.8-3.11` with Tensorflow versions >=`2.12` on Linux are currently supported. Unfortunately we can not currently support Windows or MacOS. -Windows users may be able to successfully run the tool through [WSL](https://learn.microsoft.com/en-us/windows/wsl/). +Python `3.8-3.11` with Tensorflow `2.12-2.15` on Linux are currently supported. For (limited) GPU support the CUDA toolkit needs to be installed. -You can either install via +You can either install from PyPI ``` pip install eynollah @@ -40,45 +41,48 @@ cd eynollah; pip install -e . Alternatively, you can run `make install` or `make install-dev` for editable installation. ## Models -Pre-trained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/). +Pre-trained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/) or [huggingface](https://huggingface.co/SBB?search_models=eynollah). -In case you want to train your own model to use with Eynollah, have a look at [sbb_pixelwise_segmentation](https://github.com/qurator-spk/sbb_pixelwise_segmentation). +## Train +🚧 **Work in progress** + +In case you want to train your own model, have a look at [`sbb_pixelwise_segmentation`](https://github.com/qurator-spk/sbb_pixelwise_segmentation). ## Usage The command-line interface can be called like this: ```sh eynollah \ - -i \ + -i | -di \ -o \ - -m \ + -m \ [OPTIONS] ``` The following options can be used to further configure the processing: -| option | description | -|----------|:-------------| -| `-fl` | full layout analysis including all steps and segmentation classes | -| `-light` | lighter and faster but simpler method for main region detection and deskewing | -| `-tab` | apply table detection | -| `-ae` | apply enhancement (the resulting image is saved to the output directory) | -| `-as` | apply scaling | -| `-cl` | apply contour detection for curved text lines instead of bounding boxes | -| `-ib` | apply binarization (the resulting image is saved to the output directory) | -| `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) | -| `-ho` | ignore headers for reading order dectection | -| `-di ` | process all images in a directory in batch mode | -| `-si ` | save image regions detected to this directory | -| `-sd ` | save deskewed image to this directory | -| `-sl ` | save layout prediction as plot to this directory | -| `-sp ` | save cropped page image to this directory | -| `-sa ` | save all (plot, enhanced/binary image, layout) to this directory | - -If no option is set, the tool will perform layout detection of main regions (background, text, images, separators and marginals). -The tool produces better quality output when RGB images are used as input than greyscale or binarized images. +| option | description | +|-------------------|:-------------------------------------------------------------------------------| +| `-fl` | full layout analysis including all steps and segmentation classes | +| `-light` | lighter and faster but simpler method for main region detection and deskewing | +| `-tab` | apply table detection | +| `-ae` | apply enhancement (the resulting image is saved to the output directory) | +| `-as` | apply scaling | +| `-cl` | apply contour detection for curved text lines instead of bounding boxes | +| `-ib` | apply binarization (the resulting image is saved to the output directory) | +| `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) | +| `-ho` | ignore headers for reading order dectection | +| `-si ` | save image regions detected to this directory | +| `-sd ` | save deskewed image to this directory | +| `-sl ` | save layout prediction as plot to this directory | +| `-sp ` | save cropped page image to this directory | +| `-sa ` | save all (plot, enhanced/binary image, layout) to this directory | + +If no option is set, the tool performs layout detection of main regions (background, text, images, separators and marginals). +The best output quality is produced when RGB images are used as input rather than greyscale or binarized images. #### Use as OCR-D processor +🚧 **Work in progress** Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) processor. @@ -96,11 +100,14 @@ ocrd-eynollah-segment -I OCR-D-IMG-BIN -O SEG-LINE -P models uses the original (RGB) image despite any binarization that may have occured in previous OCR-D processing steps +#### Additional documentation +Please check the [wiki](https://github.com/qurator-spk/eynollah/wiki). + ## How to cite If you find this tool useful in your work, please consider citing our paper: ```bibtex -@inproceedings{rezanezhad2023eynollah, +@inproceedings{hip23rezanezhad, title = {Document Layout Analysis with Deep Learning and Heuristics}, author = {Rezanezhad, Vahid and Baierer, Konstantin and Gerber, Mike and Labusch, Kai and Neudecker, Clemens}, booktitle = {Proceedings of the 7th International Workshop on Historical Document Imaging and Processing {HIP} 2023, diff --git a/ocrd-tool.json b/ocrd-tool.json index 5c48493..711a192 120000 --- a/ocrd-tool.json +++ b/ocrd-tool.json @@ -1 +1 @@ -qurator/eynollah/ocrd-tool.json \ No newline at end of file +src/eynollah/ocrd-tool.json \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..67a420d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,43 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel", "setuptools-ocrd"] + +[project] +name = "eynollah" +authors = [ + {name = "Vahid Rezanezhad"}, + {name = "Staatsbibliothek zu Berlin - Preußischer Kulturbesitz"}, +] +description = "Document Layout Analysis" +readme = "README.md" +license.file = "LICENSE" +requires-python = ">=3.8" +keywords = ["document layout analysis", "image segmentation"] + +dynamic = ["dependencies", "version"] + +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Scientific/Engineering :: Image Processing", +] + +[project.scripts] +eynollah = "eynollah.cli:main" +ocrd-eynollah-segment = "eynollah.ocrd_cli:main" + +[project.urls] +Homepage = "https://github.com/qurator-spk/eynollah" +Repository = "https://github.com/qurator-spk/eynollah.git" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +"*" = ["*.json", '*.yml', '*.xml', '*.xsd'] diff --git a/qurator/__init__.py b/qurator/__init__.py deleted file mode 100644 index 5284146..0000000 --- a/qurator/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__import__("pkg_resources").declare_namespace(__name__) diff --git a/qurator/eynollah/__init__.py b/qurator/eynollah/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/qurator/eynollah/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/requirements.txt b/requirements.txt index 530dac2..f01d319 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ ocrd >= 2.23.3 numpy <1.24.0 scikit-learn >= 0.23.2 -tensorflow >=2.12.0 +tensorflow == 2.12.1 imutils >= 0.5.3 matplotlib setuptools >= 50 diff --git a/setup.py b/setup.py deleted file mode 100644 index 9abf158..0000000 --- a/setup.py +++ /dev/null @@ -1,28 +0,0 @@ -from setuptools import setup, find_packages -from json import load - -install_requires = open('requirements.txt').read().split('\n') -with open('ocrd-tool.json', 'r', encoding='utf-8') as f: - version = load(f)['version'] - -setup( - name='eynollah', - version=version, - long_description=open('README.md').read(), - long_description_content_type='text/markdown', - author='Vahid Rezanezhad', - url='https://github.com/qurator-spk/eynollah', - license='Apache License 2.0', - namespace_packages=['qurator'], - packages=find_packages(exclude=['tests']), - install_requires=install_requires, - package_data={ - '': ['*.json'] - }, - entry_points={ - 'console_scripts': [ - 'eynollah=qurator.eynollah.cli:main', - 'ocrd-eynollah-segment=qurator.eynollah.ocrd_cli:main', - ] - }, -) diff --git a/qurator/.gitkeep b/src/eynollah/__init__.py similarity index 100% rename from qurator/.gitkeep rename to src/eynollah/__init__.py diff --git a/qurator/eynollah/cli.py b/src/eynollah/cli.py similarity index 97% rename from qurator/eynollah/cli.py rename to src/eynollah/cli.py index 9aba31d..82505ed 100644 --- a/qurator/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -1,7 +1,7 @@ import sys import click from ocrd_utils import initLogging, setOverrideLogLevel -from qurator.eynollah.eynollah import Eynollah +from eynollah.eynollah import Eynollah @click.command() @@ -209,9 +209,11 @@ def main( light_version=light_version, ignore_page_extraction=ignore_page_extraction, ) - eynollah.run() - #pcgts = eynollah.run() - ##eynollah.writer.write_pagexml(pcgts) + if dir_in: + eynollah.run() + else: + pcgts = eynollah.run() + eynollah.writer.write_pagexml(pcgts) if __name__ == "__main__": main() diff --git a/qurator/eynollah/eynollah.py b/src/eynollah/eynollah.py similarity index 98% rename from qurator/eynollah/eynollah.py rename to src/eynollah/eynollah.py index 6c3fa3e..0081643 100644 --- a/qurator/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -29,7 +29,8 @@ warnings.filterwarnings("ignore") from scipy.signal import find_peaks import matplotlib.pyplot as plt from scipy.ndimage import gaussian_filter1d -from tensorflow.python.keras.backend import set_session +# use tf1 compatibility for keras backend +from tensorflow.compat.v1.keras.backend import set_session from tensorflow.keras import layers from .utils.contour import ( @@ -257,7 +258,7 @@ class Eynollah: config.gpu_options.allow_growth = True session = tf.compat.v1.Session(config=config) set_session(session) - + self.model_page = self.our_load_model(self.model_page_dir) self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier) self.model_bin = self.our_load_model(self.model_dir_of_binarization) @@ -265,9 +266,9 @@ class Eynollah: self.model_region = self.our_load_model(self.model_region_dir_p_ens_light_only_images_extraction) #self.model_region_fl_np = self.our_load_model(self.model_region_dir_fully_np) #self.model_region_fl = self.our_load_model(self.model_region_dir_fully) - + self.ls_imgs = os.listdir(self.dir_in) - + if dir_in and not (light_version or self.extract_only_images): config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True @@ -286,7 +287,6 @@ class Eynollah: self.ls_imgs = os.listdir(self.dir_in) - def _cache_images(self, image_filename=None, image_pil=None): ret = {} @@ -482,7 +482,7 @@ class Eynollah: num_column_is_classified = True return img_new, num_column_is_classified - + def calculate_width_height_by_columns_extract_only_images(self, img, num_col, width_early, label_p_pred): self.logger.debug("enter calculate_width_height_by_columns") if num_col == 1: @@ -552,7 +552,7 @@ class Eynollah: is_image_enhanced = True return img, img_new, is_image_enhanced - + def resize_and_enhance_image_with_column_classifier(self,light_version): self.logger.debug("enter resize_and_enhance_image_with_column_classifier") dpi = self.dpi @@ -610,7 +610,7 @@ class Eynollah: num_col = np.argmax(label_p_pred[0]) + 1 self.logger.info("Found %d columns (%s)", num_col, np.around(label_p_pred, decimals=5)) - + if not self.extract_only_images: if dpi < DPI_THRESHOLD: img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) @@ -624,9 +624,6 @@ class Eynollah: image_res = np.copy(img) is_image_enhanced = False else: - #img_new, num_column_is_classified = self.calculate_width_height_by_columns_extract_only_images(img, num_col, width_early, label_p_pred) - #image_res = np.copy(img_new) - #is_image_enhanced = True num_column_is_classified = True image_res = np.copy(img) is_image_enhanced = False @@ -925,6 +922,7 @@ class Eynollah: seg_not_base[seg_not_base<1] =0 + seg_test = label_p_pred[0,:,:,1] ##seg2 = -label_p_pred[0,:,:,2] @@ -1623,7 +1621,7 @@ class Eynollah: q.put(slopes_sub) poly.put(poly_sub) box_sub.put(boxes_sub_new) - + def get_regions_light_v_extract_only_images(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_extract_images_only") erosion_hurts = False @@ -1646,7 +1644,7 @@ class Eynollah: img_h_new = int(img.shape[0] / float(img.shape[1]) * img_w_new) img_resized = resize_image(img,img_h_new, img_w_new ) - + if not self.dir_in: @@ -1654,61 +1652,61 @@ class Eynollah: prediction_regions_org = self.do_prediction_new_concept(True, img_resized, model_region) else: prediction_regions_org = self.do_prediction_new_concept(True, img_resized, self.model_region) - + #plt.imshow(prediction_regions_org[:,:,0]) #plt.show() - + prediction_regions_org = resize_image(prediction_regions_org,img_height_h, img_width_h ) - + image_page, page_coord, cont_page = self.extract_page() - - + + prediction_regions_org = prediction_regions_org[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] - + prediction_regions_org=prediction_regions_org[:,:,0] - + mask_lines_only = (prediction_regions_org[:,:] ==3)*1 - + mask_texts_only = (prediction_regions_org[:,:] ==1)*1 - + mask_images_only=(prediction_regions_org[:,:] ==2)*1 - + polygons_lines_xml, hir_lines_xml = return_contours_of_image(mask_lines_only) polygons_lines_xml = textline_con_fil = filter_contours_area_of_image(mask_lines_only, polygons_lines_xml, hir_lines_xml, max_area=1, min_area=0.00001) - - + + polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - + polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) - + text_regions_p_true = np.zeros(prediction_regions_org.shape) - + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_lines, color=(3,3,3)) - + text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 - + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) - - - + + + text_regions_p_true[text_regions_p_true.shape[0]-15:text_regions_p_true.shape[0], :] = 0 text_regions_p_true[:, text_regions_p_true.shape[1]-15:text_regions_p_true.shape[1]] = 0 - + ##polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2, 0.0001) polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2, 0.001) - + image_boundary_of_doc = np.zeros((text_regions_p_true.shape[0], text_regions_p_true.shape[1])) - + ###image_boundary_of_doc[:6, :] = 1 ###image_boundary_of_doc[text_regions_p_true.shape[0]-6:text_regions_p_true.shape[0], :] = 1 - + ###image_boundary_of_doc[:, :6] = 1 ###image_boundary_of_doc[:, text_regions_p_true.shape[1]-6:text_regions_p_true.shape[1]] = 1 - + #plt.imshow(image_boundary_of_doc) #plt.show() - + polygons_of_images_fin = [] for ploy_img_ind in polygons_of_images: """ @@ -1737,9 +1735,9 @@ class Eynollah: box = [x, y, w, h] _, page_coord_img = crop_image_inside_box(box, text_regions_p_true) #cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) - + polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) ) - + return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin, image_page, page_coord, cont_page def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_light_v") @@ -2592,7 +2590,7 @@ class Eynollah: prediction_table_erode = cv2.erode(prediction_table[:,:,0], KERNEL, iterations=20) prediction_table_erode = cv2.dilate(prediction_table_erode, KERNEL, iterations=20) return prediction_table_erode.astype(np.int16) - + def run_graphics_and_columns_light(self, text_regions_p_1, textline_mask_tot_ea, num_col_classifier, num_column_is_classified, erosion_hurts): img_g = self.imread(grayscale=True, uint8=True) @@ -3004,26 +3002,26 @@ class Eynollah: if self.dir_in: self.reset_file_name_dir(os.path.join(self.dir_in,img_name)) - + if self.extract_only_images: img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) self.logger.info("Enhancing took %.1fs ", time.time() - t0) - + text_regions_p_1 ,erosion_hurts, polygons_lines_xml,polygons_of_images,image_page, page_coord, cont_page = self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) - + pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], polygons_of_images, [], [], [], [], [], cont_page, [], []) - + if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) #plt.imshow(text_regions_p_1) #plt.show() - + self.writer.write_pagexml(pcgts) else: img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) self.logger.info("Enhancing took %.1fs ", time.time() - t0) - + t1 = time.time() if self.light_version: text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea = self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier) @@ -3042,7 +3040,7 @@ class Eynollah: self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, erosion_hurts) self.logger.info("Graphics detection took %.1fs ", time.time() - t1) #self.logger.info('cont_page %s', cont_page) - + if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], [], cont_page, [], []) @@ -3076,13 +3074,13 @@ class Eynollah: text_only = ((img_revised_tab[:, :] == 1)) * 1 if np.abs(slope_deskew) >= SLOPE_THRESHOLD: text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 - - + + min_con_area = 0.000005 if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text, hir_on_text = return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - + if len(contours_only_text_parent) > 0: areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) @@ -3102,7 +3100,7 @@ class Eynollah: areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) - + if len(areas_cnt_text_d)>0: contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] index_con_parents_d = np.argsort(areas_cnt_text_d) @@ -3122,7 +3120,7 @@ class Eynollah: cy_biggest_d_last5 = cy_biggest_d[-len(cx_bigest_d):] dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] ind_largest = len(cx_bigest_d) - len(cx_bigest_d) + np.argmin(dists_d) - + cx_bigest_d_big[0] = cx_bigest_d[ind_largest] cy_biggest_d_big[0] = cy_biggest_d[ind_largest] except Exception as why: @@ -3151,7 +3149,7 @@ class Eynollah: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] contours_only_text_parent = [] - + else: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] @@ -3159,7 +3157,7 @@ class Eynollah: else: contours_only_text, hir_on_text = return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - + if len(contours_only_text_parent) > 0: areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) @@ -3185,7 +3183,7 @@ class Eynollah: txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) - + if not self.curved_line: if self.light_version: if self.textline_light: @@ -3199,13 +3197,13 @@ class Eynollah: slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) else: - + scale_param = 1 all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons = small_textlines_to_parent_adherence2(all_found_textline_polygons, textline_mask_tot_ea, num_col_classifier) all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - + if self.full_layout: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) @@ -3224,12 +3222,12 @@ class Eynollah: if self.plotter: self.plotter.save_plot_of_layout(text_regions_p, image_page) self.plotter.save_plot_of_layout_all(text_regions_p, image_page) - + pixel_img = 4 polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, kernel=KERNEL, curved_line=self.curved_line) pixel_lines = 6 - + if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3250,12 +3248,12 @@ class Eynollah: else: regions_without_separators_d = regions_without_separators_d.astype(np.uint8) regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) - + if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) + boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) #print(boxes_d,'boxes_d') #img_once = np.zeros((textline_mask_tot_d.shape[0],textline_mask_tot_d.shape[1])) @@ -3273,19 +3271,23 @@ class Eynollah: else: order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) - self.logger.info("Job done in %.1fs", time.time() - t0) - ##return pcgts + pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) + self.logger.info("Job done in %.1fs", time.time() - t0) + if not self.dir_in: + return pcgts + else: + contours_only_text_parent_h = None + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - contours_only_text_parent_h = None - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) - else: - contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables) - self.logger.info("Job done in %.1fs", time.time() - t0) - ##return pcgts + contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) + pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables) + self.logger.info("Job done in %.1fs", time.time() - t0) + if not self.dir_in: + return pcgts + + if self.dir_in: self.writer.write_pagexml(pcgts) #self.logger.info("Job done in %.1fs", time.time() - t0) if self.dir_in: diff --git a/qurator/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json similarity index 99% rename from qurator/eynollah/ocrd-tool.json rename to src/eynollah/ocrd-tool.json index 8a2cb95..4551168 100644 --- a/qurator/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.3.0", + "version": "0.3.1", "git_url": "https://github.com/qurator-spk/eynollah", "tools": { "ocrd-eynollah-segment": { diff --git a/qurator/eynollah/ocrd_cli.py b/src/eynollah/ocrd_cli.py similarity index 100% rename from qurator/eynollah/ocrd_cli.py rename to src/eynollah/ocrd_cli.py diff --git a/qurator/eynollah/plot.py b/src/eynollah/plot.py similarity index 100% rename from qurator/eynollah/plot.py rename to src/eynollah/plot.py diff --git a/qurator/eynollah/processor.py b/src/eynollah/processor.py similarity index 97% rename from qurator/eynollah/processor.py rename to src/eynollah/processor.py index ccec456..1bd190e 100644 --- a/qurator/eynollah/processor.py +++ b/src/eynollah/processor.py @@ -42,7 +42,7 @@ class EynollahProcessor(Processor): page = pcgts.get_Page() # XXX loses DPI information # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename + image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename eynollah_kwargs = { 'dir_models': self.resolve_resource(self.parameter['models']), 'allow_enhancement': False, diff --git a/qurator/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py similarity index 100% rename from qurator/eynollah/utils/__init__.py rename to src/eynollah/utils/__init__.py diff --git a/qurator/eynollah/utils/contour.py b/src/eynollah/utils/contour.py similarity index 100% rename from qurator/eynollah/utils/contour.py rename to src/eynollah/utils/contour.py diff --git a/qurator/eynollah/utils/counter.py b/src/eynollah/utils/counter.py similarity index 100% rename from qurator/eynollah/utils/counter.py rename to src/eynollah/utils/counter.py diff --git a/qurator/eynollah/utils/drop_capitals.py b/src/eynollah/utils/drop_capitals.py similarity index 100% rename from qurator/eynollah/utils/drop_capitals.py rename to src/eynollah/utils/drop_capitals.py diff --git a/qurator/eynollah/utils/is_nan.py b/src/eynollah/utils/is_nan.py similarity index 100% rename from qurator/eynollah/utils/is_nan.py rename to src/eynollah/utils/is_nan.py diff --git a/qurator/eynollah/utils/marginals.py b/src/eynollah/utils/marginals.py similarity index 100% rename from qurator/eynollah/utils/marginals.py rename to src/eynollah/utils/marginals.py diff --git a/qurator/eynollah/utils/pil_cv2.py b/src/eynollah/utils/pil_cv2.py similarity index 100% rename from qurator/eynollah/utils/pil_cv2.py rename to src/eynollah/utils/pil_cv2.py diff --git a/qurator/eynollah/utils/resize.py b/src/eynollah/utils/resize.py similarity index 100% rename from qurator/eynollah/utils/resize.py rename to src/eynollah/utils/resize.py diff --git a/qurator/eynollah/utils/rotate.py b/src/eynollah/utils/rotate.py similarity index 100% rename from qurator/eynollah/utils/rotate.py rename to src/eynollah/utils/rotate.py diff --git a/qurator/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py similarity index 100% rename from qurator/eynollah/utils/separate_lines.py rename to src/eynollah/utils/separate_lines.py diff --git a/qurator/eynollah/utils/xml.py b/src/eynollah/utils/xml.py similarity index 100% rename from qurator/eynollah/utils/xml.py rename to src/eynollah/utils/xml.py diff --git a/qurator/eynollah/writer.py b/src/eynollah/writer.py similarity index 100% rename from qurator/eynollah/writer.py rename to src/eynollah/writer.py diff --git a/tests/test_counter.py b/tests/test_counter.py index 8ef0756..42bf074 100644 --- a/tests/test_counter.py +++ b/tests/test_counter.py @@ -1,5 +1,5 @@ from tests.base import main -from qurator.eynollah.utils.counter import EynollahIdCounter +from eynollah.utils.counter import EynollahIdCounter def test_counter_string(): c = EynollahIdCounter() diff --git a/tests/test_dpi.py b/tests/test_dpi.py index 510ffc5..3376bf4 100644 --- a/tests/test_dpi.py +++ b/tests/test_dpi.py @@ -1,6 +1,6 @@ import cv2 from pathlib import Path -from qurator.eynollah.utils.pil_cv2 import check_dpi +from eynollah.utils.pil_cv2 import check_dpi from tests.base import main def test_dpi(): diff --git a/tests/test_run.py b/tests/test_run.py index b1137e7..2596dad 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -2,7 +2,7 @@ from os import environ from pathlib import Path from ocrd_utils import pushd_popd from tests.base import CapturingTestCase as TestCase, main -from qurator.eynollah.cli import main as eynollah_cli +from eynollah.cli import main as eynollah_cli testdir = Path(__file__).parent.resolve() diff --git a/tests/test_smoke.py b/tests/test_smoke.py index d069479..252213f 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1,7 +1,7 @@ def test_utils_import(): - import qurator.eynollah.utils - import qurator.eynollah.utils.contour - import qurator.eynollah.utils.drop_capitals - import qurator.eynollah.utils.drop_capitals - import qurator.eynollah.utils.is_nan - import qurator.eynollah.utils.rotate + import eynollah.utils + import eynollah.utils.contour + import eynollah.utils.drop_capitals + import eynollah.utils.drop_capitals + import eynollah.utils.is_nan + import eynollah.utils.rotate diff --git a/tests/test_xml.py b/tests/test_xml.py index 8422fd1..09a6ddf 100644 --- a/tests/test_xml.py +++ b/tests/test_xml.py @@ -1,5 +1,5 @@ from pytest import main -from qurator.eynollah.utils.xml import create_page_xml +from eynollah.utils.xml import create_page_xml from ocrd_models.ocrd_page import to_xml PAGE_2019 = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'