From 192b9111e31eee4758364b1fe9f63f80aa533ec2 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 22 Apr 2025 00:23:01 +0200 Subject: [PATCH 1/4] updating eynollah README, how to use it for use cases --- README.md | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 16ac661..3cfb587 100644 --- a/README.md +++ b/README.md @@ -50,10 +50,16 @@ For documentation on methods and models, have a look at [`models.md`](https://gi In case you want to train your own model with Eynollah, have a look at [`train.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/train.md). ## Usage -The command-line interface can be called like this: + +Eynollah has four key use cases: layout analysis, binarization, OCR, and machine-based reading order. + +### Layout +The layout module is responsible for detecting layouts, identifying text lines, and determining reading order using both heuristic methods or a machine-based reading order detection model. It's important to note that this functionality should not be confused with the machine-based-reading-order use case. The latter, still under development, focuses specifically on determining the reading order for a given layout in an XML file. In contrast, layout detection takes an image as input, and after detecting the layout, it can also determine the reading order using a machine-based model. 
+ +The command-line interface for layout can be called like this: ```sh -eynollah \ +eynollah layout \ -i | -di \ -o \ -m \ @@ -66,6 +72,7 @@ The following options can be used to further configure the processing: |-------------------|:-------------------------------------------------------------------------------| | `-fl` | full layout analysis including all steps and segmentation classes | | `-light` | lighter and faster but simpler method for main region detection and deskewing | +| `-tll` | enables the light textline method; should be passed together with `-light` | | `-tab` | apply table detection | | `-ae` | apply enhancement (the resulting image is saved to the output directory) | | `-as` | apply scaling | @@ -83,6 +90,34 @@ The following options can be used to further configure the processing: If no option is set, the tool performs layout detection of main regions (background, text, images, separators and marginals). The best output quality is produced when RGB images are used as input rather than greyscale or binarized images. +### Binarization +Document Image Binarization + +The command-line interface for binarization of single image can be called like this: + +```sh +eynollah binarization \ + -m \ + \ + +``` + +and for processing all images in a directory like this: + +```sh +eynollah binarization \ + -m \ + -di \ + -do +``` + +### OCR +Under development + +### Machine-based-reading-order +Under development + + #### Use as OCR-D processor Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) [processor](https://ocr-d.de/en/spec/cli), From 83211ae684513ef7f50ee88e0f641702441cde1f Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 7 May 2025 12:33:03 +0200 Subject: [PATCH 2/4] In the case of skip_layout_and_reading_order, the confidence value was not set correctly, leading to an error while writing to the XML file. 
--- src/eynollah/eynollah.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 022cf0a..ec8d887 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4328,7 +4328,7 @@ class Eynollah: polygons_lines_xml = [] contours_tables = [] ocr_all_textlines = None - conf_contours_textregions =None + conf_contours_textregions = [0] pcgts = self.writer.build_pagexml_no_full_layout( cont_page, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, From 21ec4fbfb538b40f0d06f55bf8c92f4ca2ebf10c Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 7 May 2025 14:04:01 +0200 Subject: [PATCH 3/4] The text region coordinates are now correctly written into the XML output when using the skip layout and reading order option --- src/eynollah/eynollah.py | 2 +- src/eynollah/writer.py | 30 ++++++++++++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index ec8d887..6da003b 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4333,7 +4333,7 @@ class Eynollah: cont_page, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions) + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions, self.skip_layout_and_reading_order) return pcgts #print("text region early -1 in %.1fs", time.time() - t0) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 92e353f..e589fd4 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -168,7 +168,7 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: 
f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines, conf_contours_textregion): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines, conf_contours_textregion, skip_layout_reading_order=False): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -184,7 +184,7 @@ class EynollahXmlWriter(): for mm in range(len(found_polygons_text_region)): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord), conf=conf_contours_textregion[mm]), + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord, skip_layout_reading_order), conf=conf_contours_textregion[mm]), ) #textregion.set_conf(conf_contours_textregion[mm]) page.add_TextRegion(textregion) @@ -303,18 +303,28 @@ class EynollahXmlWriter(): return pcgts - def calculate_polygon_coords(self, contour, page_coord): + def calculate_polygon_coords(self, contour, page_coord, skip_layout_reading_order=False): self.logger.debug('enter calculate_polygon_coords') coords = '' for value_bbox in contour: - if len(value_bbox) == 2: - coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[1] + 
page_coord[0]) / self.scale_y)) + if skip_layout_reading_order: + if len(value_bbox) == 2: + coords += str(int((value_bbox[0]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[1]) / self.scale_y)) + else: + coords += str(int((value_bbox[0][0]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[0][1]) / self.scale_y)) else: - coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) + if len(value_bbox) == 2: + coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) + else: + coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) coords=coords + ' ' return coords[:-1] From 7a22e51f5d2ebff1bd0239c913eb1ed13d97fe77 Mon Sep 17 00:00:00 2001 From: cneud <952378+cneud@users.noreply.github.com> Date: Wed, 14 May 2025 21:56:03 +0200 Subject: [PATCH 4/4] resolve some comments from review --- README.md | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 3cfb587..8a2c4a4 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,8 @@ * Output in [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML) * [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface -:warning: Development is currently focused on achieving the best possible quality of results for a wide variety of historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome. +:warning: Development is currently focused on achieving the best possible quality of results for a wide variety of +historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome. 
## Installation Python `3.8-3.11` with Tensorflow `<2.13` on Linux are currently supported. @@ -42,7 +43,7 @@ cd eynollah; pip install -e . Alternatively, you can run `make install` or `make install-dev` for editable installation. ## Models -Pre-trained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/) or [huggingface](https://huggingface.co/SBB?search_models=eynollah). +Pretrained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/) or [huggingface](https://huggingface.co/SBB?search_models=eynollah). For documentation on methods and models, have a look at [`models.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/models.md). @@ -50,13 +51,17 @@ For documentation on methods and models, have a look at [`models.md`](https://gi In case you want to train your own model with Eynollah, have a look at [`train.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/train.md). ## Usage +Eynollah supports four use cases: layout analysis (segmentation), binarization, text recognition (OCR), +and (trainable) reading order detection. -Eynollah has four key use cases: layout analysis, binarization, OCR, and machine-based reading order. +### Layout Analysis +The layout analysis module is responsible for detecting layouts, identifying text lines, and determining reading order +using either heuristic methods or a machine-based reading order detection model. -### Layout -The layout module is responsible for detecting layouts, identifying text lines, and determining reading order using both heuristic methods or a machine-based reading order detection model. It's important to note that this functionality should not be confused with the machine-based-reading-order use case. The latter, still under development, focuses specifically on determining the reading order for a given layout in an XML file. 
In contrast, layout detection takes an image as input, and after detecting the layout, it can also determine the reading order using a machine-based model. +Note that there are currently two supported ways for reading order detection: either as part of layout analysis based +on image input, or, currently under development, for given layout analysis results based on PAGE-XML data as input. -The command-line interface for layout can be called like this: +The command-line interface for layout analysis can be called like this: ```sh eynollah layout \ @@ -87,18 +92,19 @@ The following options can be used to further configure the processing: | `-sp ` | save cropped page image to this directory | | `-sa ` | save all (plot, enhanced/binary image, layout) to this directory | -If no option is set, the tool performs layout detection of main regions (background, text, images, separators and marginals). +If no option is set, the tool performs layout detection of main regions (background, text, images, separators +and marginals). The best output quality is produced when RGB images are used as input rather than greyscale or binarized images. ### Binarization -Document Image Binarization +The binarization module performs document image binarization using pretrained pixelwise segmentation models. The command-line interface for binarization of single image can be called like this: ```sh eynollah binarization \ - -m \ - \ + -m \ + \ ``` @@ -117,9 +123,7 @@ Under development ### Machine-based-reading-order Under development - #### Use as OCR-D processor - Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) [processor](https://ocr-d.de/en/spec/cli), formally described in [`ocrd-tool.json`](https://github.com/qurator-spk/eynollah/tree/main/src/eynollah/ocrd-tool.json). 
@@ -127,7 +131,6 @@ In this case, the source image file group with (preferably) RGB images should be ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models 2022-04-05 - If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynollah behaves as follows: - existing regions are kept and ignored (i.e. in effect they might overlap segments from Eynollah results) - existing annotation (and respective `AlternativeImage`s) are partially _ignored_: @@ -138,7 +141,6 @@ If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynol (because some other preprocessing step was in effect like `denoised`), then the output PAGE-XML will be based on that as new top-level (`@imageFilename`) - ocrd-eynollah-segment -I OCR-D-XYZ -O OCR-D-SEG -P models 2022-04-05 Still, in general, it makes more sense to add other workflow steps **after** Eynollah.