From 92c1e824dc0683fc74eaa037cabcdb41f49cf677 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 26 Sep 2025 23:05:47 +0200 Subject: [PATCH 1/5] CD: master is now main --- .github/workflows/build-docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-docker.yml index d77958b..d2869ed 100644 --- a/.github/workflows/build-docker.yml +++ b/.github/workflows/build-docker.yml @@ -2,7 +2,7 @@ name: CD on: push: - branches: [ "master" ] + branches: [ "main" ] workflow_dispatch: # run manually jobs: From a48e52c00eef1b1e8c85b25bf4d95e46ecaf0cf1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 29 Sep 2025 13:49:18 +0200 Subject: [PATCH 2/5] :memo: extend changelog for v0.5.0 --- CHANGELOG.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ad9a09..bfdd1ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,12 +11,37 @@ Fixed: * restoring the contour in the original image caused an error due to an empty tuple, #154 +Changed + + * CLIs: read only allowed filename suffixes (image or XML) with `--dir_in` + * CLIs: make all output options required, and `-i` / `-di` required but mutually exclusive + * ocr CLI: drop redundant `-brb` in favour of just `-dib` + * APIs: move all input/output path options from class (kwarg and attribute) to `run` kwarg + * layout textlines: polygonal also without `-cl` + Added: * `eynollah machine-based-reading-order` CLI to run reading order detection, #175 * `eynollah enhancement` CLI to run image enhancement, #175 * Improved models for page extraction and reading order detection, #175 +Merged PRs: + + * better machine based reading order + layout and textline + ocr by @vahidrezanezhad in https://github.com/qurator-spk/eynollah/pull/175 + * CI: pypi by @kba in 
https://github.com/qurator-spk/eynollah/pull/154 + * CI: Use most recent actions/setup-python@v5 by @kba in https://github.com/qurator-spk/eynollah/pull/157 + * update docker by @bertsky in https://github.com/qurator-spk/eynollah/pull/159 + * Ocrd fixes by @kba in https://github.com/qurator-spk/eynollah/pull/167 + * Updating readme for eynollah use cases cli by @kba in https://github.com/qurator-spk/eynollah/pull/166 + * OCR-D processor: expose reading_order_machine_based by @bertsky in https://github.com/qurator-spk/eynollah/pull/171 + * prepare release v0.5.0: fix logging by @bertsky in https://github.com/qurator-spk/eynollah/pull/180 + * mb_ro_on_layout: remove copy-pasta code not actually used by @kba in https://github.com/qurator-spk/eynollah/pull/181 + * prepare release v0.5.0: improve CLI docstring, refactor I/O path options from class to run kwargs, increase test coverage @bertsky in #182 + * prepare release v0.5.0: fix for OCR doit subtest by @bertsky in https://github.com/qurator-spk/eynollah/pull/183 + * Prepare release v0.5.0 by @kba in https://github.com/qurator-spk/eynollah/pull/178 + * updating eynollah README, how to use it for use cases by @vahidrezanezhad in https://github.com/qurator-spk/eynollah/pull/156 + * add feedback to command line interface by @michalbubula in https://github.com/qurator-spk/eynollah/pull/170 + ## [0.4.0] - 2025-04-07 Fixed: From 5725e4fd1f6bab4c1152c88cc28c44c0e8c2c584 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 1 Oct 2025 15:58:03 +0200 Subject: [PATCH 3/5] =?UTF-8?q?-Continue=20processing=20when=20num=5Fcol?= =?UTF-8?q?=20is=20None=20but=20textregions=20exist.=20-Convert=20marginal?= =?UTF-8?q?-only=20=20to=20main=20body=20if=20no=20main=20body=20is=20pres?= =?UTF-8?q?ent.=20-Reset=20deskew=20angle=20to=200=20when=20text=20region?= =?UTF-8?q?=20density=20(textregion=20area=20to=20page=20area)=20<=200.3?= =?UTF-8?q?=20and=20angle=20>=2045=C2=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- src/eynollah/eynollah.py | 41 +++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 20954a0..5e8412e 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -1,4 +1,4 @@ -# pylint: disable=no-member,invalid-name,line-too-long,missing-function-docstring,missing-class-docstring,too-many-branches +#run_single# pylint: disable=no-member,invalid-name,line-too-long,missing-function-docstring,missing-class-docstring,too-many-branches # pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member # pylint: disable=too-many-public-methods,too-many-arguments,too-many-instance-attributes,too-many-public-methods, # pylint: disable=consider-using-enumerate @@ -2245,6 +2245,7 @@ class Eynollah: ##mask_texts_only = cv2.dilate(mask_texts_only, KERNEL, iterations=1) mask_texts_only = cv2.dilate(mask_texts_only, kernel=np.ones((2,2), np.uint8), iterations=1) + mask_images_only=(prediction_regions_org[:,:] ==2)*1 polygons_lines_xml, hir_lines_xml = return_contours_of_image(mask_lines_only) @@ -2280,20 +2281,18 @@ class Eynollah: text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) - #plt.imshow(textline_mask_tot_ea) #plt.show() textline_mask_tot_ea[(text_regions_p_true==0) | (text_regions_p_true==4) ] = 0 - #plt.imshow(textline_mask_tot_ea) #plt.show() #print("inside 4 ", time.time()-t_in) self.logger.debug("exit get_regions_light_v") - return text_regions_p_true, erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin, confidence_matrix + return text_regions_p_true, erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin, confidence_matrix, polygons_of_only_texts else: img_bin = resize_image(img_bin,img_height_h, 
img_width_h ) self.logger.debug("exit get_regions_light_v") - return None, erosion_hurts, None, textline_mask_tot_ea, img_bin, None + return None, erosion_hurts, None, textline_mask_tot_ea, img_bin, None, None def get_regions_from_xy_2models(self,img,is_image_enhanced, num_col_classifier): self.logger.debug("enter get_regions_from_xy_2models") @@ -2386,7 +2385,7 @@ class Eynollah: text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) self.logger.debug("exit get_regions_from_xy_2models") - return text_regions_p_true, erosion_hurts, polygons_lines_xml + return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_only_texts except: if self.input_binary: prediction_bin = np.copy(img_org) @@ -2436,7 +2435,7 @@ class Eynollah: erosion_hurts = True self.logger.debug("exit get_regions_from_xy_2models") - return text_regions_p_true, erosion_hurts, polygons_lines_xml + return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_only_texts def do_order_of_regions_full_layout( self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): @@ -4701,7 +4700,7 @@ class Eynollah: self.logger.info("Step 2/5: Basic Processing Mode") self.logger.info("Skipping layout analysis and reading order detection") - _ ,_, _, textline_mask_tot_ea, img_bin_light, _ = \ + _ ,_, _, textline_mask_tot_ea, img_bin_light, _,_= \ self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier, skip_layout_and_reading_order=self.skip_layout_and_reading_order) @@ -4768,10 +4767,10 @@ class Eynollah: if self.light_version: self.logger.info("Using light version processing") - text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light, confidence_matrix = \ + text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light, confidence_matrix, polygons_text_early = \ self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier) 
#print("text region early -2 in %.1fs", time.time() - t0) - + if num_col_classifier == 1 or num_col_classifier ==2: if num_col_classifier == 1: img_w_new = 1000 @@ -4793,9 +4792,9 @@ class Eynollah: #self.logger.info("run graphics %.1fs ", time.time() - t1t) #print("text region early -3 in %.1fs", time.time() - t0) textline_mask_tot_ea_org = np.copy(textline_mask_tot_ea) - #print("text region early -4 in %.1fs", time.time() - t0) + else: - text_regions_p_1 ,erosion_hurts, polygons_lines_xml = \ + text_regions_p_1 ,erosion_hurts, polygons_lines_xml, polygons_text_early = \ self.get_regions_from_xy_2models(img_res, is_image_enhanced, num_col_classifier) self.logger.info(f"Textregion detection took {time.time() - t1:.1f}s") @@ -4811,7 +4810,7 @@ class Eynollah: #plt.show() self.logger.info(f"Layout analysis complete ({time.time() - t1:.1f}s)") - if not num_col: + if not num_col and len(polygons_text_early) == 0: self.logger.info("No columns detected - generating empty PAGE-XML") pcgts = self.writer.build_pagexml_no_full_layout( @@ -4848,6 +4847,15 @@ class Eynollah: textline_mask_tot, text_regions_p, image_page_rotated = \ self.run_marginals(image_page, textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) + + + if image_page.shape[0]!=0 and image_page.shape[1]!=0: + # if ratio of text regions to page area is smaller than 0.3, deskew angle is not allowed to exceed 45 + if ( ( text_regions_p[:,:]==1).sum() + (text_regions_p[:,:]==4).sum() ) / float(image_page.shape[0]*image_page.shape[1] ) <= 0.3 and abs(slope_deskew) > 45: + slope_deskew = 0 + + if (text_regions_p[:,:]==1).sum() == 0: + text_regions_p[:,:][text_regions_p[:,:]==4] = 1 self.logger.info("Step 3/5: Text Line Detection") @@ -4894,6 +4902,8 @@ class Eynollah: ###min_con_area = 0.000005 contours_only_text, hir_on_text = return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) + + if 
len(contours_only_text_parent) > 0: areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) @@ -4995,7 +5005,9 @@ class Eynollah: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] #contours_only_text_parent = [] - + + boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) + if not len(contours_only_text_parent): # stop early empty_marginals = [[]] * len(polygons_of_marginals) @@ -5031,7 +5043,6 @@ class Eynollah: contours_only_text_parent, self.image, slope_first, confidence_matrix, map=self.executor.map) #print("text region early 4 in %.1fs", time.time() - t0) boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) - boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) #print("text region early 5 in %.1fs", time.time() - t0) ## birdan sora chock chakir if not self.curved_line: From 8869c20c33c673e02e4f60081b96a8bd71d823d2 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 6 Oct 2025 14:53:47 +0200 Subject: [PATCH 4/5] updating CHANGELOG for v0.5.0 --- CHANGELOG.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bfdd1ce..70e8854 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,12 @@ Versioned according to [Semantic Versioning](http://semver.org/). Fixed: * restoring the contour in the original image caused an error due to an empty tuple, #154 + * removed NumPy warnings (fixed issue #158) + * fixed issue #124 + * Drop capitals are now handled separately from their corresponding textline + * Marginals are now divided into left and right. Their reading order is written first for left marginals, then for right marginals, and within each side from top to bottom + * Added a new page extraction model. 
Instead of bounding boxes, it outputs page contours in the XML file, improving results for skewed pages + * Improved reading order for cases where a textline is segmented into multiple smaller textlines Changed @@ -24,6 +30,20 @@ Added: * `eynollah machine-based-reading-order` CLI to run reading order detection, #175 * `eynollah enhancement` CLI to run image enhancement, #175 * Improved models for page extraction and reading order detection, #175 + * For the lightweight version (layout and textline detection), thresholds are now assigned to the artificial class. Users can apply these thresholds to improve detection of isolated textlines and regions. To counteract the drawback of thresholding, the skeleton of the artificial class is used to keep lines as thin as possible (resolved issues #163 and #161) + * Added and integrated trained CNN-RNN OCR models + * Added and integrated a trained TrOCR model + * Improved OCR detection to support vertical and curved textlines + * Introduced a new machine-based reading order model with rotation augmentation + * Optimized reading order speed by clustering text regions that belong to the same block, maintaining top-to-bottom order + * Implemented text merging across textlines based on hyphenation when a line ends with a hyphen + * Integrated image enhancement as a separate use case + * Added reading order functionality on the layout level as a separate use case + * CNN-RNN OCR models provide confidence scores for predictions + * Added OCR visualization: predicted OCR can be overlaid on an image of the same size as the input + * Introduced a threshold value for CNN-RNN OCR models, allowing users to filter out low-confidence textline predictions + * For OCR, users can specify a single model by name instead of always using the default model + * Under the OCR use case, if ground-truth XMLs and images are available, textline image and corresponding text extraction can now be performed Merged PRs: From 
4ffe6190d2c6b885b27330027f4a0d8fd97a32f6 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 9 Oct 2025 14:03:26 +0200 Subject: [PATCH 5/5] :memo: changelog --- CHANGELOG.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70e8854..5ca95a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,8 +10,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). Fixed: * restoring the contour in the original image caused an error due to an empty tuple, #154 - * removed NumPy warnings (fixed issue #158) - * fixed issue #124 + * removed NumPy warnings when calculating sigma and mean (fixed issue #158) + * fixed bug in `separate_lines.py`, #124 * Drop capitals are now handled separately from their corresponding textline * Marginals are now divided into left and right. Their reading order is written first for left marginals, then for right marginals, and within each side from top to bottom * Added a new page extraction model. Instead of bounding boxes, it outputs page contours in the XML file, improving results for skewed pages @@ -31,7 +31,7 @@ Added: * `eynollah enhancement` CLI to run image enhancement, #175 * Improved models for page extraction and reading order detection, #175 * For the lightweight version (layout and textline detection), thresholds are now assigned to the artificial class. Users can apply these thresholds to improve detection of isolated textlines and regions. 
To counteract the drawback of thresholding, the skeleton of the artificial class is used to keep lines as thin as possible (resolved issues #163 and #161) - * Added and integrated trained CNN-RNN OCR models + * Added and integrated trained CNN-RNN OCR models * Added and integrated a trained TrOCR model * Improved OCR detection to support vertical and curved textlines * Introduced a new machine-based reading order model with rotation augmentation @@ -43,7 +43,7 @@ Added: * Added OCR visualization: predicted OCR can be overlaid on an image of the same size as the input * Introduced a threshold value for CNN-RNN OCR models, allowing users to filter out low-confidence textline predictions * For OCR, users can specify a single model by name instead of always using the default model - * Under the OCR use case, if ground-truth XMLs and images are available, textline image and corresponding text extraction can now be performed + * Under the OCR use case, if Ground Truth XMLs and images are available, textline image and corresponding text extraction can now be performed Merged PRs: