From 2641171fb19a8832e0981495040b1f370af37b10 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 27 Apr 2026 00:42:31 +0200 Subject: [PATCH] =?UTF-8?q?return=5Fboxes=5F...order=5Fof=5Freading...:=20?= =?UTF-8?q?avoid=20negative=20slices=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix rare bug when horizontal separators are detected by the very top (of a major vertical part of the page), causing box intervals to become negative --- src/eynollah/eynollah.py | 2 +- src/eynollah/utils/__init__.py | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 74d057d..d7b814b 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -1175,7 +1175,7 @@ class Eynollah: boxes, textline_mask_tot ): - + assert np.any(textline_mask_tot) self.logger.debug("enter do_order_of_regions") contours_only_text_parent = ensure_array(contours_only_text_parent) contours_only_text_parent_h = ensure_array(contours_only_text_parent_h) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 1d48ac5..1888752 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1056,8 +1056,11 @@ def order_of_regions(textline_mask, contours_main, contours_head, contours_drop, * the array of contour indexes for the respective type (i.e. into contours_main or contours_head or contours_drop) """ - ##plt.imshow(textline_mask) - ##plt.show() + total = len(contours_main) + len(contours_head) + len(contours_drop) + assert total == 0 or np.any(textline_mask) + + # ax1 = plt.subplot(2, 1, 1, title="order_of_regions textline_mask") + # plt.imshow(textline_mask, aspect='auto') y = textline_mask.sum(axis=1) # horizontal projection profile y_padded = np.zeros(len(y) + 40) y_padded[20 : len(y) + 20] = y @@ -1066,14 +1069,16 @@ def order_of_regions(textline_mask, contours_main, contours_head, contours_drop, #z = gaussian_filter1d(y_padded, sigma_gaus) #peaks, _ = find_peaks(z, height=0) #peaks = peaks - 20 - ##plt.plot(z) - ##plt.show() + # ax2 = plt.subplot(2, 1, 2, title="smoothed horizontal projection", sharex=ax1) + # plt.plot(y) zneg_rev = np.max(y_padded) - y_padded zneg = np.zeros(len(zneg_rev) + 40) zneg[20 : len(zneg_rev) + 20] = zneg_rev zneg = gaussian_filter1d(zneg, sigma_gaus) peaks_neg, _ = find_peaks(zneg, height=0) + # plt.vlines(peaks_neg - 40, 0, None, label="peaks") + # plt.show() peaks_neg = peaks_neg - 20 - 20 peaks_neg_new = np.array([0] + @@ -1091,7 +1096,6 @@ def order_of_regions(textline_mask, contours_main, contours_head, contours_drop, # assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new) # assert not len(cy_drop) or np.min(peaks_neg_new) <= np.min(cy_drop) and np.max(cy_drop) <= np.max(peaks_neg_new) - total = len(contours_main) + len(contours_head) + len(contours_drop) slice_main = slice(0, len(contours_main)) slice_head = slice(len(contours_main), len(contours_main) + len(contours_head)) @@ -1778,6 +1782,13 @@ def return_boxes_of_images_by_order_of_reading_new( y_max_hor_some = np.append(y_max_hor_some, [label_top, label_bot + 2]) cy_hor_some = np.append(cy_hor_some, [label_top - 1, label_bot + 1]) + # ensure no seps are out of bounds + x_min_hor_some = np.maximum(0, np.minimum(width_tot, x_min_hor_some)) + x_max_hor_some = np.maximum(0, np.minimum(width_tot, x_max_hor_some)) + y_min_hor_some = np.maximum(0, np.minimum(height_tot, y_min_hor_some)) + y_max_hor_some = np.maximum(0, np.minimum(height_tot, y_max_hor_some)) + cy_hor_some = np.maximum(0, np.minimum(height_tot, cy_hor_some)) + if right2left_readingorder: x_max_hor_some = width_tot - x_min_hor_some x_min_hor_some = width_tot - x_max_hor_some