return_boxes_of_images_by_order_of_reading_new: improve

- when analysing regions spanning across columns, disregard tiny regions (smaller than half the median size)
- if a region spans across columns by just a tiny fraction, and is therefore not good enough for a multi-col separator, then it should not be good enough for a multi-col box maker either
2026-02-21 00:41:56 +01:00 · 2025-11-28 17:58:44 +01:00 · 2025-11-28 17:58:44 +01:00 · 5abf0c1097
commit 5abf0c1097
parent b71bb80e3a
1 changed files with 8 additions and 4 deletions
--- a/src/eynollah/utils/init.py
+++ b/src/eynollah/utils/init.py
@ -1720,6 +1720,7 @@ def return_boxes_of_images_by_order_of_reading_new(
        # analyse connected components of regions to gain additional separators
        # and prepare a map for cross-column boxes
        ccounts = np.bincount(ccomps[top: bot].flatten())
+        ccounts_median = np.median(ccounts)
        col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(),
                                            minlength=ccounts.size)
                                for left, right in pairwise(peaks_neg_tot)])
@ -1727,6 +1728,9 @@ def return_boxes_of_images_by_order_of_reading_new(
        for label, label_count in enumerate(ccounts):
            if not label:
                continue
+            # ignore small labels for the purpose of finding multicol seps
+            if label_count < 0.5 * ccounts_median:
+                continue
            label_left, label_top, label_width, label_height, label_area = cstats[label]
            # if label_count < 0.9 * label_area:
            #     # mostly not in this part of the page
@ -1738,15 +1742,15 @@ def return_boxes_of_images_by_order_of_reading_new(
            label_bot = label_top + label_height
            label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1
            label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0]
+            if label_end - label_start < 2:
+                continue
+            if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2:
+                continue
            # store as dict for multi-column boxes:
            for start in range(label_start, label_end):
                labelcolmap.setdefault(start, list()).append(
                    (label_end, label_top, label_bot, sum(col_ccounts[start: label_end, label])))
            # make additional separators:
-            if label_end - label_start < 2:
-                continue
-            if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2:
-                continue
            x_min_hor_some = np.append(x_min_hor_some, [label_left] * 2)
            x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2)
            y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot])