From 5a3de3b42db5d92e7743e49c43315d0e98e679cd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 18:14:24 +0100 Subject: [PATCH] column detection: improve, aided by vseps whenever possible - `find_number_of_columns_in_document`: retain vertical separators and pass to `find_num_col` for each vertical split - `return_boxes_of_images_by_order_of_reading_new`: reconstruct the vertical separators from the segmentation mask and the separator bboxes; pass it on to `find_num_col` everywhere - `return_boxes_of_images_by_order_of_reading_new`: no need to try-catch `find_num_col` anymore - `return_boxes_of_images_by_order_of_reading_new`: when a vertical split has too few columns, * do not raise but lower the threshold `multiplier` responsible for allowing gaps as column boundaries * do not pass the `num_col_classifier` (i.e. expected number of resulting columns) of the entire page to the iterative `find_num_col` for each existing column, but only the portion of that span --- src/eynollah/utils/__init__.py | 97 ++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 0f2dac3..43d5d75 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import List, Tuple from logging import getLogger import time import math @@ -1315,7 +1315,35 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): +def find_number_of_columns_in_document( + region_pre_p: np.ndarray, + num_col_classifier: int, + tables: bool, + label_seps: int, + contours_h: List[np.ndarray] = None, + logger=None +) -> Tuple[int, List[int], np.ndarray, List[int], np.ndarray]: + """ + Extract vertical and horizontal separators, vertical splits and horizontal column boundaries on page. + + Arguments: + * region_pre_p: segmentation map of the page + * num_col_classifier: predicted (expected) number of columns of the page + * tables: whether tables may be present + * label_seps: segmentation map class label for separators + * contours_h: polygons of potential headings (serving as additional horizontal separators) + * logger + + Returns: a tuple of + * the actual number of columns found + * the x coordinates of the column boundaries + * an array of the separators (bounding boxes and types) + * the y coordinates of the page splits + * a mask of the separators + """ + if logger is None: + logger = getLogger(__package__) + separators_closeup = 1 * (region_pre_p == label_seps) separators_closeup[0:110] = 0 separators_closeup[-150:] = 0 @@ -1483,8 +1511,11 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, num_big_parts += 1 try: num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], - num_col_classifier, tables, multiplier=7.0) - # print("big part %d:%d has %d columns" % (top, bot, num_col + 1), peaks_neg_fin) + num_col_classifier, tables, + vertical_separators=1 * (vertical[top: bot] > 0), + multiplier=7.0) + logger.debug("big part %d:%d has %d columns", top, bot, num_col + 1) + # print(peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1522,7 +1553,8 @@ def return_boxes_of_images_by_order_of_reading_new( * matrix_of_seps: type and coordinates of horizontal and vertical separators, as well as headings * num_col_classifier: predicted number of columns for the entire page - * erosion_hurts: bool + * erosion_hurts: whether region masks have already been eroded + (and thus gaps can be expected to be wider) * tables: bool * right2left_readingorder: whether to invert the default left-to-right order @@ -1578,6 +1610,12 @@ def return_boxes_of_images_by_order_of_reading_new( height_tot, width_tot = regions_without_separators.shape big_part = 22 * height_tot // 100 # percent height _, ccomps, cstats, _ = cv2.connectedComponentsWithStats(regions_without_separators.astype(np.uint8)) + args_ver = matrix_of_seps_ch[:, 9] == 1 + mask_ver = np.zeros_like(regions_without_separators, dtype=bool) + for i in np.flatnonzero(args_ver): + mask_ver[matrix_of_seps_ch[i, 6]: matrix_of_seps_ch[i, 7], + matrix_of_seps_ch[i, 2]: matrix_of_seps_ch[i, 3]] = True + vertical_seps = 1 * ((regions_with_separators == 6) & mask_ver) for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) @@ -1589,16 +1627,13 @@ def return_boxes_of_images_by_order_of_reading_new( #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= # 0.1 * (np.abs(bot-top))): - try: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - # we do not expect to get all columns in small parts (headings etc.): - num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7., - unbalanced=True) - except: - peaks_neg_fin=[] - num_col = 0 + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, vertical_separators=vertical_seps[top: bot], + multiplier=6. if erosion_hurts else 7., + unbalanced=True) try: if ((len(peaks_neg_fin) + 1 < num_col_classifier or num_col_classifier == 6) and @@ -1606,12 +1641,18 @@ def return_boxes_of_images_by_order_of_reading_new( bot - top >= big_part): # found too few columns here #print('burda') + logger.debug("searching for more than %d columns in big part %d:%d", + len(peaks_neg_fin) + 1, top, bot) peaks_neg_fin_org = np.copy(peaks_neg_fin) #print("peaks_neg_fin_org", peaks_neg_fin_org) - if len(peaks_neg_fin)==0: + if len(peaks_neg_fin) == 0: num_col, peaks_neg_fin = find_num_col( regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=3., unbalanced=True) + num_col_classifier, tables, + vertical_separators=vertical_seps[top: bot], + # try to be less strict (lower threshold than above) + multiplier=7. if erosion_hurts else 8., + unbalanced=True) #print(peaks_neg_fin,'peaks_neg_fin') peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] @@ -1625,22 +1666,19 @@ def return_boxes_of_images_by_order_of_reading_new( # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) # plt.title("vertical projection (sum over y)") # plt.show() - try: - _, peaks_neg_fin1 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=7.) - except: - peaks_neg_fin1 = [] - try: - _, peaks_neg_fin2 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=5.) - except: - peaks_neg_fin2 = [] + # try to get more peaks with different multipliers + num_col_expected = round((right - left) / width_tot * num_col_classifier) + args = regions_without_separators[top:bot, left:right], num_col_expected, tables + kwargs = dict(vertical_separators=vertical_seps[top: bot, left:right]) + _, peaks_neg_fin1 = find_num_col(*args, **kwargs, multiplier=7.) + _, peaks_neg_fin2 = find_num_col(*args, **kwargs, multiplier=5.) if len(peaks_neg_fin1) >= len(peaks_neg_fin2): peaks_neg_fin = peaks_neg_fin1 else: peaks_neg_fin = peaks_neg_fin2 + # print(peaks_neg_fin) + logger.debug("found %d additional column boundaries in %d:%d", + len(peaks_neg_fin), left, right) # add offset to local result peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') @@ -1652,6 +1690,7 @@ def return_boxes_of_images_by_order_of_reading_new( #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + #print("found more peaks than at first glance", peaks_neg_fin_rev, peaks_neg_fin_org) peaks_neg_fin = peaks_neg_fin_rev else: peaks_neg_fin = peaks_neg_fin_org