From 4abc2ff57249e634c70cda665abc5d99429595d2 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Fri, 14 Nov 2025 03:05:02 +0100
Subject: [PATCH] rewrite/simplify manual reading order using recursive
 algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- rename `return_x_start_end_mothers_childs_and_type_of_reading_order`
  → `return_multicol_separators_x_start_end`, and drop all the analysis
  pertaining to mother/child relationships and full-span separators,
  also drop the separator unification rules;
  instead of the latter, try to combine neighbouring separators more
  generally: join column spans iff there is nothing in between
  (which also necessitates passing the region mask), and keep only
  one of every such redundant pair;
  add the top (of each page part) as full-span separator up front,
  and return separators already ordered by y
- `return_boxes_of_images_by_order_of_reading_new`:
  - also pass regions with separators, so they do not have to be
    reconstructed from the separator coordinates, and also contain
    images and other non-text region types, when trying to elongate
    separators to maximize their span (without introducing overlaps)
  - determine connected components of the region mask, i.e. labels
    and their respective bboxes, in order to
    1. gain additional multi-column separators, if possible
    2. avoid cutting through regions which do cross column boundaries
       later on
  - whenever adding a new bbox, first look up the label map to see if
    there are any multi-column regions extending to the right of the
    current column; if there are, then advance not just one column
    to the right, but as many as necessary to avoid cutting through
    these regions
  - new core algorithm: iterate separators sorted by y and then column
    by column, but whenever the next separator ends in the same column
    as the current one or even further left, recurse (i.e. finish that
    span first before continuing with the top iteration)
---
 src/eynollah/utils/__init__.py | 935 ++++++++++-----------------------
 1 file changed, 277 insertions(+), 658 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index f3dbae2..e00004f 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -32,289 +32,132 @@ def pairwise(iterable):
         yield a, b
         a = b
 
-def return_x_start_end_mothers_childs_and_type_of_reading_order(
-        peak_points, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some):
+def return_multicol_separators_x_start_end(
+        regions_without_separators, peak_points, top, bot,
+        x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some):
     """
     Analyse which separators overlap multiple column candidates,
     and how they overlap each other.
 
     Ignore separators not spanning multiple columns.
 
-    For the separators to be returned, try to join them when they are directly
-    adjacent horizontally but nearby vertically (and thus mutually compatible).
-    Also, mark any separators that already span the full width.
-
-    Furthermore, identify which pairs of (unjoined) separators span subsets of columns
-    of each other (disregarding vertical positions). Referring, respectively, to the
-    superset separators as  "mothers" and to the subset separators as "children",
-    retrieve information on which columns are spanned by separators with no mother,
-    and which columns are spanned by their children (if any).
-
-    Moreover, determine if there is any (column) overlap among the multi-span separators
-    with no mother, specifically (and thus, no simple box separation is possible).
+    For the separators to be returned, try to remove or unify them when there
+    is no region between them (vertically) and their neighbours.
 
     Arguments:
+        * the text mask (with all separators suppressed)
         * the x column coordinates
-        * the x start column index of the raw separators
-        * the x end column index of the raw separators
-        * the y center coordinate of the raw separators
-        * the y end coordinate of the raw separators
+        * the y start coordinate to consider in total
+        * the y end coordinate to consider in total
+        * the x start coordinate of the horizontal separators
+        * the x end coordinate of the horizontal separators
+        * the y center coordinate of the horizontal separators
+        * the y start coordinate of the horizontal separators
+        * the y end coordinate of the horizontal separators
 
     Returns:
         a tuple of:
-        * whether any top-level (no-mother) multi-span separators overlap each other
         * the x start column index of the resulting multi-span separators
         * the x end column index of the resulting multi-span separators
+        * the y start coordinate of the resulting multi-span separators
         * the y center coordinate of the resulting multi-span separators
         * the y end coordinate of the resulting multi-span separators
-        * the y center (for 1 representative) of the top-level (no-mother) multi-span separators
-        * the x start column index of the top-level (no-mother) multi-span separators
-        * the x end column index of the top-level (no-mother) multi-span separators
-        * whether any multi-span separators have super-spans of other (child) multi-span separators
-        * the y center (for 1 representative) of the top-level (no-mother) multi-span separators
-          which have super-spans of other (child) multi-span separators
-        * the x start column index of the top-level multi-span separators
-          which have super-spans of other (child) multi-span separators
-        * the x end column index of the top-level multi-span separators
-          which have super-spans of other (child) multi-span separators
-        * indexes of multi-span separators with full-width span
     """
 
-    x_start=[]
-    x_end=[]
-    len_sep=[]
-    y_mid=[]
-    y_max=[]
-    new_main_sep_y=[]
-    indexer=0
+    x_start = [0]
+    x_end = [len(peak_points) - 1]
+    y_min = [top]
+    y_mid = [top]
+    y_max = [top + 2]
+    indexer = 1
     for i in range(len(x_min_hor_some)):
         #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i])
         starting = x_min_hor_some[i] - peak_points
         min_start = np.flatnonzero(starting >= 0)[-1] # last left-of
         ending = x_max_hor_some[i] - peak_points
-        max_end = np.flatnonzero(ending < 0)[0] # first right-of
+        max_end = np.flatnonzero(ending <= 0)[0] # first right-of
         #print(indexer, "%d:%d" % (min_start, max_end))
 
         if (max_end-min_start)>=2:
             # column range of separator spans more than one column candidate
-            if (max_end-min_start)==(len(peak_points)-1):
-                # all columns (i.e. could be true new y splitter)
-                new_main_sep_y.append(indexer)
-
             #print((max_end-min_start),len(peak_points),'(max_end-min_start)')
+            y_min.append(y_min_hor_some[i])
             y_mid.append(cy_hor_some[i])
             y_max.append(y_max_hor_some[i])
             x_end.append(max_end)
             x_start.append(min_start)
-            len_sep.append(max_end-min_start)
             indexer+=1
     #print(x_start,'x_start')
     #print(x_end,'x_end')
 
-    x_start_returned = np.array(x_start, dtype=int)
-    x_end_returned = np.array(x_end, dtype=int)
-    y_mid_returned = np.array(y_mid, dtype=int)
-    y_max_returned = np.array(y_max, dtype=int)
-    #print(y_mid_returned,'y_mid_returned')
-    #print(x_start_returned,'x_start_returned')
-    #print(x_end_returned,'x_end_returned')
-
-    # join/elongate separators if follow-up x and similar y
-    sep_pairs = contours_in_same_horizon(y_mid_returned)
-    if len(sep_pairs):
-        #print('burda')
-        args_to_be_unified = set()
-        y_mid_unified = []
-        y_max_unified = []
-        x_start_unified = []
-        x_end_unified = []
-        for pair in sep_pairs:
-            if (not np.array_equal(*x_start_returned[pair]) and
-                not np.array_equal(*x_end_returned[pair]) and
-                # immediately adjacent columns?
-                np.diff(x_end_returned[pair] -
-                        x_start_returned[pair])[0] in [1, -1]):
-
-                args_to_be_unified.union(set(pair))
-                y_mid_unified.append(np.min(y_mid_returned[pair]))
-                y_max_unified.append(np.max(y_max_returned[pair]))
-                x_start_unified.append(np.min(x_start_returned[pair]))
-                x_end_unified.append(np.max(x_end_returned[pair]))
-                #print(pair,'pair')
-                #print(x_start_returned[pair],'x_s_same_hor')
-                #print(x_end_returned[pair],'x_e_same_hor')
-        #print(y_mid_unified,'y_mid_unified')
-        #print(y_max_unified,'y_max_unified')
-        #print(x_start_unified,'x_s_unified')
-        #print(x_end_unified,'x_e_selected')
-        #print('#############################')
-
-        if len(y_mid_unified):
-            args_lines_not_unified = np.setdiff1d(np.arange(len(y_mid_returned)),
-                                                  list(args_to_be_unified), assume_unique=True)
-            #print(args_lines_not_unified,'args_lines_not_unified')
-            x_start_returned = np.append(x_start_returned[args_lines_not_unified],
-                                         x_start_unified, axis=0)
-            x_end_returned = np.append(x_end_returned[args_lines_not_unified],
-                                       x_end_unified, axis=0)
-            y_mid_returned = np.append(y_mid_returned[args_lines_not_unified],
-                                       y_mid_unified, axis=0)
-            y_max_returned = np.append(y_max_returned[args_lines_not_unified],
-                                        y_max_unified, axis=0)
-    #print(y_mid_returned,'y_mid_returned2')
-    #print(x_start_returned,'x_start_returned2')
-    #print(x_end_returned,'x_end_returned2')
-
-    #print(new_main_sep_y,'new_main_sep_y')
-    #print(x_start,'x_start')
-    #print(x_end,'x_end')
-    x_start = np.array(x_start)
-    x_end = np.array(x_end)
-    y_mid = np.array(y_mid)
-    if len(new_main_sep_y):
-        # some full-width multi-span separators exist, so
-        # restrict the y range of separators to search for
-        # mutual overlaps to only those within the largest
-        # y strip between adjacent multi-span separators
-        # that involve at least one such full-width seps.
-        # (does not affect the separators to be returned)
-        min_ys=np.min(y_mid)
-        max_ys=np.max(y_mid)
-        #print(min_ys,'min_ys')
-        #print(max_ys,'max_ys')
-
-        y_mains0 = list(y_mid[new_main_sep_y])
-        y_mains = [min_ys] + y_mains0 + [max_ys]
-
-        y_mains = np.sort(y_mains)
-        argm = np.argmax(np.diff(y_mains))
-        y_mid_new = y_mains[argm]
-        y_mid_next_new = y_mains[argm + 1]
-
-        #print(y_mid_new,argm,'y_mid_new')
-        #print(y_mid_next_new,argm+1,'y_mid_next_new')
-        #print(y_mid[new_main_sep_y],new_main_sep_y,'yseps')
-        x_start=np.array(x_start)
-        x_end=np.array(x_end)
-        y_mid=np.array(y_mid)
-        # iff either boundary is itself not a full-width separator,
-        # then include it in the range of separators to be kept
-        if y_mid_new in y_mains0:
-            where = y_mid > y_mid_new
-        else:
-            where = y_mid >= y_mid_new
-        if y_mid_next_new in y_mains0:
-            where &= y_mid < y_mid_next_new
-        else:
-            where &= y_mid <= y_mid_next_new
-        x_start = x_start[where]
-        x_end = x_end[where]
-        y_mid = y_mid[where]
+    x_start = np.array(x_start, dtype=int)
+    x_end = np.array(x_end, dtype=int)
+    y_min = np.array(y_min, dtype=int)
+    y_mid = np.array(y_mid, dtype=int)
+    y_max = np.array(y_max, dtype=int)
+    #print(y_mid,'y_mid')
     #print(x_start,'x_start')
     #print(x_end,'x_end')
 
-    # remove redundant separators that span the same columns
-    # (keeping only 1 representative each)
-    deleted = set()
-    for index_i in range(len(x_start) - 1):
-        nodes_i = set(range(x_start[index_i], x_end[index_i] + 1))
-        #print(nodes_i, "nodes_i")
-        for index_j in range(index_i + 1, len(x_start)):
-            nodes_j = set(range(x_start[index_j], x_end[index_j] + 1))
-            #print(nodes_j, "nodes_j")
-            if nodes_i == nodes_j:
-                deleted.add(index_j)
-    #print(deleted,"deleted")
-    remained_sep_indexes = set(range(len(x_start))) - deleted
-    #print(remained_sep_indexes,'remained_sep_indexes')
+    # remove redundant separators (with nothing in between)
+    args_emptysep = set()
+    args_ysorted = np.argsort(y_mid)
+    for i in range(len(y_mid)):
+        # find nearest neighbours above with nothing in between
+        prev = (~np.eye(len(y_mid), dtype=bool)[i] &
+                (y_mid[i] >= y_mid) &
+                # complete subsumption:
+                # (x_start[i] >= x_start) &
+                # (x_end[i] <= x_end)
+                # partial overlap
+                (x_start[i] < x_end) &
+                (x_end[i] > x_start)
+        )
+        prev[list(args_emptysep)] = False # but no pair we already saw
+        if not prev.any():
+            continue
+        prev = np.flatnonzero(prev[args_ysorted])
+        j = args_ysorted[prev[-1]]
+        if not np.any(regions_without_separators[y_max[j]: y_min[i],
+                                                 peak_points[min(x_start[i], x_start[j])]:
+                                                 peak_points[max(x_end[i], x_end[j])]]):
+            args_emptysep.add(i)
+            if x_start[j] > x_start[i]:
+                # print(j, "now starts at", x_start[i])
+                x_start[j] = x_start[i]
+            if x_end[j] < x_end[i]:
+                x_end[j] = x_end[i]
+                # print(j, "now ends at", x_end[i])
+            # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty prev sep")
+            continue
+        # find nearest neighbours below with nothing in between
+        nExt = (~np.eye(len(y_mid), dtype=bool)[i] &
+                (y_mid[i] <= y_mid) &
+                (x_start[i] >= x_start) &
+                (x_end[i] <= x_end))
+        nExt[list(args_emptysep)] = False # but no pair we already saw
+        if not nExt.any():
+            continue
+        nExt = np.flatnonzero(nExt[args_ysorted])
+        j = args_ysorted[nExt[0]]
+        if not np.any(regions_without_separators[y_max[i]: y_min[j],
+                                                 peak_points[x_start[i]]:
+                                                 peak_points[x_end[i]]]):
+            args_emptysep.add(i)
+            # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty next sep")
+    args_to_be_kept = [arg for arg in args_ysorted
+                       if not arg in args_emptysep]
+    x_start = x_start[args_to_be_kept]
+    x_end = x_end[args_to_be_kept]
+    y_min = y_min[args_to_be_kept]
+    y_mid = y_mid[args_to_be_kept]
+    y_max = y_max[args_to_be_kept]
 
-    # determine which separators span which columns
-    mother = [] # whether the respective separator has a mother separator
-    child = [] # whether the respective separator has a child separator
-    for index_i in remained_sep_indexes:
-        have_mother=0
-        have_child=0
-        nodes_i = set(range(x_start[index_i], x_end[index_i] + 1))
-        for index_j in remained_sep_indexes:
-            nodes_j = set(range(x_start[index_j], x_end[index_j] + 1))
-            if nodes_i < nodes_j:
-                have_mother=1
-            if nodes_i > nodes_j:
-                have_child=1
-        mother.append(have_mother)
-        child.append(have_child)
-    #print(mother, "mother")
-    #print(child, "child")
-
-    mother = np.array(mother)
-    child = np.array(child)
-    #print(mother,'mother')
-    #print(child,'child')
-    remained_sep_indexes = np.array(list(remained_sep_indexes))
-    #print(len(remained_sep_indexes))
-    #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_mid),'lens')
-
-    reading_order_type = 0
-    if len(remained_sep_indexes):
-        #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)')
-        #print(np.array(mother),'mother')
-        remained_sep_indexes_without_mother = remained_sep_indexes[mother==0]
-        remained_sep_indexes_with_child_without_mother = remained_sep_indexes[(mother==0) & (child==1)]
-        #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother')
-        #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother')
-
-        x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother]
-        x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother]
-        y_mid_with_child_without_mother = y_mid[remained_sep_indexes_with_child_without_mother]
-
-        x_end_without_mother = x_end[remained_sep_indexes_without_mother]
-        x_start_without_mother = x_start[remained_sep_indexes_without_mother]
-        y_mid_without_mother = y_mid[remained_sep_indexes_without_mother]
-
-        if len(remained_sep_indexes_without_mother)>=2:
-            for i in range(len(remained_sep_indexes_without_mother)-1):
-                index_i = remained_sep_indexes_without_mother[i]
-                nodes_i = set(range(x_start[index_i], x_end[index_i] + 1))
-                #print(index_i, nodes_i, "nodes_i without mother")
-                for j in range(i + 1, len(remained_sep_indexes_without_mother)):
-                    index_j = remained_sep_indexes_without_mother[j]
-                    nodes_j = set(range(x_start[index_j], x_end[index_j] + 1))
-                    #print(index_j, nodes_j, "nodes_j without mother")
-                    if nodes_i - nodes_j != nodes_i:
-                        #print("type=1")
-                        reading_order_type = 1
-    else:
-        y_mid_without_mother = np.zeros(0, int)
-        x_start_without_mother = np.zeros(0, int)
-        x_end_without_mother = np.zeros(0, int)
-        y_mid_with_child_without_mother = np.zeros(0, int)
-        x_start_with_child_without_mother = np.zeros(0, int)
-        x_end_with_child_without_mother = np.zeros(0, int)
-
-    #print(reading_order_type,'reading_order_type')
-    #print(y_mid_with_child_without_mother,'y_mid_with_child_without_mother')
-    #print(x_start_with_child_without_mother,'x_start_with_child_without_mother')
-    #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother')
-
-    len_sep_with_child = len(child[child==1])
-    #print(len_sep_with_child,'len_sep_with_child')
-    there_is_sep_with_child = 0
-    if len_sep_with_child >= 1:
-        there_is_sep_with_child = 1
-
-    return (reading_order_type,
-            x_start_returned,
-            x_end_returned,
-            y_mid_returned,
-            y_max_returned,
-            y_mid_without_mother,
-            x_start_without_mother,
-            x_end_without_mother,
-            there_is_sep_with_child,
-            y_mid_with_child_without_mother,
-            x_start_with_child_without_mother,
-            x_end_with_child_without_mother,
-            new_main_sep_y)
+    return (x_start,
+            x_end,
+            y_min,
+            y_mid,
+            y_max)
 
 def box2rect(box: Tuple[int, int, int, int]) -> Tuple[int, int, int, int]:
     return (box[1], box[1] + box[3],
@@ -1212,6 +1055,25 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col)
     return textlines_con_changed
 
 def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
+    """
+    Order text region contours within a single column bbox in a top-down-left-right way.
+
+    First, determine the vertical gaps. Then iterate over each vertical segment,
+    identifying the contours centered in that segment. Order them by their
+    horizontal center, and add them to the overall order.
+
+    Arguments:
+      * textline_mask: the mask of the textline segmentation, cropped for that box
+      * contours_main: the paragraph text region contours expected to be here
+      * contours_head: the heading text region contours expected to be here
+      * y_ref: the vertical offset of that box within the page
+      * x_ref: the horizontal offset of that box within the page
+
+    Returns: a tuple of
+      * the array of contour indexes overall within this box (i.e. into main+head)
+      * the array of types (1 for paragraph, 2 for heading)
+      * the array of contour indexes for the respective type (i.e. into contours_main or contours_head)
+    """
     ##plt.imshow(textline_mask)
     ##plt.show()
     y = textline_mask.sum(axis=1) # horizontal projection profile
@@ -1547,7 +1409,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables,
         try:
             num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot],
                                                   num_col_classifier, tables, multiplier=7.0)
-            #print("big part %d:%d has %d columns" % (top, bot, num_col), peaks_neg_fin)
+            # print("big part %d:%d has %d columns" % (top, bot, num_col + 1), peaks_neg_fin)
         except:
             num_col = 0
             peaks_neg_fin = []
@@ -1564,11 +1426,36 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables,
     return num_col_fin, peaks_neg_fin_fin, matrix_of_seps_ch, splitter_y_new, separators_closeup_n
 
 def return_boxes_of_images_by_order_of_reading_new(
-        splitter_y_new, regions_without_separators,
-        matrix_of_lines_ch,
+        splitter_y_new,
+        regions_without_separators,
+        regions_with_separators,
+        matrix_of_seps_ch,
         num_col_classifier, erosion_hurts, tables,
         right2left_readingorder,
         logger=None):
+    """
+    Iterate through the vertical parts of a page, each with its own set of columns,
+    and from the matrix of horizontal separators for that part, find an ordered
+    list of bounding boxes through all columns and regions.
+
+    Arguments:
+       * splitter_y_new: the y coordinates separating the parts
+       * regions_without_separators: (text) region mask with separators suppressed;
+             (needed to find per-part columns and to combine separators if possible)
+       * regions_with_separators: (full) region map with separators included;
+             (needed to elongate separators if possible)
+       * matrix_of_seps_ch: type and coordinates of horizontal and vertical separators,
+             as well as headings
+       * num_col_classifier: predicted number of columns for the entire page
+       * erosion_hurts: bool
+       * tables: bool
+       * right2left_readingorder: whether to invert the default left-to-right order
+
+    Returns: a tuple of
+       * the ordered list of bounding boxes
+       * a list of arrays: the x coordinates delimiting the columns for every page part
+             (according to splitter)
+    """
 
     if right2left_readingorder:
         regions_without_separators = cv2.flip(regions_without_separators,1)
@@ -1576,12 +1463,20 @@ def return_boxes_of_images_by_order_of_reading_new(
         logger = getLogger(__package__)
     logger.debug('enter return_boxes_of_images_by_order_of_reading_new')
 
+    # def dbg_imshow(box, title):
+    #     xmin, xmax, ymin, ymax = box
+    #     plt.imshow(regions_with_separators) #, extent=[0, width_tot, bot, top])
+    #     plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
+    #                                           fill=False, linewidth=1, edgecolor='r'))
+    #     plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax))
+    #     plt.show()
     # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False):
     #     minx, maxx, miny, maxy = box or (0, None, 0, None)
     #     img = regions_without_separators[miny:maxy, minx:maxx]
     #     plt.imshow(img)
-    #     xrange = np.arange(0, img.shape[1], 100)
-    #     yrange = np.arange(0, img.shape[0], 100)
+    #     step = max(img.shape) // 10
+    #     xrange = np.arange(0, img.shape[1], step)
+    #     yrange = np.arange(0, img.shape[0], step)
     #     ax = plt.gca()
     #     ax.set_xticks(xrange)
     #     ax.set_yticks(yrange)
@@ -1597,7 +1492,7 @@ def return_boxes_of_images_by_order_of_reading_new(
     #             ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
     #                                            fill=False, linewidth=1, edgecolor='r'))
     #             if rectangles_showidx:
-    #                 ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i + 1), c='r')
+    #                 ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i), c='r')
     #     plt.show()
     # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new")
 
@@ -1606,11 +1501,12 @@ def return_boxes_of_images_by_order_of_reading_new(
     splitter_y_new = np.array(splitter_y_new, dtype=int)
     height_tot, width_tot = regions_without_separators.shape
     big_part = 22 * height_tot // 100 # percent height
+    _, ccomps, cstats, _ = cv2.connectedComponentsWithStats(regions_without_separators.astype(np.uint8))
     for top, bot in pairwise(splitter_y_new):
         # print("%d:%d" % (top, bot), 'i')
         # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot))
-        matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) &
-                                        (matrix_of_lines_ch[:,7] < bot)]
+        matrix_new = matrix_of_seps_ch[(matrix_of_seps_ch[:,6] >= top) &
+                                       (matrix_of_seps_ch[:,7] < bot)]
         #print(len( matrix_new[:,9][matrix_new[:,9]==1] ))
         #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa')
         # check to see is there any vertical separator to find holes.
@@ -1698,19 +1594,9 @@ def return_boxes_of_images_by_order_of_reading_new(
 
         # elongate horizontal separators+headings as much as possible without overlap
         args_nonver = matrix_new[:, 9] != 1
-        regions_with_separators = np.copy(regions_without_separators[top:bot])
-        for xmin, xmax, ymin, ymax in matrix_new[:, [2, 3, 6, 7]]:
-            regions_with_separators[ymin - top: ymax - top, xmin: xmax] = 6
-        # def dbg_imshow(box, title):
-        #     xmin, xmax, ymin, ymax = box
-        #     plt.imshow(regions_with_separators, extent=[0, width_tot, bot, top])
-        #     plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
-        #                                           fill=False, linewidth=1, edgecolor='r'))
-        #     plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax))
-        #     plt.show()
         for i in np.flatnonzero(args_nonver):
             xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]]
-            cut = regions_with_separators[ymin - top: ymax - top]
+            cut = regions_with_separators[ymin: ymax]
             # dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal"))
             starting = xmin - peaks_neg_tot
             min_start = np.flatnonzero(starting >= 0)[-1] # last left-of
@@ -1737,6 +1623,7 @@ def return_boxes_of_images_by_order_of_reading_new(
         args_hor = matrix_new[:, 9] == 0
         x_min_hor_some = matrix_new[:, 2][args_hor]
         x_max_hor_some = matrix_new[:, 3][args_hor]
+        y_min_hor_some = matrix_new[:, 6][args_hor]
         y_max_hor_some = matrix_new[:, 7][args_hor]
         cy_hor_some = matrix_new[:, 5][args_hor]
 
@@ -1752,412 +1639,144 @@ def return_boxes_of_images_by_order_of_reading_new(
         # (x +/- 30px to avoid crossing col peaks by accident)
         x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2))
         x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2))
+        y_min_hor_some = np.append(y_min_hor_some, # toplines
+                                   np.concatenate((y_min_hor_head - 2,
+                                                   y_max_hor_head - 0)))
         y_max_hor_some = np.append(y_max_hor_some, # baselines
-                                   np.concatenate((y_min_hor_head + 2,
+                                   np.concatenate((y_min_hor_head + 0,
                                                    y_max_hor_head + 2)))
-        cy_hor_some = np.append(cy_hor_some, # toplines
-                                np.concatenate((y_min_hor_head - 2,
-                                                y_max_hor_head - 2)))
+        cy_hor_some = np.append(cy_hor_some, # centerlines
+                                np.concatenate((y_min_hor_head - 1,
+                                                y_max_hor_head + 1)))
+
+        # analyse connected components of regions to gain additional separators
+        # and prepare a map for cross-column boxes
+        ccounts = np.bincount(ccomps[top: bot].flatten())
+        col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(),
+                                            minlength=ccounts.size)
+                                for left, right in pairwise(peaks_neg_tot)])
+        labelcolmap = dict()
+        for label, label_count in enumerate(ccounts):
+            if not label:
+                continue
+            label_left, label_top, label_width, label_height, label_area = cstats[label]
+            # if label_count < 0.9 * label_area:
+            #     # mostly not in this part of the page
+            #     continue
+            if label_count < 0.01 * (bot - top) * width_tot:
+                continue  # covers less than 1% of this page part (rows top:bot)
+            #assert np.sum(col_ccounts[:, label]) == label_count
+            label_right = label_left + label_width
+            label_bot = label_top + label_height
+            label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1
+            label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0]
+            # store as dict for multi-column boxes (one entry per column the label covers):
+            for start in range(label_start, label_end):
+                labelcolmap.setdefault(start, list()).append(
+                    (label_end, label_top, label_bot, sum(col_ccounts[start: label_end, label])))
+            # make additional separators (only for labels spanning at least 2 columns):
+            if label_end - label_start < 2:
+                continue
+            if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2:
+                continue  # fewer than 2 columns hold more than 10% of the label's pixels
+            x_min_hor_some = np.append(x_min_hor_some, [label_left] * 2)
+            x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2)
+            y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot])
+            y_max_hor_some = np.append(y_max_hor_some, [label_top, label_bot + 2])
+            cy_hor_some = np.append(cy_hor_some, [label_top - 1, label_bot + 1])
 
         if right2left_readingorder:
             x_max_hor_some = width_tot - x_min_hor_some
             x_min_hor_some = width_tot - x_max_hor_some
 
-
-        reading_order_type, x_starting, x_ending, y_mid, y_max, \
-            y_mid_without_mother, x_start_without_mother, x_end_without_mother, \
-            there_is_sep_with_child, \
-            y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \
-            new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order(
-                peaks_neg_tot, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some)
-
-        # show multi-column separators
-        # dbg_plt([0, None, top, bot], "multi-column separators in current split", 
+        x_starting, x_ending, y_min, y_mid, y_max = return_multicol_separators_x_start_end(
+            regions_without_separators, peaks_neg_tot, top, bot,
+            x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some)
+        # dbg_plt([0, None, top, bot], "non-empty multi-column separators in current split", 
         #         list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending],
-        #                  y_mid - top, y_max - top)), True)
+        #                  y_min - top, y_max - top)), True)
 
-        if (reading_order_type == 1 or
-            len(y_mid_without_mother) >= 2 or
-            there_is_sep_with_child == 1):
-            # there are top-level multi-colspan horizontal separators which overlap each other
-            # or multiple top-level multi-colspan horizontal separators
-            # or multi-colspan horizontal separators shorter than their respective top-level:
-            # todo: explain how this is dealt with
-            try:
-                y_grenze = top + 300
-                up = (y_mid > top) & (y_mid <= y_grenze)
-
-                args_early_ys=np.arange(len(y_mid))
-                #print(args_early_ys,'args_early_ys')
-                #print(y_mid,'y_mid')
-
-                x_starting_up = x_starting[up]
-                x_ending_up = x_ending[up]
-                y_mid_up = y_mid[up]
-                y_max_up = y_max[up]
-                args_up = args_early_ys[up]
-                #print(args_up,'args_up')
-                #print(y_mid_up,'y_mid_up')
-                #check if there is a big separator in this y_mains0
-                if len(y_mid_up) > 0:
-                    # is there a separator with full-width span?
-                    main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1)
-                    y_mid_main_separator_up = y_mid_up[main_separator]
-                    y_max_main_separator_up = y_max_up[main_separator]
-                    args_main_to_deleted = args_up[main_separator]
-                    #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm')
-                    if len(y_max_main_separator_up):
-                        args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) ))
-                        #print(args_to_be_kept,'args_to_be_kept')
-                        boxes.append([0, peaks_neg_tot[-1],
-                                      top, y_max_main_separator_up.max()])
-                        # dbg_plt(boxes[-1], "near top main separator box")
-                        top = y_max_main_separator_up.max()
-
-                        #print(top,'top')
-                        y_mid = y_mid[args_to_be_kept]
-                        x_starting = x_starting[args_to_be_kept]
-                        x_ending = x_ending[args_to_be_kept]
-                        y_max = y_max[args_to_be_kept]
-
-                        #print('galdiha')
-                        y_grenze = top + 200
-                        up = (y_mid > top) & (y_mid <= y_grenze)
-                        args_early_ys2 = np.arange(len(y_mid))
-                        x_starting_up = x_starting[up]
-                        x_ending_up = x_ending[up]
-                        y_mid_up = y_mid[up]
-                        y_max_up = y_max[up]
-                        args_up2 = args_early_ys2[up]
-                        #print(y_mid_up,x_starting_up,x_ending_up,'didid')
-                    else:
-                        args_early_ys2 = args_early_ys
-                        args_up2 = args_up
-
-                    nodes_in = set()
-                    for ij in range(len(x_starting_up)):
-                        nodes_in.update(range(x_starting_up[ij],
-                                              x_ending_up[ij]))
-                    #print(nodes_in,'nodes_in')
-                    #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))')
-
-                    if nodes_in == set(range(len(peaks_neg_tot)-1)):
-                        pass
-                    elif nodes_in == set(range(1, len(peaks_neg_tot)-1)):
-                        pass
-                    else:
-                        #print('burdaydikh')
-                        args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) ))
-
-                        if len(args_to_be_kept2):
-                            #print(args_to_be_kept2, "args_to_be_kept2")
-                            y_mid = y_mid[args_to_be_kept2]
-                            x_starting = x_starting[args_to_be_kept2]
-                            x_ending = x_ending[args_to_be_kept2]
-                            y_max = y_max[args_to_be_kept2]
-
-                #int(top)
-                # order multi-column separators
-                y_mid_by_order=[]
-                x_start_by_order=[]
-                x_end_by_order=[]
-                if (reading_order_type == 1 or
-                    len(x_end_with_child_without_mother) == 0):
-                    if reading_order_type == 1:
-                        # there are top-level multi-colspan horizontal separators which overlap each other
-                        #print("adding all columns at top because of multiple overlapping mothers")
-                        y_mid_by_order.append(top)
-                        x_start_by_order.append(0)
-                        x_end_by_order.append(len(peaks_neg_tot)-2)
-                    else:
-                        # there are no top-level multi-colspan horizontal separators which themselves
-                        # contain shorter multi-colspan separators
-                        #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo')
-                        columns_covered_by_mothers = set()
-                        for dj in range(len(x_start_without_mother)):
-                            columns_covered_by_mothers.update(
-                                range(x_start_without_mother[dj],
-                                      x_end_without_mother[dj]))
-                        columns_not_covered = list(all_columns - columns_covered_by_mothers)
-                        #print(columns_covered_by_mothers, "columns_covered_by_mothers")
-                        #print(columns_not_covered, "columns_not_covered")
-                        y_mid = np.append(y_mid, np.ones(len(columns_not_covered) +
-                                                         len(x_start_without_mother),
-                                                         dtype=int) * top)
-                        ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered))
-                        ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
-                        x_starting = np.append(x_starting, np.array(columns_not_covered, int))
-                        x_starting = np.append(x_starting, x_start_without_mother)
-                        x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1)
-                        x_ending = np.append(x_ending, x_end_without_mother)
-
-                    ind_args=np.arange(len(y_mid))
-                    #print(ind_args,'ind_args')
-                    for column in range(len(peaks_neg_tot)-1):
-                        #print(column,'column')
-                        ind_args_in_col=ind_args[x_starting==column]
-                        #print('babali2')
-                        #print(ind_args_in_col,'ind_args_in_col')
-                        #print(len(y_mid))
-                        y_mid_column=y_mid[ind_args_in_col]
-                        x_start_column=x_starting[ind_args_in_col]
-                        x_end_column=x_ending[ind_args_in_col]
-                        #print('babali3')
-                        ind_args_col_sorted=np.argsort(y_mid_column)
-                        y_mid_by_order.extend(y_mid_column[ind_args_col_sorted])
-                        x_start_by_order.extend(x_start_column[ind_args_col_sorted])
-                        x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
-                else:
-                    #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo')
-                    columns_covered_by_mothers = set()
-                    for dj in range(len(x_start_without_mother)):
-                        columns_covered_by_mothers.update(
-                            range(x_start_without_mother[dj],
-                                  x_end_without_mother[dj]))
-                    columns_not_covered = list(all_columns - columns_covered_by_mothers)
-                    #print(columns_covered_by_mothers, "columns_covered_by_mothers")
-                    #print(columns_not_covered, "columns_not_covered")
-                    y_mid = np.append(y_mid, np.ones(len(columns_not_covered) +
-                                                     len(x_start_without_mother),
-                                                     dtype=int) * top)
-                    ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered))
-                    ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
-                    x_starting = np.append(x_starting, np.array(columns_not_covered, int))
-                    x_starting = np.append(x_starting, x_start_without_mother)
-                    x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1)
-                    x_ending = np.append(x_ending, x_end_without_mother)
-
-                    columns_covered_by_mothers_with_child = set()
-                    for dj in range(len(x_end_with_child_without_mother)):
-                        columns_covered_by_mothers_with_child.update(
-                            range(x_start_with_child_without_mother[dj],
-                                  x_end_with_child_without_mother[dj]))
-                    #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child")
-                    columns_not_covered_by_mothers_with_child = list(
-                        all_columns - columns_covered_by_mothers_with_child)
-                    #indexes_to_be_spanned=[]
-                    for i_s in range(len(x_end_with_child_without_mother)):
-                        columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s])
-                    columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child)
-                    #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child")
-                    ind_args = np.arange(len(y_mid))
-                    for i_s_nc in columns_not_covered_by_mothers_with_child:
-                        if i_s_nc in x_start_with_child_without_mother:
-                            # use only seps with mother's span ("biggest")
-                            #print("i_s_nc", i_s_nc)
-                            x_end_biggest_column = \
-                                x_end_with_child_without_mother[
-                                    x_start_with_child_without_mother == i_s_nc][0]
-                            args_all_biggest_seps = \
-                                ind_args[(x_starting == i_s_nc) &
-                                         (x_ending == x_end_biggest_column)]
-                            y_mid_column_nc = y_mid[args_all_biggest_seps]
-                            #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child")
-                            #x_start_column_nc = x_starting[args_all_biggest_seps]
-                            #x_end_column_nc = x_ending[args_all_biggest_seps]
-                            y_mid_column_nc = np.sort(y_mid_column_nc)
-                            #print(y_mid_column_nc, "y_mid_column_nc (sorted)")
-                            for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)):
-                                #print("i_c", i_c)
-                                #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc")
-                                ind_all_seps_between_nm_wc = \
-                                    ind_args[(y_mid > nc_top) &
-                                             (y_mid < nc_bot) &
-                                             (x_starting >= i_s_nc) &
-                                             (x_ending <= x_end_biggest_column)]
-                                y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc]
-                                x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc]
-                                x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc]
-
-                                columns_covered_by_mothers = set()
-                                for dj in range(len(ind_all_seps_between_nm_wc)):
-                                    columns_covered_by_mothers.update(
-                                        range(x_starting_all_between_nm_wc[dj],
-                                              x_ending_all_between_nm_wc[dj]))
-                                #print(columns_covered_by_mothers, "columns_covered_by_mothers")
-                                child_columns = set(range(i_s_nc, x_end_biggest_column))
-                                columns_not_covered = list(child_columns - columns_covered_by_mothers)
-                                #print(child_columns, "child_columns")
-                                #print(columns_not_covered, "columns_not_covered")
-
-                                if len(ind_all_seps_between_nm_wc):
-                                    biggest = np.argmax(x_ending_all_between_nm_wc -
-                                                        x_starting_all_between_nm_wc)
-                                    #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc")
-                                    #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest],
-                                                              x_ending_all_between_nm_wc[biggest]), "biggest")
-                                    if columns_covered_by_mothers == set(
-                                            range(x_starting_all_between_nm_wc[biggest],
-                                                  x_ending_all_between_nm_wc[biggest])):
-                                        # single biggest accounts for all covered columns alone,
-                                        # this separator should be extended to cover all
-                                        seps_too_close_to_top_separator = \
-                                            ((y_mid_all_between_nm_wc > nc_top) &
-                                             (y_mid_all_between_nm_wc <= nc_top + 500))
-                                        if (np.count_nonzero(seps_too_close_to_top_separator) and
-                                            np.count_nonzero(seps_too_close_to_top_separator) <
-                                            len(ind_all_seps_between_nm_wc)):
-                                            #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator")
-                                            y_mid_all_between_nm_wc = \
-                                                y_mid_all_between_nm_wc[~seps_too_close_to_top_separator]
-                                            x_starting_all_between_nm_wc = \
-                                                x_starting_all_between_nm_wc[~seps_too_close_to_top_separator]
-                                            x_ending_all_between_nm_wc = \
-                                                x_ending_all_between_nm_wc[~seps_too_close_to_top_separator]
-
-                                        y_mid_all_between_nm_wc = np.append(
-                                            y_mid_all_between_nm_wc, nc_top)
-                                        x_starting_all_between_nm_wc = np.append(
-                                            x_starting_all_between_nm_wc, i_s_nc)
-                                        x_ending_all_between_nm_wc = np.append(
-                                            x_ending_all_between_nm_wc, x_end_biggest_column)
-                                    else:
-                                        y_mid_all_between_nm_wc = np.append(
-                                            y_mid_all_between_nm_wc, nc_top)
-                                        x_starting_all_between_nm_wc = np.append(
-                                            x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest])
-                                        x_ending_all_between_nm_wc = np.append(
-                                            x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest])
-
-                                if len(columns_not_covered):
-                                    y_mid_all_between_nm_wc = np.append(
-                                        y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered))
-                                    x_starting_all_between_nm_wc = np.append(
-                                        x_starting_all_between_nm_wc, np.array(columns_not_covered, int))
-                                    x_ending_all_between_nm_wc = np.append(
-                                        x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1)
-
-                                ind_args_between=np.arange(len(x_ending_all_between_nm_wc))
-                                for column in range(int(i_s_nc), int(x_end_biggest_column)):
-                                    ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column]
-                                    #print('babali2')
-                                    #print(ind_args_in_col,'ind_args_in_col')
-                                    #print(len(y_mid))
-                                    y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col]
-                                    x_start_column=x_starting_all_between_nm_wc[ind_args_in_col]
-                                    x_end_column=x_ending_all_between_nm_wc[ind_args_in_col]
-                                    #print('babali3')
-                                    ind_args_col_sorted=np.argsort(y_mid_column)
-                                    y_mid_by_order.extend(y_mid_column[ind_args_col_sorted])
-                                    x_start_by_order.extend(x_start_column[ind_args_col_sorted])
-                                    x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
-                        else:
-                            #print(i_s_nc,'column not covered by mothers with child')
-                            ind_args_in_col=ind_args[x_starting==i_s_nc]
-                            #print('babali2')
-                            #print(ind_args_in_col,'ind_args_in_col')
-                            #print(len(y_mid))
-                            y_mid_column=y_mid[ind_args_in_col]
-                            x_start_column=x_starting[ind_args_in_col]
-                            x_end_column=x_ending[ind_args_in_col]
-                            #print('babali3')
-                            ind_args_col_sorted = np.argsort(y_mid_column)
-                            y_mid_by_order.extend(y_mid_column[ind_args_col_sorted])
-                            x_start_by_order.extend(x_start_column[ind_args_col_sorted])
-                            x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
-
-                # create single-column boxes from multi-column separators
-                y_mid_by_order = np.array(y_mid_by_order)
-                x_start_by_order = np.array(x_start_by_order)
-                x_end_by_order = np.array(x_end_by_order)
-                for il in range(len(y_mid_by_order)):
-                    #print(il, "il")
-                    y_mid_itself = y_mid_by_order[il]
-                    x_start_itself = x_start_by_order[il]
-                    x_end_itself = x_end_by_order[il]
-                    for column in range(int(x_start_itself), int(x_end_itself)+1):
-                        #print(column,'cols')
-                        #print('burda')
-                        #print('burda2')
-                        y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) &
-                                                    (column >= x_start_by_order) &
-                                                    (column <= x_end_by_order)]
-                        y_mid_next = y_mid_next.min(initial=bot)
-                        #print(y_mid_next,'y_mid_next')
-                        #print(y_mid_itself,'y_mid_itself')
+        # core algorithm:
+        # 1. iterate through multi-column separators, pre-ordered by their y coord
+        # 2. for each separator, iterate from its starting to its ending column
+        # 3. in each starting column, determine the next downwards separator,
+        # 4. if there is none, then fill up the column to the bottom;
+        #    otherwise, fill up to that next separator
+        # 5. moreover, determine the next rightward column that would not cut through
+        #     any regions, advancing to that column, and storing a new in-order bbox
+        #     for that down/right span
+        # 6. if there was a next separator, and it ends no further than the current one,
+        #    then recurse on that separator from step 1, then continue (with the next
+        #    column for the current separator) at step 2, or (with the next separator
+        #    in order) at step 1
+        args = list(range(len(y_mid)))
+        while len(args):
+            cur = args[0]
+            args = args[1:]
+            # print("iter", cur, y_mid[cur], "%d:%d" % (x_starting[cur], x_ending[cur]))
+            def get_span(start, y_top, y_bot):
+                # for last, l_top, l_bot, l_count in labelcolmap.get(start, []):
+                #     if y_top < l_bot and y_bot > l_top and last > start + 1:
+                #         width = (peaks_neg_tot[last] - peaks_neg_tot[start])
+                #         print("span", start, last, l_top, l_bot, l_count,
+                #               "box area", (y_bot - y_top) * width,
+                #               "label area", (min(y_bot, l_bot) - max(y_top, l_top)) * width,
+                #               "box height", (y_bot - y_top),
+                #               "label height", sum(regions_without_separators[
+                #                   y_top: y_bot, peaks_neg_tot[start + 1]]))
+                return min((last for last, l_top, l_bot, l_count in labelcolmap.get(start, [])
+                            # candidates: end columns of regions covering this column which overlap
+                            # the vertical range y_top:y_bot (min() picks the nearest such end)
+                            if y_top < l_bot and y_bot > l_top
+                            # Ignore if it ends here, anyway
+                            and last > start + 1
+                            # Ensure this is not just a tiny region near larger regions
+                            and l_count > 0.1 * max(l_count2 for _, l_top2, l_bot2, l_count2 in labelcolmap[start]
+                                                    if y_top < l_bot2 and y_bot > l_top2)
+                            # or just a small cut of the respective region
+                            # (i.e. box should cover at least 10% of the label).
+                            and ((min(y_bot, l_bot) - max(y_top, l_top)) *
+                                 (peaks_neg_tot[last] - peaks_neg_tot[start])) > 0.1 * l_count
+                            # But do allow cutting tiny passages with less than 10% of height
+                            # (i.e. label is already almost separated by columns)
+                            and sum(regions_without_separators[
+                                y_top: y_bot, peaks_neg_tot[start + 1]]) > 0.1 * (y_bot - y_top)),
+                           # Otherwise advance only 1 column.
+                           default=start + 1)
+            def add_sep(cur):
+                column = x_starting[cur]
+                while column < x_ending[cur]:
+                    nxt = np.flatnonzero((y_mid[cur] < y_mid) &
+                                         (column >= x_starting) &
+                                         (column < x_ending))
+                    if len(nxt):
+                        nxt = nxt[0]
+                        # print("column", column)
+                        last = get_span(column, y_max[cur], y_min[nxt])
+                        last = min(last, x_ending[nxt], x_ending[cur])
+                        # print("nxt", nxt, y_mid[nxt], "%d:%d" % (column, last))
                         boxes.append([peaks_neg_tot[column],
-                                      peaks_neg_tot[column+1],
-                                      y_mid_itself,
-                                      y_mid_next])
-                        # dbg_plt(boxes[-1], "A column %d box" % (column + 1))
-            except:
-                logger.exception("cannot assign boxes")
-                boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1],
-                              top, bot])
-                # dbg_plt(boxes[-1], "fallback box")
-        else:
-            # order multi-column separators
-            y_mid_by_order=[]
-            x_start_by_order=[]
-            x_end_by_order=[]
-            if len(x_starting)>0:
-                columns_covered_by_seps_covered_more_than_2col = set()
-                for dj in range(len(x_starting)):
-                    if set(range(x_starting[dj], x_ending[dj])) != all_columns:
-                        columns_covered_by_seps_covered_more_than_2col.update(
-                            range(x_starting[dj], x_ending[dj]))
-                columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col)
-
-                y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1,
-                                                 dtype=int) * top)
-                ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered))
-                ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
-                x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype))
-                x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1)
-                if len(new_main_sep_y) > 0:
-                    x_starting = np.append(x_starting, 0)
-                    x_ending = np.append(x_ending, len(peaks_neg_tot) - 1)
-                else:
-                    x_starting = np.append(x_starting, x_starting[0])
-                    x_ending = np.append(x_ending, x_ending[0])
-            else:
-                columns_not_covered = list(all_columns)
-                y_mid = np.append(y_mid, np.ones(len(columns_not_covered),
-                                                 dtype=int) * top)
-                ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered))
-                ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
-                x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype))
-                x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1)
-
-            ind_args = np.arange(len(y_mid))
-
-            for column in range(len(peaks_neg_tot)-1):
-                #print(column,'column')
-                ind_args_in_col=ind_args[x_starting==column]
-                #print(len(y_mid))
-                y_mid_column=y_mid[ind_args_in_col]
-                x_start_column=x_starting[ind_args_in_col]
-                x_end_column=x_ending[ind_args_in_col]
-
-                ind_args_col_sorted = np.argsort(y_mid_column)
-                y_mid_by_order.extend(y_mid_column[ind_args_col_sorted])
-                x_start_by_order.extend(x_start_column[ind_args_col_sorted])
-                x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
-
-            # create single-column boxes from multi-column separators
-            y_mid_by_order = np.array(y_mid_by_order)
-            x_start_by_order = np.array(x_start_by_order)
-            x_end_by_order = np.array(x_end_by_order)
-            for il in range(len(y_mid_by_order)):
-                #print(il, "il")
-                y_mid_itself = y_mid_by_order[il]
-                #print(y_mid_itself,'y_mid_itself')
-                x_start_itself = x_start_by_order[il]
-                x_end_itself = x_end_by_order[il]
-                for column in range(x_start_itself, x_end_itself+1):
-                    #print(column,'cols')
-                    #print('burda2')
-                    y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) &
-                                                (column >= x_start_by_order) &
-                                                (column <= x_end_by_order)]
-                    #print(y_mid_next,'y_mid_next')
-                    y_mid_next = y_mid_next.min(initial=bot)
-                    #print(y_mid_next,'y_mid_next')
-                    boxes.append([peaks_neg_tot[column],
-                                  peaks_neg_tot[column+1],
-                                  y_mid_itself,
-                                  y_mid_next])
-                    # dbg_plt(boxes[-1], "B column %d box" % (column + 1))
+                                      peaks_neg_tot[last],
+                                      y_mid[cur],
+                                      y_mid[nxt]])
+                        # dbg_plt(boxes[-1], "recursive column %d:%d box [%d]" % (column, last, len(boxes)))
+                        column = last
+                        if last == x_ending[nxt] and x_ending[nxt] <= x_ending[cur] and nxt in args:
+                            # child – recur
+                            # print("recur", nxt, y_mid[nxt], "%d:%d" % (x_starting[nxt], x_ending[nxt]))
+                            args.remove(nxt)
+                            add_sep(nxt)
+                    else:
+                        # print("column", column)
+                        last = get_span(column, y_max[cur], bot)
+                        # print("bot", bot, "%d:%d" % (column, last))
+                        boxes.append([peaks_neg_tot[column],
+                                      peaks_neg_tot[last],
+                                      y_mid[cur],
+                                      bot])
+                        # dbg_plt(boxes[-1], "non-recursive column %d box [%d]" % (column, len(boxes)))
+                        column = last
+            add_sep(cur)
 
     if right2left_readingorder:
         peaks_neg_tot_tables_new = []