From 6fbb5f8a12185192f7d9db7b008c3ef8b5f24d33 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Fri, 24 Oct 2025 02:02:39 +0200
Subject: [PATCH] `return_boxes_of_images_by_order_of_reading_new`: simplify
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- array instead of list operations
- add better plotting (but commented out)
- add more debug printing (but commented out)
- add more inline comments for documentation
- rename identifiers to make more readable:
  - `cy_hor_diff` → `y_max_hor_some` (because the ymax gets passed)
  - `lines` → `seps`
  - `y_type_2` → `y_mid`
  - `y_diff_type_2` → `y_max`
  - `y_lines_by_order` → `y_mid_by_order`
  - `y_lines_without_mother` → `y_mid_without_mother`
  - `y_lines_with_child_without_mother` → `y_mid_with_child_without_mother`
  - `y_column` → `y_mid_column`
  - `y_column_nc` → `y_mid_column_nc`
  - `y_all_between_nm_wc` → `y_mid_all_between_nm_wc`
  - `lines_so_close_to_top_separator` → `seps_too_close_to_top_separator`
  - `y_in_cols` and `y_down` → `y_mid_next`
- iterate over `pairwise()` intervals `nc_top:nc_bot` instead of `i_c` indexing
---
 src/eynollah/utils/__init__.py | 480 +++++++++++++++++----------------
 1 file changed, 247 insertions(+), 233 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index 4046396..eca96f3 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -1599,19 +1599,31 @@ def return_boxes_of_images_by_order_of_reading_new(
     if logger is None:
         logger = getLogger(__package__)
     logger.debug('enter return_boxes_of_images_by_order_of_reading_new')
-    # def dbg_plt(box=None, title=None):
-    #     if box is None:
-    #         box = [None, None, None, None]
-    #     img = regions_without_separators[box[2]:box[3], box[0]:box[1]]
+
+    # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False):
+    #     minx, maxx, miny, maxy = box or (0, None, 0, None)
+    #     img = regions_without_separators[miny:maxy, minx:maxx]
     #     plt.imshow(img)
     #     xrange = np.arange(0, img.shape[1], 100)
     #     yrange = np.arange(0, img.shape[0], 100)
-    #     plt.gca().set_xticks(xrange, xrange + (box[0] or 0))
-    #     plt.gca().set_yticks(yrange, yrange + (box[2] or 0))
+    #     ax = plt.gca()
+    #     ax.set_xticks(xrange)
+    #     ax.set_yticks(yrange)
+    #     ax.set_xticklabels(xrange + minx)
+    #     ax.set_yticklabels(yrange + miny)
+    #     def format_coord(x, y):
+    #         return 'x={:g}, y={:g}'.format(x + minx, y + miny)
+    #     ax.format_coord = format_coord
     #     if title:
     #         plt.title(title)
+    #     if rectangles:
+    #         for i, (xmin, xmax, ymin, ymax) in enumerate(rectangles):
+    #             ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
+    #                                            fill=False, linewidth=1, edgecolor='r'))
+    #             if rectangles_showidx:
+    #                 ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i + 1), c='r')
     #     plt.show()
-    # dbg_plt()
+    # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new")
 
     boxes=[]
     peaks_neg_tot_tables = []
@@ -1619,9 +1631,7 @@ def return_boxes_of_images_by_order_of_reading_new(
     width_tot = regions_without_separators.shape[1]
     for top, bot in pairwise(splitter_y_new):
         # print("%d:%d" % (top, bot), 'i')
-        # dbg_plt([None, None, top, bot],
-        #         "image cut for y split %d:%d" % (
-        #             top, bot))
+        # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot))
         matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) &
                                         (matrix_of_lines_ch[:,7] < bot)]
         #print(len( matrix_new[:,9][matrix_new[:,9]==1] ))
@@ -1677,20 +1687,21 @@ def return_boxes_of_images_by_order_of_reading_new(
                             peaks_neg_fin = peaks_neg_fin1
                         else:
                             peaks_neg_fin = peaks_neg_fin2
+                        # add offset to local result
                         peaks_neg_fin = list(np.array(peaks_neg_fin) + left)
                         #print(peaks_neg_fin,'peaks_neg_fin')
 
-                        if right < peaks_neg_fin_early[-1]:
-                            peaks_neg_fin_rev.append(right)
                         peaks_neg_fin_rev.extend(peaks_neg_fin)
+                        if right < peaks_neg_fin_early[-1]:
+                            # all but the last column: interject the preexisting boundary
+                            peaks_neg_fin_rev.append(right)
+                        #print(peaks_neg_fin_rev,'peaks_neg_fin_rev')
 
-                    if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org):
-                        peaks_neg_fin=list(np.sort(peaks_neg_fin_rev))
-                        num_col=len(peaks_neg_fin)
+                    if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org):
+                        peaks_neg_fin = peaks_neg_fin_rev
                     else:
-                        peaks_neg_fin=list(np.copy(peaks_neg_fin_org))
-                        num_col=len(peaks_neg_fin)
-
+                        peaks_neg_fin = peaks_neg_fin_org
+                    num_col = len(peaks_neg_fin)
                     #print(peaks_neg_fin,'peaks_neg_fin')
             except:
                 logger.exception("cannot find peaks consistent with columns")
@@ -1700,7 +1711,7 @@ def return_boxes_of_images_by_order_of_reading_new(
             x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ]
             x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ]
             cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ]
-            cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ]
+            y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ]
 
             if right2left_readingorder:
                 x_max_hor_some_new = width_tot - x_min_hor_some
@@ -1708,136 +1719,121 @@ def return_boxes_of_images_by_order_of_reading_new(
                 x_min_hor_some =list(np.copy(x_min_hor_some_new))
                 x_max_hor_some =list(np.copy(x_max_hor_some_new))
 
-            peaks_neg_tot = [0] + peaks_neg_fin + [width_tot]
+            peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot])
+            #print(peaks_neg_tot,'peaks_neg_tot')
             peaks_neg_tot_tables.append(peaks_neg_tot)
 
-            reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \
-                y_lines_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, \
-                y_lines_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \
-                new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order(
-                    x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff)
-
             all_columns = set(range(len(peaks_neg_tot) - 1))
-            # print("all_columns", all_columns)
+            #print("all_columns", all_columns)
+
+            reading_order_type, x_starting, x_ending, y_mid, y_max, \
+                y_mid_without_mother, x_start_without_mother, x_end_without_mother, \
+                there_is_sep_with_child, \
+                y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \
+                new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order(
+                    x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some)
+
+            # show multi-column separators
+            # dbg_plt([0, None, top, bot], "multi-column separators in current split", 
+            #         list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending],
+            #                  y_mid - top, y_max - top)), True)
+
             if (reading_order_type == 1 or
-                len(y_lines_without_mother) >= 2 or
+                len(y_mid_without_mother) >= 2 or
                 there_is_sep_with_child == 1):
+                # there are top-level multi-colspan horizontal separators which overlap each other
+                # or multiple top-level multi-colspan horizontal separators
+                # or multi-colspan horizontal separators shorter than their respective top-level:
+                # todo: explain how this is dealt with
                 try:
                     y_grenze = top + 300
-                    #check if there is a big separator in this y_mains_sep_ohne_grenzen
+                    up = (y_mid > top) & (y_mid <= y_grenze)
 
-                    args_early_ys=np.arange(len(y_type_2))
+                    args_early_ys=np.arange(len(y_mid))
                     #print(args_early_ys,'args_early_ys')
-                    #print(top, bot)
+                    #print(y_mid,'y_mid')
 
-                    x_starting_up = x_starting[(y_type_2 > top) &
-                                               (y_type_2 <= y_grenze)]
-                    x_ending_up = x_ending[(y_type_2 > top) &
-                                           (y_type_2 <= y_grenze)]
-                    y_type_2_up = y_type_2[(y_type_2 > top) &
-                                           (y_type_2 <= y_grenze)]
-                    y_diff_type_2_up = y_diff_type_2[(y_type_2 > top) &
-                                                     (y_type_2 <= y_grenze)]
-                    args_up = args_early_ys[(y_type_2 > top) &
-                                            (y_type_2 <= y_grenze)]
-                    if len(y_type_2_up) > 0:
-                        y_main_separator_up = y_type_2_up [(x_starting_up==0) &
-                                                           (x_ending_up==(len(peaks_neg_tot)-1) )]
-                        y_diff_main_separator_up = y_diff_type_2_up[(x_starting_up==0) &
-                                                                    (x_ending_up==(len(peaks_neg_tot)-1) )]
-                        args_main_to_deleted = args_up[(x_starting_up==0) &
-                                                       (x_ending_up==(len(peaks_neg_tot)-1) )]
-                        #print(y_main_separator_up,y_diff_main_separator_up,args_main_to_deleted,'fffffjammmm')
-                        if len(y_diff_main_separator_up) > 0:
+                    x_starting_up = x_starting[up]
+                    x_ending_up = x_ending[up]
+                    y_mid_up = y_mid[up]
+                    y_max_up = y_max[up]
+                    args_up = args_early_ys[up]
+                    #print(args_up,'args_up')
+                    #print(y_mid_up,'y_mid_up')
+                    #check if there is a big separator in this y_mains0
+                    if len(y_mid_up) > 0:
+                        # is there a separator with full-width span?
+                        main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1)
+                        y_mid_main_separator_up = y_mid_up[main_separator]
+                        y_max_main_separator_up = y_max_up[main_separator]
+                        args_main_to_deleted = args_up[main_separator]
+                        #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm')
+                        if len(y_max_main_separator_up):
                             args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) ))
                             #print(args_to_be_kept,'args_to_be_kept')
-                            boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1],
-                                          top, y_diff_main_separator_up.max()])
-                            # dbg_plt(boxes[-1], "first box")
-                            top = y_diff_main_separator_up.max()
+                            boxes.append([0, peaks_neg_tot[-1],
+                                          top, y_max_main_separator_up.max()])
+                            # dbg_plt(boxes[-1], "near top main separator box")
+                            top = y_max_main_separator_up.max()
 
                             #print(top,'top')
-                            y_type_2 = y_type_2[args_to_be_kept]
+                            y_mid = y_mid[args_to_be_kept]
                             x_starting = x_starting[args_to_be_kept]
                             x_ending = x_ending[args_to_be_kept]
-                            y_diff_type_2 = y_diff_type_2[args_to_be_kept]
+                            y_max = y_max[args_to_be_kept]
 
                             #print('galdiha')
                             y_grenze = top + 200
-                            args_early_ys2=np.arange(len(y_type_2))
-                            y_type_2_up=y_type_2[(y_type_2 > top) &
-                                                 (y_type_2 <= y_grenze)]
-                            x_starting_up=x_starting[(y_type_2 > top) &
-                                                     (y_type_2 <= y_grenze)]
-                            x_ending_up=x_ending[(y_type_2 > top) &
-                                                 (y_type_2 <= y_grenze)]
-                            y_diff_type_2_up=y_diff_type_2[(y_type_2 > top) &
-                                                           (y_type_2 <= y_grenze)]
-                            args_up2=args_early_ys2[(y_type_2 > top) &
-                                                    (y_type_2 <= y_grenze)]
-                            #print(y_type_2_up,x_starting_up,x_ending_up,'didid')
-                            nodes_in = set()
-                            for ij in range(len(x_starting_up)):
-                                nodes_in.update(range(x_starting_up[ij],
-                                                      x_ending_up[ij]))
-                            #print(nodes_in,'nodes_in')
+                            up = (y_mid > top) & (y_mid <= y_grenze)
+                            args_early_ys2 = np.arange(len(y_mid))
+                            x_starting_up = x_starting[up]
+                            x_ending_up = x_ending[up]
+                            y_mid_up = y_mid[up]
+                            y_max_up = y_max[up]
+                            args_up2 = args_early_ys2[up]
+                            #print(y_mid_up,x_starting_up,x_ending_up,'didid')
+                        else:
+                            args_early_ys2 = args_early_ys
+                            args_up2 = args_up
 
-                            if nodes_in == set(range(len(peaks_neg_tot)-1)):
-                                pass
-                            elif nodes_in == set(range(1, len(peaks_neg_tot)-1)):
-                                pass
-                            else:
-                                #print('burdaydikh')
-                                args_to_be_kept2=np.array(list( set(args_early_ys2)-set(args_up2) ))
+                        nodes_in = set()
+                        for ij in range(len(x_starting_up)):
+                            nodes_in.update(range(x_starting_up[ij],
+                                                  x_ending_up[ij]))
+                        #print(nodes_in,'nodes_in')
+                        #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))')
 
-                                if len(args_to_be_kept2)>0:
-                                    y_type_2 = y_type_2[args_to_be_kept2]
-                                    x_starting = x_starting[args_to_be_kept2]
-                                    x_ending = x_ending[args_to_be_kept2]
-                                    y_diff_type_2 = y_diff_type_2[args_to_be_kept2]
-                                else:
-                                    pass
-                                #print('burdaydikh2')
-                        elif len(y_diff_main_separator_up)==0:
-                            nodes_in = set()
-                            for ij in range(len(x_starting_up)):
-                                nodes_in.update(range(x_starting_up[ij],
-                                                      x_ending_up[ij]))
-                            #print(nodes_in,'nodes_in2')
-                            #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))')
+                        if nodes_in == set(range(len(peaks_neg_tot)-1)):
+                            pass
+                        elif nodes_in == set(range(1, len(peaks_neg_tot)-1)):
+                            pass
+                        else:
+                            #print('burdaydikh')
+                            args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) ))
 
-                            if nodes_in == set(range(len(peaks_neg_tot)-1)):
-                                pass
-                            elif nodes_in == set(range(1,len(peaks_neg_tot)-1)):
-                                pass
-                            else:
-                                #print('burdaydikh')
-                                #print(args_early_ys,'args_early_ys')
-                                #print(args_up,'args_up')
-                                args_to_be_kept2=np.array(list( set(args_early_ys) - set(args_up) ))
-
-                                #print(args_to_be_kept2,'args_to_be_kept2')
-                                #print(len(y_type_2),len(x_starting),len(x_ending),len(y_diff_type_2))
-                                if len(args_to_be_kept2)>0:
-                                    y_type_2 = y_type_2[args_to_be_kept2]
-                                    x_starting = x_starting[args_to_be_kept2]
-                                    x_ending = x_ending[args_to_be_kept2]
-                                    y_diff_type_2 = y_diff_type_2[args_to_be_kept2]
-                                else:
-                                    pass
-                                #print('burdaydikh2')
+                            if len(args_to_be_kept2):
+                                #print(args_to_be_kept2, "args_to_be_kept2")
+                                y_mid = y_mid[args_to_be_kept2]
+                                x_starting = x_starting[args_to_be_kept2]
+                                x_ending = x_ending[args_to_be_kept2]
+                                y_max = y_max[args_to_be_kept2]
 
                     #int(top)
-                    y_lines_by_order=[]
+                    # order multi-column separators
+                    y_mid_by_order=[]
                     x_start_by_order=[]
                     x_end_by_order=[]
                     if (reading_order_type == 1 or
                         len(x_end_with_child_without_mother) == 0):
                         if reading_order_type == 1:
-                            y_lines_by_order.append(top)
+                            # there are top-level multi-colspan horizontal separators which overlap each other
+                            #print("adding all columns at top because of multiple overlapping mothers")
+                            y_mid_by_order.append(top)
                             x_start_by_order.append(0)
                             x_end_by_order.append(len(peaks_neg_tot)-2)
                         else:
+                            # there are no top-level multi-colspan horizontal separators which themselves
+                            # contain shorter multi-colspan separators
                             #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo')
                             columns_covered_by_mothers = set()
                             for dj in range(len(x_start_without_mother)):
@@ -1845,31 +1841,32 @@ def return_boxes_of_images_by_order_of_reading_new(
                                     range(x_start_without_mother[dj],
                                           x_end_without_mother[dj]))
                             columns_not_covered = list(all_columns - columns_covered_by_mothers)
-                            y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) +
-                                                                   len(x_start_without_mother),
-                                                                   dtype=int) * top)
-                            ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered))
+                            #print(columns_covered_by_mothers, "columns_covered_by_mothers")
+                            #print(columns_not_covered, "columns_not_covered")
+                            y_mid = np.append(y_mid, np.ones(len(columns_not_covered) +
+                                                             len(x_start_without_mother),
+                                                             dtype=int) * top)
+                            ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered))
                             ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
                             x_starting = np.append(x_starting, np.array(columns_not_covered, int))
                             x_starting = np.append(x_starting, x_start_without_mother)
                             x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1)
                             x_ending = np.append(x_ending, x_end_without_mother)
 
-                        ind_args=np.arange(len(y_type_2))
-                        #ind_args=np.array(ind_args)
+                        ind_args=np.arange(len(y_mid))
                         #print(ind_args,'ind_args')
                         for column in range(len(peaks_neg_tot)-1):
                             #print(column,'column')
                             ind_args_in_col=ind_args[x_starting==column]
                             #print('babali2')
                             #print(ind_args_in_col,'ind_args_in_col')
-                            #print(len(y_type_2))
-                            y_column=y_type_2[ind_args_in_col]
+                            #print(len(y_mid))
+                            y_mid_column=y_mid[ind_args_in_col]
                             x_start_column=x_starting[ind_args_in_col]
                             x_end_column=x_ending[ind_args_in_col]
                             #print('babali3')
-                            ind_args_col_sorted=np.argsort(y_column)
-                            y_lines_by_order.extend(y_column[ind_args_col_sorted])
+                            ind_args_col_sorted=np.argsort(y_mid_column)
+                            y_mid_by_order.extend(y_mid_column[ind_args_col_sorted])
                             x_start_by_order.extend(x_start_column[ind_args_col_sorted])
                             x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
                     else:
@@ -1880,93 +1877,113 @@ def return_boxes_of_images_by_order_of_reading_new(
                                 range(x_start_without_mother[dj],
                                       x_end_without_mother[dj]))
                         columns_not_covered = list(all_columns - columns_covered_by_mothers)
-                        y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother),
-                                                               dtype=int) * top)
-                        ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered))
+                        #print(columns_covered_by_mothers, "columns_covered_by_mothers")
+                        #print(columns_not_covered, "columns_not_covered")
+                        y_mid = np.append(y_mid, np.ones(len(columns_not_covered) +
+                                                         len(x_start_without_mother),
+                                                         dtype=int) * top)
+                        ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered))
                         ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
                         x_starting = np.append(x_starting, np.array(columns_not_covered, int))
                         x_starting = np.append(x_starting, x_start_without_mother)
                         x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1)
                         x_ending = np.append(x_ending, x_end_without_mother)
 
-                        columns_covered_by_with_child_no_mothers = set()
+                        columns_covered_by_mothers_with_child = set()
                         for dj in range(len(x_end_with_child_without_mother)):
-                            columns_covered_by_with_child_no_mothers.update(
+                            columns_covered_by_mothers_with_child.update(
                                 range(x_start_with_child_without_mother[dj],
                                       x_end_with_child_without_mother[dj]))
-                        columns_not_covered_child_no_mother = list(
-                            all_columns - columns_covered_by_with_child_no_mothers)
+                        #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child")
+                        columns_not_covered_by_mothers_with_child = list(
+                            all_columns - columns_covered_by_mothers_with_child)
                         #indexes_to_be_spanned=[]
                         for i_s in range(len(x_end_with_child_without_mother)):
-                            columns_not_covered_child_no_mother.append(x_start_with_child_without_mother[i_s])
-                        columns_not_covered_child_no_mother = np.sort(columns_not_covered_child_no_mother)
-                        ind_args = np.arange(len(y_type_2))
-                        x_end_with_child_without_mother = np.array(x_end_with_child_without_mother, int)
-                        x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int)
-                        for i_s_nc in columns_not_covered_child_no_mother:
+                            columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s])
+                        columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child)
+                        #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child")
+                        ind_args = np.arange(len(y_mid))
+                        for i_s_nc in columns_not_covered_by_mothers_with_child:
                             if i_s_nc in x_start_with_child_without_mother:
+                                # use only seps with mother's span ("biggest")
                                 #print("i_s_nc", i_s_nc)
                                 x_end_biggest_column = \
-                                    x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0]
-                                args_all_biggest_lines = ind_args[(x_starting==i_s_nc) &
-                                                                  (x_ending==x_end_biggest_column)]
-                                y_column_nc = y_type_2[args_all_biggest_lines]
-                                #x_start_column_nc = x_starting[args_all_biggest_lines]
-                                #x_end_column_nc = x_ending[args_all_biggest_lines]
-                                y_column_nc = np.sort(y_column_nc)
-                                for i_c in range(len(y_column_nc)):
+                                    x_end_with_child_without_mother[
+                                        x_start_with_child_without_mother == i_s_nc][0]
+                                args_all_biggest_seps = \
+                                    ind_args[(x_starting == i_s_nc) &
+                                             (x_ending == x_end_biggest_column)]
+                                y_mid_column_nc = y_mid[args_all_biggest_seps]
+                                #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child")
+                                #x_start_column_nc = x_starting[args_all_biggest_seps]
+                                #x_end_column_nc = x_ending[args_all_biggest_seps]
+                                y_mid_column_nc = np.sort(y_mid_column_nc)
+                                #print(y_mid_column_nc, "y_mid_column_nc (sorted)")
+                                for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)):
                                     #print("i_c", i_c)
-                                    ind_all_lines_between_nm_wc = \
-                                        ind_args[(y_type_2 > y_column_nc[i_c]) &
-                                                 (y_type_2 < (y_column_nc[i_c+1]
-                                                              if i_c < len(y_column_nc)-1
-                                                              else bot)) &
+                                    #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc")
+                                    ind_all_seps_between_nm_wc = \
+                                        ind_args[(y_mid > nc_top) &
+                                                 (y_mid < nc_bot) &
                                                  (x_starting >= i_s_nc) &
                                                  (x_ending <= x_end_biggest_column)]
-                                    y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc]
-                                    x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc]
-                                    x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc]
+                                    y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc]
+                                    x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc]
+                                    x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc]
 
                                     columns_covered_by_mothers = set()
-                                    for dj in range(len(ind_all_lines_between_nm_wc)):
+                                    for dj in range(len(ind_all_seps_between_nm_wc)):
                                         columns_covered_by_mothers.update(
                                             range(x_starting_all_between_nm_wc[dj],
                                                   x_ending_all_between_nm_wc[dj]))
+                                    #print(columns_covered_by_mothers, "columns_covered_by_mothers")
                                     child_columns = set(range(i_s_nc, x_end_biggest_column))
                                     columns_not_covered = list(child_columns - columns_covered_by_mothers)
+                                    #print(child_columns, "child_columns")
+                                    #print(columns_not_covered, "columns_not_covered")
 
-                                    if len(ind_all_lines_between_nm_wc):
+                                    if len(ind_all_seps_between_nm_wc):
                                         biggest = np.argmax(x_ending_all_between_nm_wc -
                                                             x_starting_all_between_nm_wc)
+                                        #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc")
+                                        #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest],
+                                        #                          x_ending_all_between_nm_wc[biggest]), "biggest")
                                         if columns_covered_by_mothers == set(
                                                 range(x_starting_all_between_nm_wc[biggest],
                                                       x_ending_all_between_nm_wc[biggest])):
-                                            # biggest accounts for all columns alone,
-                                            # longest line should be extended
-                                            lines_so_close_to_top_separator = \
-                                                ((y_all_between_nm_wc > y_column_nc[i_c]) &
-                                                 (y_all_between_nm_wc <= y_column_nc[i_c] + 500))
-                                            if (np.count_nonzero(lines_so_close_to_top_separator) and
-                                                np.count_nonzero(lines_so_close_to_top_separator) <
-                                                len(ind_all_lines_between_nm_wc)):
-                                                y_all_between_nm_wc = \
-                                                    y_all_between_nm_wc[~lines_so_close_to_top_separator]
+                                            # single biggest accounts for all covered columns alone,
+                                            # this separator should be extended to cover all
+                                            seps_too_close_to_top_separator = \
+                                                ((y_mid_all_between_nm_wc > nc_top) &
+                                                 (y_mid_all_between_nm_wc <= nc_top + 500))
+                                            if (np.count_nonzero(seps_too_close_to_top_separator) and
+                                                np.count_nonzero(seps_too_close_to_top_separator) <
+                                                len(ind_all_seps_between_nm_wc)):
+                                                #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator")
+                                                y_mid_all_between_nm_wc = \
+                                                    y_mid_all_between_nm_wc[~seps_too_close_to_top_separator]
                                                 x_starting_all_between_nm_wc = \
-                                                    x_starting_all_between_nm_wc[~lines_so_close_to_top_separator]
+                                                    x_starting_all_between_nm_wc[~seps_too_close_to_top_separator]
                                                 x_ending_all_between_nm_wc = \
-                                                    x_ending_all_between_nm_wc[~lines_so_close_to_top_separator]
+                                                    x_ending_all_between_nm_wc[~seps_too_close_to_top_separator]
 
-                                            y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c])
-                                            x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc)
-                                            x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column)
+                                            y_mid_all_between_nm_wc = np.append(
+                                                y_mid_all_between_nm_wc, nc_top)
+                                            x_starting_all_between_nm_wc = np.append(
+                                                x_starting_all_between_nm_wc, i_s_nc)
+                                            x_ending_all_between_nm_wc = np.append(
+                                                x_ending_all_between_nm_wc, x_end_biggest_column)
                                         else:
-                                            y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c])
-                                            x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest])
-                                            x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest])
+                                            y_mid_all_between_nm_wc = np.append(
+                                                y_mid_all_between_nm_wc, nc_top)
+                                            x_starting_all_between_nm_wc = np.append(
+                                                x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest])
+                                            x_ending_all_between_nm_wc = np.append(
+                                                x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest])
 
                                     if len(columns_not_covered):
-                                        y_all_between_nm_wc = np.append(
-                                            y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered))
+                                        y_mid_all_between_nm_wc = np.append(
+                                            y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered))
                                         x_starting_all_between_nm_wc = np.append(
                                             x_starting_all_between_nm_wc, np.array(columns_not_covered, int))
                                         x_ending_all_between_nm_wc = np.append(
@@ -1977,52 +1994,53 @@ def return_boxes_of_images_by_order_of_reading_new(
                                         ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column]
                                         #print('babali2')
                                         #print(ind_args_in_col,'ind_args_in_col')
-                                        #print(len(y_type_2))
-                                        y_column=y_all_between_nm_wc[ind_args_in_col]
+                                        #print(len(y_mid))
+                                        y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col]
                                         x_start_column=x_starting_all_between_nm_wc[ind_args_in_col]
                                         x_end_column=x_ending_all_between_nm_wc[ind_args_in_col]
                                         #print('babali3')
-                                        ind_args_col_sorted=np.argsort(y_column)
-                                        y_lines_by_order.extend(y_column[ind_args_col_sorted])
+                                        ind_args_col_sorted=np.argsort(y_mid_column)
+                                        y_mid_by_order.extend(y_mid_column[ind_args_col_sorted])
                                         x_start_by_order.extend(x_start_column[ind_args_col_sorted])
                                         x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
                             else:
-                                #print(column,'column')
+                                #print(i_s_nc,'column not covered by mothers with child')
                                 ind_args_in_col=ind_args[x_starting==i_s_nc]
                                 #print('babali2')
                                 #print(ind_args_in_col,'ind_args_in_col')
-                                #print(len(y_type_2))
-                                y_column=y_type_2[ind_args_in_col]
+                                #print(len(y_mid))
+                                y_mid_column=y_mid[ind_args_in_col]
                                 x_start_column=x_starting[ind_args_in_col]
                                 x_end_column=x_ending[ind_args_in_col]
                                 #print('babali3')
-                                ind_args_col_sorted = np.argsort(y_column)
-                                y_lines_by_order.extend(y_column[ind_args_col_sorted])
+                                ind_args_col_sorted = np.argsort(y_mid_column)
+                                y_mid_by_order.extend(y_mid_column[ind_args_col_sorted])
                                 x_start_by_order.extend(x_start_column[ind_args_col_sorted])
                                 x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
 
-                    y_lines_by_order = np.array(y_lines_by_order)
+                    # create single-column boxes from multi-column separators
+                    y_mid_by_order = np.array(y_mid_by_order)
                     x_start_by_order = np.array(x_start_by_order)
                     x_end_by_order = np.array(x_end_by_order)
-                    for il in range(len(y_lines_by_order)):
+                    for il in range(len(y_mid_by_order)):
                         #print(il, "il")
-                        y_itself = y_lines_by_order[il]
+                        y_mid_itself = y_mid_by_order[il]
                         x_start_itself = x_start_by_order[il]
                         x_end_itself = x_end_by_order[il]
                         for column in range(int(x_start_itself), int(x_end_itself)+1):
                             #print(column,'cols')
-                            y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) &
-                                                         (column >= x_start_by_order) &
-                                                         (column <= x_end_by_order)]
                             #print('burda')
-                            y_down = y_in_cols.min(initial=bot)
                             #print('burda2')
-                            #print(y_in_cols,'y_in_cols')
-                            #print(y_itself,'y_itself')
+                            y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) &
+                                                        (column >= x_start_by_order) &
+                                                        (column <= x_end_by_order)]
+                            y_mid_next = y_mid_next.min(initial=bot)
+                            #print(y_mid_next,'y_mid_next')
+                            #print(y_mid_itself,'y_mid_itself')
                             boxes.append([peaks_neg_tot[column],
                                           peaks_neg_tot[column+1],
-                                          y_itself,
-                                          y_down])
+                                          y_mid_itself,
+                                          y_mid_next])
                             # dbg_plt(boxes[-1], "A column %d box" % (column + 1))
                 except:
                     logger.exception("cannot assign boxes")
@@ -2030,20 +2048,21 @@ def return_boxes_of_images_by_order_of_reading_new(
                                   top, bot])
                     # dbg_plt(boxes[-1], "fallback box")
             else:
-                y_lines_by_order=[]
+                # order multi-column separators
+                y_mid_by_order=[]
                 x_start_by_order=[]
                 x_end_by_order=[]
                 if len(x_starting)>0:
-                    columns_covered_by_lines_covered_more_than_2col = set()
+                    columns_covered_by_seps_covered_more_than_2col = set()
                     for dj in range(len(x_starting)):
                         if set(range(x_starting[dj], x_ending[dj])) != all_columns:
-                            columns_covered_by_lines_covered_more_than_2col.update(
+                            columns_covered_by_seps_covered_more_than_2col.update(
                                 range(x_starting[dj], x_ending[dj]))
-                    columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col)
+                    columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col)
 
-                    y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1,
-                                                           dtype=int) * top)
-                    ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered))
+                    y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1,
+                                                     dtype=int) * top)
+                    ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered))
                     ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
                     x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype))
                     x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1)
@@ -2055,53 +2074,52 @@ def return_boxes_of_images_by_order_of_reading_new(
                         x_ending = np.append(x_ending, x_ending[0])
                 else:
                     columns_not_covered = list(all_columns)
-                    y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered),
-                                                           dtype=int) * top)
-                    ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered))
+                    y_mid = np.append(y_mid, np.ones(len(columns_not_covered),
+                                                     dtype=int) * top)
+                    ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered))
                     ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
                     x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype))
                     x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1)
 
-                ind_args = np.arange(len(y_type_2))
-                
+                ind_args = np.arange(len(y_mid))
+
                 for column in range(len(peaks_neg_tot)-1):
                     #print(column,'column')
                     ind_args_in_col=ind_args[x_starting==column]
-                    #print(len(y_type_2))
-                    y_column=y_type_2[ind_args_in_col]
+                    #print(len(y_mid))
+                    y_mid_column=y_mid[ind_args_in_col]
                     x_start_column=x_starting[ind_args_in_col]
                     x_end_column=x_ending[ind_args_in_col]
 
-                    ind_args_col_sorted = np.argsort(y_column)
-                    y_lines_by_order.extend(y_column[ind_args_col_sorted])
+                    ind_args_col_sorted = np.argsort(y_mid_column)
+                    y_mid_by_order.extend(y_mid_column[ind_args_col_sorted])
                     x_start_by_order.extend(x_start_column[ind_args_col_sorted])
                     x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
 
-                y_lines_by_order = np.array(y_lines_by_order)
+                # create single-column boxes from multi-column separators
+                y_mid_by_order = np.array(y_mid_by_order)
                 x_start_by_order = np.array(x_start_by_order)
                 x_end_by_order = np.array(x_end_by_order)
-                for il in range(len(y_lines_by_order)):
+                for il in range(len(y_mid_by_order)):
                     #print(il, "il")
-                    y_itself = y_lines_by_order[il]
-                    #print(y_itself,'y_itself')
+                    y_mid_itself = y_mid_by_order[il]
+                    #print(y_mid_itself,'y_mid_itself')
                     x_start_itself = x_start_by_order[il]
                     x_end_itself = x_end_by_order[il]
                     for column in range(x_start_itself, x_end_itself+1):
                         #print(column,'cols')
-                        y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) &
-                                                     (column >= x_start_by_order) &
-                                                     (column <= x_end_by_order)]
                         #print('burda2')
-                        #print(y_in_cols,'y_in_cols')
-                        y_down = y_in_cols.min(initial=bot)
-                        #print(y_down,'y_down')
+                        y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) &
+                                                    (column >= x_start_by_order) &
+                                                    (column <= x_end_by_order)]
+                        #print(y_mid_next,'y_mid_next')
+                        y_mid_next = y_mid_next.min(initial=bot)
+                        #print(y_mid_next,'y_mid_next')
                         boxes.append([peaks_neg_tot[column],
                                       peaks_neg_tot[column+1],
-                                      y_itself,
-                                      y_down])
+                                      y_mid_itself,
+                                      y_mid_next])
                         # dbg_plt(boxes[-1], "B column %d box" % (column + 1))
-        #else:
-            #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,top, bot])
 
     if right2left_readingorder:
         peaks_neg_tot_tables_new = []
@@ -2119,11 +2137,7 @@ def return_boxes_of_images_by_order_of_reading_new(
         peaks_neg_tot_tables = peaks_neg_tot_tables_new
 
     # show final xy-cut
-    # plt.imshow(regions_without_separators)
-    # for xmin, xmax, ymin, ymax in boxes:
-    #     plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
-    #                                           fill=False, linewidth=1, edgecolor='r'))
-    # plt.show()
+    # dbg_plt(None, "final XY-Cut", boxes, True)
 
     logger.debug('exit return_boxes_of_images_by_order_of_reading_new')
     return boxes, peaks_neg_tot_tables