return_boxes_of_images_by_order_of_reading_new: simplify

- enumeration instead of indexing
- array instead of list operations
- add better plotting (but commented out)
This commit is contained in:
Robert Sachunsky 2025-10-20 16:13:51 +02:00
parent cd35241e81
commit 7c3e418588

View file

@ -5,6 +5,7 @@ import math
try: try:
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.patches as patches
except ImportError: except ImportError:
plt = None plt = None
import numpy as np import numpy as np
@ -20,6 +21,7 @@ from .contour import (contours_in_same_horizon,
return_contours_of_image, return_contours_of_image,
return_parent_contours) return_parent_contours)
def pairwise(iterable): def pairwise(iterable):
# pairwise('ABCDEFG') → AB BC CD DE EF FG # pairwise('ABCDEFG') → AB BC CD DE EF FG
@ -205,15 +207,15 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order(
#print(x_end,'x_end') #print(x_end,'x_end')
#print(len_sep) #print(len_sep)
deleted=[] deleted = set()
for i in range(len(x_start)-1): for i in range(len(x_start)-1):
nodes_i=set(range(x_start[i],x_end[i]+1)) nodes_i=set(range(x_start[i],x_end[i]+1))
for j in range(i+1,len(x_start)): for j in range(i+1,len(x_start)):
if nodes_i==set(range(x_start[j],x_end[j]+1)): if nodes_i==set(range(x_start[j],x_end[j]+1)):
deleted.append(j) deleted.add(j)
#print(np.unique(deleted)) #print(np.unique(deleted))
remained_sep_indexes=set(range(len(x_start)))-set(np.unique(deleted) ) remained_sep_indexes = set(range(len(x_start))) - deleted
#print(remained_sep_indexes,'remained_sep_indexes') #print(remained_sep_indexes,'remained_sep_indexes')
mother=[]#if it has mother mother=[]#if it has mother
child=[] child=[]
@ -262,7 +264,7 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order(
x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother]
y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother] y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother]
reading_orther_type=0 reading_order_type=0
x_end_without_mother = x_end[remained_sep_indexes_without_mother] x_end_without_mother = x_end[remained_sep_indexes_without_mother]
x_start_without_mother = x_start[remained_sep_indexes_without_mother] x_start_without_mother = x_start[remained_sep_indexes_without_mother]
y_lines_without_mother = y_sep[remained_sep_indexes_without_mother] y_lines_without_mother = y_sep[remained_sep_indexes_without_mother]
@ -278,12 +280,11 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order(
x_end[remained_sep_indexes_without_mother[j]] x_end[remained_sep_indexes_without_mother[j]]
# + 1 # + 1
)) ))
set_diff = nodes_i - nodes_j if nodes_i - nodes_j != nodes_i:
if set_diff != nodes_i: reading_order_type = 1
reading_orther_type = 1
else: else:
reading_orther_type = 0 reading_order_type = 0
#print(reading_orther_type,'javab') #print(reading_order_type,'javab')
#print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother') #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother')
#print(x_start_with_child_without_mother,'x_start_with_child_without_mother') #print(x_start_with_child_without_mother,'x_start_with_child_without_mother')
#print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother')
@ -297,7 +298,7 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order(
#print(all_args_uniq,'all_args_uniq') #print(all_args_uniq,'all_args_uniq')
#print(args_to_be_unified,'args_to_be_unified') #print(args_to_be_unified,'args_to_be_unified')
return (reading_orther_type, return (reading_order_type,
x_start_returned, x_start_returned,
x_end_returned, x_end_returned,
y_sep_returned, y_sep_returned,
@ -1590,77 +1591,90 @@ def return_boxes_of_images_by_order_of_reading_new(
if logger is None: if logger is None:
logger = getLogger(__package__) logger = getLogger(__package__)
logger.debug('enter return_boxes_of_images_by_order_of_reading_new') logger.debug('enter return_boxes_of_images_by_order_of_reading_new')
# def dbg_plt(box=None, title=None):
# if box is None:
# box = [None, None, None, None]
# img = regions_without_separators[box[2]:box[3], box[0]:box[1]]
# plt.imshow(img)
# xrange = np.arange(0, img.shape[1], 100)
# yrange = np.arange(0, img.shape[0], 100)
# plt.gca().set_xticks(xrange, xrange + (box[0] or 0))
# plt.gca().set_yticks(yrange, yrange + (box[2] or 0))
# if title:
# plt.title(title)
# plt.show()
# dbg_plt()
boxes=[] boxes=[]
peaks_neg_tot_tables = [] peaks_neg_tot_tables = []
splitter_y_new = np.array(splitter_y_new, dtype=int) splitter_y_new = np.array(splitter_y_new, dtype=int)
for i in range(len(splitter_y_new)-1): width_tot = regions_without_separators.shape[1]
#print(splitter_y_new[i],splitter_y_new[i+1]) for top, bot in pairwise(splitter_y_new):
matrix_new = matrix_of_lines_ch[:,:][(matrix_of_lines_ch[:,6]> splitter_y_new[i] ) & # print("%d:%d" % (top, bot), 'i')
(matrix_of_lines_ch[:,7]< splitter_y_new[i+1] )] # dbg_plt([None, None, top, bot],
# "image cut for y split %d:%d" % (
# top, bot))
matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) &
(matrix_of_lines_ch[:,7] < bot)]
#print(len( matrix_new[:,9][matrix_new[:,9]==1] )) #print(len( matrix_new[:,9][matrix_new[:,9]==1] ))
#print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa')
# check to see is there any vertical separator to find holes. # check to see is there any vertical separator to find holes.
#if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and
# np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >=
# 0.1 * (np.abs(splitter_y_new[i+1]-splitter_y_new[i]))): # 0.1 * (np.abs(bot-top))):
if True: if True:
try: try:
num_col, peaks_neg_fin = find_num_col( num_col, peaks_neg_fin = find_num_col(
regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], :], regions_without_separators[top:bot],
num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.) num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.)
except: except:
peaks_neg_fin=[] peaks_neg_fin=[]
num_col = 0 num_col = 0
try: try:
if (len(peaks_neg_fin)+1)<num_col_classifier or num_col_classifier==6: if (len(peaks_neg_fin)+1)<num_col_classifier or num_col_classifier==6:
# found too few columns here
#print('burda') #print('burda')
peaks_neg_fin_org = np.copy(peaks_neg_fin) peaks_neg_fin_org = np.copy(peaks_neg_fin)
#print("peaks_neg_fin_org", peaks_neg_fin_org)
if len(peaks_neg_fin)==0: if len(peaks_neg_fin)==0:
num_col, peaks_neg_fin = find_num_col( num_col, peaks_neg_fin = find_num_col(
regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], :], regions_without_separators[top:bot],
num_col_classifier, tables, multiplier=3.) num_col_classifier, tables, multiplier=3.)
peaks_neg_fin_early=[]
peaks_neg_fin_early.append(0)
#print(peaks_neg_fin,'peaks_neg_fin') #print(peaks_neg_fin,'peaks_neg_fin')
for p_n in peaks_neg_fin: peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1]
peaks_neg_fin_early.append(p_n)
peaks_neg_fin_early.append(regions_without_separators.shape[1]-1)
#print(peaks_neg_fin_early,'burda2') #print(peaks_neg_fin_early,'burda2')
peaks_neg_fin_rev=[] peaks_neg_fin_rev=[]
for i_n in range(len(peaks_neg_fin_early)-1): for left, right in pairwise(peaks_neg_fin_early):
#print(i_n,'i_n') # print("%d:%d" % (left, right), 'i_n')
#plt.plot(regions_without_separators[splitter_y_new[i]: # dbg_plt([left, right, top, bot],
# splitter_y_new[i+1], # "image cut for y split %d:%d / x gap %d:%d" % (
# peaks_neg_fin_early[i_n]: # top, bot, left, right))
# peaks_neg_fin_early[i_n+1]].sum(axis=0) ) # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0))
#plt.show() # plt.title("vertical projection (sum over y)")
# plt.show()
try: try:
num_col, peaks_neg_fin1 = find_num_col( _, peaks_neg_fin1 = find_num_col(
regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], regions_without_separators[top:bot, left:right],
peaks_neg_fin_early[i_n]:peaks_neg_fin_early[i_n+1]], num_col_classifier, tables, multiplier=7.)
num_col_classifier,tables, multiplier=7.)
except: except:
peaks_neg_fin1=[] peaks_neg_fin1 = []
try: try:
num_col, peaks_neg_fin2 = find_num_col( _, peaks_neg_fin2 = find_num_col(
regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], regions_without_separators[top:bot, left:right],
peaks_neg_fin_early[i_n]:peaks_neg_fin_early[i_n+1]], num_col_classifier, tables, multiplier=5.)
num_col_classifier,tables, multiplier=5.)
except: except:
peaks_neg_fin2=[] peaks_neg_fin2 = []
if len(peaks_neg_fin1) >= len(peaks_neg_fin2):
if len(peaks_neg_fin1)>=len(peaks_neg_fin2): peaks_neg_fin = peaks_neg_fin1
peaks_neg_fin=list(np.copy(peaks_neg_fin1))
else: else:
peaks_neg_fin=list(np.copy(peaks_neg_fin2)) peaks_neg_fin = peaks_neg_fin2
peaks_neg_fin=list(np.array(peaks_neg_fin)+peaks_neg_fin_early[i_n]) peaks_neg_fin = list(np.array(peaks_neg_fin) + left)
if i_n!=(len(peaks_neg_fin_early)-2):
peaks_neg_fin_rev.append(peaks_neg_fin_early[i_n+1])
#print(peaks_neg_fin,'peaks_neg_fin') #print(peaks_neg_fin,'peaks_neg_fin')
peaks_neg_fin_rev=peaks_neg_fin_rev+peaks_neg_fin
if right < peaks_neg_fin_early[-1]:
peaks_neg_fin_rev.append(right)
peaks_neg_fin_rev.extend(peaks_neg_fin)
if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org):
peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) peaks_neg_fin=list(np.sort(peaks_neg_fin_rev))
@ -1673,21 +1687,20 @@ def return_boxes_of_images_by_order_of_reading_new(
except: except:
logger.exception("cannot find peaks consistent with columns") logger.exception("cannot find peaks consistent with columns")
#num_col, peaks_neg_fin = find_num_col( #num_col, peaks_neg_fin = find_num_col(
# regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1],:], # regions_without_separators[top:bot,:],
# multiplier=7.0) # multiplier=7.0)
x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ]
x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ]
cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ]
cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ]
arg_org_hor_some=matrix_new[:,0][ (matrix_new[:,9]==0) ]
if right2left_readingorder: if right2left_readingorder:
x_max_hor_some_new = regions_without_separators.shape[1] - x_min_hor_some x_max_hor_some_new = width_tot - x_min_hor_some
x_min_hor_some_new = regions_without_separators.shape[1] - x_max_hor_some x_min_hor_some_new = width_tot - x_max_hor_some
x_min_hor_some =list(np.copy(x_min_hor_some_new)) x_min_hor_some =list(np.copy(x_min_hor_some_new))
x_max_hor_some =list(np.copy(x_max_hor_some_new)) x_max_hor_some =list(np.copy(x_max_hor_some_new))
peaks_neg_tot=return_points_with_boundies(peaks_neg_fin,0, regions_without_separators[:,:].shape[1]) peaks_neg_tot = [0] + peaks_neg_fin + [width_tot]
peaks_neg_tot_tables.append(peaks_neg_tot) peaks_neg_tot_tables.append(peaks_neg_tot)
reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \ reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \
@ -1697,26 +1710,27 @@ def return_boxes_of_images_by_order_of_reading_new(
x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff)
all_columns = set(range(len(peaks_neg_tot) - 1)) all_columns = set(range(len(peaks_neg_tot) - 1))
if ((reading_order_type==1) or # print("all_columns", all_columns)
(reading_order_type==0 and if (reading_order_type == 1 or
(len(y_lines_without_mother)>=2 or there_is_sep_with_child==1))): len(y_lines_without_mother) >= 2 or
there_is_sep_with_child == 1):
try: try:
y_grenze = splitter_y_new[i] + 300 y_grenze = top + 300
#check if there is a big separator in this y_mains_sep_ohne_grenzen #check if there is a big separator in this y_mains_sep_ohne_grenzen
args_early_ys=np.arange(len(y_type_2)) args_early_ys=np.arange(len(y_type_2))
#print(args_early_ys,'args_early_ys') #print(args_early_ys,'args_early_ys')
#print(splitter_y_new[i], splitter_y_new[i+1]) #print(top, bot)
x_starting_up = x_starting[(y_type_2 > splitter_y_new[i]) & x_starting_up = x_starting[(y_type_2 > top) &
(y_type_2 <= y_grenze)] (y_type_2 <= y_grenze)]
x_ending_up = x_ending[(y_type_2 > splitter_y_new[i]) & x_ending_up = x_ending[(y_type_2 > top) &
(y_type_2 <= y_grenze)] (y_type_2 <= y_grenze)]
y_type_2_up = y_type_2[(y_type_2 > splitter_y_new[i]) & y_type_2_up = y_type_2[(y_type_2 > top) &
(y_type_2 <= y_grenze)] (y_type_2 <= y_grenze)]
y_diff_type_2_up = y_diff_type_2[(y_type_2 > splitter_y_new[i]) & y_diff_type_2_up = y_diff_type_2[(y_type_2 > top) &
(y_type_2 <= y_grenze)] (y_type_2 <= y_grenze)]
args_up = args_early_ys[(y_type_2 > splitter_y_new[i]) & args_up = args_early_ys[(y_type_2 > top) &
(y_type_2 <= y_grenze)] (y_type_2 <= y_grenze)]
if len(y_type_2_up) > 0: if len(y_type_2_up) > 0:
y_main_separator_up = y_type_2_up [(x_starting_up==0) & y_main_separator_up = y_type_2_up [(x_starting_up==0) &
@ -1730,27 +1744,28 @@ def return_boxes_of_images_by_order_of_reading_new(
args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) ))
#print(args_to_be_kept,'args_to_be_kept') #print(args_to_be_kept,'args_to_be_kept')
boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1],
splitter_y_new[i], y_diff_main_separator_up.max()]) top, y_diff_main_separator_up.max()])
splitter_y_new[i] = y_diff_main_separator_up.max() # dbg_plt(boxes[-1], "first box")
top = y_diff_main_separator_up.max()
#print(splitter_y_new[i],'splitter_y_new[i]') #print(top,'top')
y_type_2 = y_type_2[args_to_be_kept] y_type_2 = y_type_2[args_to_be_kept]
x_starting = x_starting[args_to_be_kept] x_starting = x_starting[args_to_be_kept]
x_ending = x_ending[args_to_be_kept] x_ending = x_ending[args_to_be_kept]
y_diff_type_2 = y_diff_type_2[args_to_be_kept] y_diff_type_2 = y_diff_type_2[args_to_be_kept]
#print('galdiha') #print('galdiha')
y_grenze = splitter_y_new[i] + 200 y_grenze = top + 200
args_early_ys2=np.arange(len(y_type_2)) args_early_ys2=np.arange(len(y_type_2))
y_type_2_up=y_type_2[(y_type_2 > splitter_y_new[i]) & y_type_2_up=y_type_2[(y_type_2 > top) &
(y_type_2 <= y_grenze)] (y_type_2 <= y_grenze)]
x_starting_up=x_starting[(y_type_2 > splitter_y_new[i]) & x_starting_up=x_starting[(y_type_2 > top) &
(y_type_2 <= y_grenze)] (y_type_2 <= y_grenze)]
x_ending_up=x_ending[(y_type_2 > splitter_y_new[i]) & x_ending_up=x_ending[(y_type_2 > top) &
(y_type_2 <= y_grenze)] (y_type_2 <= y_grenze)]
y_diff_type_2_up=y_diff_type_2[(y_type_2 > splitter_y_new[i]) & y_diff_type_2_up=y_diff_type_2[(y_type_2 > top) &
(y_type_2 <= y_grenze)] (y_type_2 <= y_grenze)]
args_up2=args_early_ys2[(y_type_2 > splitter_y_new[i]) & args_up2=args_early_ys2[(y_type_2 > top) &
(y_type_2 <= y_grenze)] (y_type_2 <= y_grenze)]
#print(y_type_2_up,x_starting_up,x_ending_up,'didid') #print(y_type_2_up,x_starting_up,x_ending_up,'didid')
nodes_in = set() nodes_in = set()
@ -1804,13 +1819,14 @@ def return_boxes_of_images_by_order_of_reading_new(
pass pass
#print('burdaydikh2') #print('burdaydikh2')
#int(splitter_y_new[i]) #int(top)
y_lines_by_order=[] y_lines_by_order=[]
x_start_by_order=[] x_start_by_order=[]
x_end_by_order=[] x_end_by_order=[]
if (len(x_end_with_child_without_mother)==0 and reading_order_type==0) or reading_order_type==1: if (reading_order_type == 1 or
if reading_order_type==1: len(x_end_with_child_without_mother) == 0):
y_lines_by_order.append(splitter_y_new[i]) if reading_order_type == 1:
y_lines_by_order.append(top)
x_start_by_order.append(0) x_start_by_order.append(0)
x_end_by_order.append(len(peaks_neg_tot)-2) x_end_by_order.append(len(peaks_neg_tot)-2)
else: else:
@ -1823,8 +1839,8 @@ def return_boxes_of_images_by_order_of_reading_new(
columns_not_covered = list(all_columns - columns_covered_by_mothers) columns_not_covered = list(all_columns - columns_covered_by_mothers)
y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) +
len(x_start_without_mother), len(x_start_without_mother),
dtype=int) * splitter_y_new[i]) dtype=int) * top)
##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered))
##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, np.array(columns_not_covered, int))
x_starting = np.append(x_starting, x_start_without_mother) x_starting = np.append(x_starting, x_start_without_mother)
@ -1839,22 +1855,15 @@ def return_boxes_of_images_by_order_of_reading_new(
ind_args_in_col=ind_args[x_starting==column] ind_args_in_col=ind_args[x_starting==column]
#print('babali2') #print('babali2')
#print(ind_args_in_col,'ind_args_in_col') #print(ind_args_in_col,'ind_args_in_col')
ind_args_in_col=np.array(ind_args_in_col)
#print(len(y_type_2)) #print(len(y_type_2))
y_column=y_type_2[ind_args_in_col] y_column=y_type_2[ind_args_in_col]
x_start_column=x_starting[ind_args_in_col] x_start_column=x_starting[ind_args_in_col]
x_end_column=x_ending[ind_args_in_col] x_end_column=x_ending[ind_args_in_col]
#print('babali3') #print('babali3')
ind_args_col_sorted=np.argsort(y_column) ind_args_col_sorted=np.argsort(y_column)
y_col_sort=y_column[ind_args_col_sorted] y_lines_by_order.extend(y_column[ind_args_col_sorted])
x_start_column_sort=x_start_column[ind_args_col_sorted] x_start_by_order.extend(x_start_column[ind_args_col_sorted])
x_end_column_sort=x_end_column[ind_args_col_sorted] x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
#print('babali4')
for ii in range(len(y_col_sort)):
#print('babali5')
y_lines_by_order.append(y_col_sort[ii])
x_start_by_order.append(x_start_column_sort[ii])
x_end_by_order.append(x_end_column_sort[ii]-1)
else: else:
#print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo')
columns_covered_by_mothers = set() columns_covered_by_mothers = set()
@ -1864,8 +1873,8 @@ def return_boxes_of_images_by_order_of_reading_new(
x_end_without_mother[dj])) x_end_without_mother[dj]))
columns_not_covered = list(all_columns - columns_covered_by_mothers) columns_not_covered = list(all_columns - columns_covered_by_mothers)
y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother),
dtype=int) * splitter_y_new[i]) dtype=int) * top)
##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered))
##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, np.array(columns_not_covered, int))
x_starting = np.append(x_starting, x_start_without_mother) x_starting = np.append(x_starting, x_start_without_mother)
@ -1888,25 +1897,24 @@ def return_boxes_of_images_by_order_of_reading_new(
x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int)
for i_s_nc in columns_not_covered_child_no_mother: for i_s_nc in columns_not_covered_child_no_mother:
if i_s_nc in x_start_with_child_without_mother: if i_s_nc in x_start_with_child_without_mother:
#print("i_s_nc", i_s_nc)
x_end_biggest_column = \ x_end_biggest_column = \
x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0]
args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & args_all_biggest_lines = ind_args[(x_starting==i_s_nc) &
(x_ending==x_end_biggest_column)] (x_ending==x_end_biggest_column)]
y_column_nc = y_type_2[args_all_biggest_lines] y_column_nc = y_type_2[args_all_biggest_lines]
x_start_column_nc = x_starting[args_all_biggest_lines] #x_start_column_nc = x_starting[args_all_biggest_lines]
x_end_column_nc = x_ending[args_all_biggest_lines] #x_end_column_nc = x_ending[args_all_biggest_lines]
y_column_nc = np.sort(y_column_nc) y_column_nc = np.sort(y_column_nc)
for i_c in range(len(y_column_nc)): for i_c in range(len(y_column_nc)):
if i_c==(len(y_column_nc)-1): #print("i_c", i_c)
ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & ind_all_lines_between_nm_wc = \
(y_type_2<splitter_y_new[i+1]) & ind_args[(y_type_2 > y_column_nc[i_c]) &
(x_starting>=i_s_nc) & (y_type_2 < (y_column_nc[i_c+1]
(x_ending<=x_end_biggest_column)] if i_c < len(y_column_nc)-1
else: else bot)) &
ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & (x_starting >= i_s_nc) &
(y_type_2<y_column_nc[i_c+1]) & (x_ending <= x_end_biggest_column)]
(x_starting>=i_s_nc) &
(x_ending<=x_end_biggest_column)]
y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc] y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc]
x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc]
x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc]
@ -1965,78 +1973,58 @@ def return_boxes_of_images_by_order_of_reading_new(
ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column]
#print('babali2') #print('babali2')
#print(ind_args_in_col,'ind_args_in_col') #print(ind_args_in_col,'ind_args_in_col')
ind_args_in_col=np.array(ind_args_in_col)
#print(len(y_type_2)) #print(len(y_type_2))
y_column=y_all_between_nm_wc[ind_args_in_col] y_column=y_all_between_nm_wc[ind_args_in_col]
x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] x_start_column=x_starting_all_between_nm_wc[ind_args_in_col]
x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] x_end_column=x_ending_all_between_nm_wc[ind_args_in_col]
#print('babali3') #print('babali3')
ind_args_col_sorted=np.argsort(y_column) ind_args_col_sorted=np.argsort(y_column)
y_col_sort=y_column[ind_args_col_sorted] y_lines_by_order.extend(y_column[ind_args_col_sorted])
x_start_column_sort=x_start_column[ind_args_col_sorted] x_start_by_order.extend(x_start_column[ind_args_col_sorted])
x_end_column_sort=x_end_column[ind_args_col_sorted] x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
#print('babali4')
for ii in range(len(y_col_sort)):
#print('babali5')
y_lines_by_order.append(y_col_sort[ii])
x_start_by_order.append(x_start_column_sort[ii])
x_end_by_order.append(x_end_column_sort[ii]-1)
else: else:
#print(column,'column') #print(column,'column')
ind_args_in_col=ind_args[x_starting==i_s_nc] ind_args_in_col=ind_args[x_starting==i_s_nc]
#print('babali2') #print('babali2')
#print(ind_args_in_col,'ind_args_in_col') #print(ind_args_in_col,'ind_args_in_col')
ind_args_in_col=np.array(ind_args_in_col)
#print(len(y_type_2)) #print(len(y_type_2))
y_column=y_type_2[ind_args_in_col] y_column=y_type_2[ind_args_in_col]
x_start_column=x_starting[ind_args_in_col] x_start_column=x_starting[ind_args_in_col]
x_end_column=x_ending[ind_args_in_col] x_end_column=x_ending[ind_args_in_col]
#print('babali3') #print('babali3')
ind_args_col_sorted=np.argsort(y_column) ind_args_col_sorted = np.argsort(y_column)
y_col_sort=y_column[ind_args_col_sorted] y_lines_by_order.extend(y_column[ind_args_col_sorted])
x_start_column_sort=x_start_column[ind_args_col_sorted] x_start_by_order.extend(x_start_column[ind_args_col_sorted])
x_end_column_sort=x_end_column[ind_args_col_sorted] x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
#print('babali4')
for ii in range(len(y_col_sort)):
y_lines_by_order.append(y_col_sort[ii])
x_start_by_order.append(x_start_column_sort[ii])
x_end_by_order.append(x_end_column_sort[ii]-1)
y_lines_by_order = np.array(y_lines_by_order)
x_start_by_order = np.array(x_start_by_order)
x_end_by_order = np.array(x_end_by_order)
for il in range(len(y_lines_by_order)): for il in range(len(y_lines_by_order)):
y_copy = list(y_lines_by_order) #print(il, "il")
x_start_copy = list(x_start_by_order) y_itself = y_lines_by_order[il]
x_end_copy = list(x_end_by_order) x_start_itself = x_start_by_order[il]
x_end_itself = x_end_by_order[il]
#print(y_copy,'y_copy')
y_itself=y_copy.pop(il)
x_start_itself=x_start_copy.pop(il)
x_end_itself=x_end_copy.pop(il)
#print(y_copy,'y_copy2')
for column in range(int(x_start_itself), int(x_end_itself)+1): for column in range(int(x_start_itself), int(x_end_itself)+1):
#print(column,'cols') #print(column,'cols')
y_in_cols=[] y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) &
for yic in range(len(y_copy)): (column >= x_start_by_order) &
(column <= x_end_by_order)]
#print('burda') #print('burda')
if (y_copy[yic]>y_itself and y_down = y_in_cols.min(initial=bot)
column>=x_start_copy[yic] and
column<=x_end_copy[yic]):
y_in_cols.append(y_copy[yic])
#print('burda2') #print('burda2')
#print(y_in_cols,'y_in_cols') #print(y_in_cols,'y_in_cols')
if len(y_in_cols)>0:
y_down=np.min(y_in_cols)
else:
y_down=splitter_y_new[i+1]
#print(y_itself,'y_itself') #print(y_itself,'y_itself')
boxes.append([peaks_neg_tot[column], boxes.append([peaks_neg_tot[column],
peaks_neg_tot[column+1], peaks_neg_tot[column+1],
y_itself, y_itself,
y_down]) y_down])
# dbg_plt(boxes[-1], "A column %d box" % (column + 1))
except: except:
logger.exception("cannot assign boxes") logger.exception("cannot assign boxes")
boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1],
splitter_y_new[i], splitter_y_new[i+1]]) top, bot])
# dbg_plt(boxes[-1], "fallback box")
else: else:
y_lines_by_order=[] y_lines_by_order=[]
x_start_by_order=[] x_start_by_order=[]
@ -2050,8 +2038,8 @@ def return_boxes_of_images_by_order_of_reading_new(
columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col) columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col)
y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1,
dtype=int) * splitter_y_new[i]) dtype=int) * top)
##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered))
##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype))
x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1)
@ -2064,8 +2052,8 @@ def return_boxes_of_images_by_order_of_reading_new(
else: else:
columns_not_covered = list(all_columns) columns_not_covered = list(all_columns)
y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered),
dtype=int) * splitter_y_new[i]) dtype=int) * top)
##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered))
##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered))
x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype))
x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1)
@ -2075,71 +2063,64 @@ def return_boxes_of_images_by_order_of_reading_new(
for column in range(len(peaks_neg_tot)-1): for column in range(len(peaks_neg_tot)-1):
#print(column,'column') #print(column,'column')
ind_args_in_col=ind_args[x_starting==column] ind_args_in_col=ind_args[x_starting==column]
ind_args_in_col=np.array(ind_args_in_col)
#print(len(y_type_2)) #print(len(y_type_2))
y_column=y_type_2[ind_args_in_col] y_column=y_type_2[ind_args_in_col]
x_start_column=x_starting[ind_args_in_col] x_start_column=x_starting[ind_args_in_col]
x_end_column=x_ending[ind_args_in_col] x_end_column=x_ending[ind_args_in_col]
ind_args_col_sorted=np.argsort(y_column) ind_args_col_sorted = np.argsort(y_column)
y_col_sort=y_column[ind_args_col_sorted] y_lines_by_order.extend(y_column[ind_args_col_sorted])
x_start_column_sort=x_start_column[ind_args_col_sorted] x_start_by_order.extend(x_start_column[ind_args_col_sorted])
x_end_column_sort=x_end_column[ind_args_col_sorted] x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1)
#print('babali4')
for ii in range(len(y_col_sort)):
#print('babali5')
y_lines_by_order.append(y_col_sort[ii])
x_start_by_order.append(x_start_column_sort[ii])
x_end_by_order.append(x_end_column_sort[ii]-1)
y_lines_by_order = np.array(y_lines_by_order)
x_start_by_order = np.array(x_start_by_order)
x_end_by_order = np.array(x_end_by_order)
for il in range(len(y_lines_by_order)): for il in range(len(y_lines_by_order)):
y_copy = list(y_lines_by_order) #print(il, "il")
x_start_copy = list(x_start_by_order) y_itself = y_lines_by_order[il]
x_end_copy = list(x_end_by_order) #print(y_itself,'y_itself')
x_start_itself = x_start_by_order[il]
#print(y_copy,'y_copy') x_end_itself = x_end_by_order[il]
y_itself=y_copy.pop(il)
x_start_itself=x_start_copy.pop(il)
x_end_itself=x_end_copy.pop(il)
for column in range(x_start_itself, x_end_itself+1): for column in range(x_start_itself, x_end_itself+1):
#print(column,'cols') #print(column,'cols')
y_in_cols=[] y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) &
for yic in range(len(y_copy)): (column >= x_start_by_order) &
#print('burda') (column <= x_end_by_order)]
if (y_copy[yic]>y_itself and
column>=x_start_copy[yic] and
column<=x_end_copy[yic]):
y_in_cols.append(y_copy[yic])
#print('burda2') #print('burda2')
#print(y_in_cols,'y_in_cols') #print(y_in_cols,'y_in_cols')
if len(y_in_cols)>0: y_down = y_in_cols.min(initial=bot)
y_down=np.min(y_in_cols) #print(y_down,'y_down')
else:
y_down=splitter_y_new[i+1]
#print(y_itself,'y_itself')
boxes.append([peaks_neg_tot[column], boxes.append([peaks_neg_tot[column],
peaks_neg_tot[column+1], peaks_neg_tot[column+1],
y_itself, y_itself,
y_down]) y_down])
# dbg_plt(boxes[-1], "B column %d box" % (column + 1))
#else: #else:
#boxes.append([ 0, regions_without_separators[:,:].shape[1] ,splitter_y_new[i],splitter_y_new[i+1]]) #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,top, bot])
if right2left_readingorder: if right2left_readingorder:
peaks_neg_tot_tables_new = [] peaks_neg_tot_tables_new = []
if len(peaks_neg_tot_tables)>=1: if len(peaks_neg_tot_tables)>=1:
for peaks_tab_ind in peaks_neg_tot_tables: for peaks_tab_ind in peaks_neg_tot_tables:
peaks_neg_tot_tables_ind = regions_without_separators.shape[1] - np.array(peaks_tab_ind) peaks_neg_tot_tables_ind = width_tot - np.array(peaks_tab_ind)
peaks_neg_tot_tables_ind = list(peaks_neg_tot_tables_ind[::-1]) peaks_neg_tot_tables_ind = list(peaks_neg_tot_tables_ind[::-1])
peaks_neg_tot_tables_new.append(peaks_neg_tot_tables_ind) peaks_neg_tot_tables_new.append(peaks_neg_tot_tables_ind)
for i in range(len(boxes)): for i in range(len(boxes)):
x_start_new = regions_without_separators.shape[1] - boxes[i][1] x_start_new = width_tot - boxes[i][1]
x_end_new = regions_without_separators.shape[1] - boxes[i][0] x_end_new = width_tot - boxes[i][0]
boxes[i][0] = x_start_new boxes[i][0] = x_start_new
boxes[i][1] = x_end_new boxes[i][1] = x_end_new
peaks_neg_tot_tables = peaks_neg_tot_tables_new peaks_neg_tot_tables = peaks_neg_tot_tables_new
# show final xy-cut
# plt.imshow(regions_without_separators)
# for xmin, xmax, ymin, ymax in boxes:
# plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
# fill=False, linewidth=1, edgecolor='r'))
# plt.show()
logger.debug('exit return_boxes_of_images_by_order_of_reading_new') logger.debug('exit return_boxes_of_images_by_order_of_reading_new')
return boxes, peaks_neg_tot_tables return boxes, peaks_neg_tot_tables