mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-10-26 23:34:13 +01:00
return_boxes_of_images_by_order_of_reading_new: avoid oversplits
When the y slice (`top:bot`) is not a significant part of the page, i.e. less than 22% of the page height (the same threshold as in `find_number_of_columns_in_document`), avoid forcing `find_num_col` to reach `num_col_classifier`. This allows large headers not to be split up, and thus to be ordered better.
This commit is contained in:
parent
6fbb5f8a12
commit
66a0e55e49
1 changed file with 9 additions and 3 deletions
|
|
@ -1628,7 +1628,8 @@ def return_boxes_of_images_by_order_of_reading_new(
|
|||
boxes=[]
|
||||
peaks_neg_tot_tables = []
|
||||
splitter_y_new = np.array(splitter_y_new, dtype=int)
|
||||
width_tot = regions_without_separators.shape[1]
|
||||
height_tot, width_tot = regions_without_separators.shape
|
||||
big_part = 22 * height_tot // 100 # percent height
|
||||
for top, bot in pairwise(splitter_y_new):
|
||||
# print("%d:%d" % (top, bot), 'i')
|
||||
# dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot))
|
||||
|
|
@ -1644,12 +1645,17 @@ def return_boxes_of_images_by_order_of_reading_new(
|
|||
try:
|
||||
num_col, peaks_neg_fin = find_num_col(
|
||||
regions_without_separators[top:bot],
|
||||
num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.)
|
||||
# we do not expect to get all columns in small parts (headings etc.):
|
||||
num_col_classifier if bot - top >= big_part else 1,
|
||||
tables, multiplier=6. if erosion_hurts else 7.)
|
||||
except:
|
||||
peaks_neg_fin=[]
|
||||
num_col = 0
|
||||
try:
|
||||
if (len(peaks_neg_fin)+1)<num_col_classifier or num_col_classifier==6:
|
||||
if ((len(peaks_neg_fin) + 1 < num_col_classifier or
|
||||
num_col_classifier == 6) and
|
||||
# we do not expect to get all columns in small parts (headings etc.):
|
||||
bot - top >= big_part):
|
||||
# found too few columns here
|
||||
#print('burda')
|
||||
peaks_neg_fin_org = np.copy(peaks_neg_fin)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue