also cover drop-capital in (heuristic) reading order

This commit is contained in:
Robert Sachunsky 2026-04-17 03:41:04 +02:00
parent 92e94753c7
commit 20dc5c3188
3 changed files with 129 additions and 110 deletions

View file

@ -1219,112 +1219,104 @@ class Eynollah:
confidence_matrix)
def do_order_of_regions(
self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot):
self,
contours_only_text_parent,
contours_only_text_parent_h,
polygons_of_drop_capitals,
boxes,
textline_mask_tot
):
self.logger.debug("enter do_order_of_regions")
contours_only_text_parent = ensure_array(contours_only_text_parent)
contours_only_text_parent_h = ensure_array(contours_only_text_parent_h)
polygons_of_drop_capitals = ensure_array(polygons_of_drop_capitals)
boxes = np.array(boxes, dtype=int) # to be on the safe side
c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1),
0.5 * boxes[:, 0:2].sum(axis=1)))
cx_main, cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours(
contours_only_text_parent)
cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours(
contours_only_text_parent_h)
cx_main = np.array(cx_main, dtype=int)
cy_main = np.array(cy_main, dtype=int)
cx_head = np.array(cx_head, dtype=int)
cy_head = np.array(cy_head, dtype=int)
def match_boxes(only_centers: bool):
arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int)
for ii in range(len(contours_only_text_parent)):
def match_boxes(contours, only_centers: bool, kind: str):
cx, cy, mx, Mx, my, My, mxy = find_new_features_of_contours(contours)
cx = np.array(cx, dtype=int)
cy = np.array(cy, dtype=int)
arg_text_con = np.zeros(len(contours), dtype=int)
for ii in range(len(contours)):
box_found = False
for jj, box in enumerate(boxes):
if ((cx_main[ii] >= box[0] and
cx_main[ii] < box[1] and
cy_main[ii] >= box[2] and
cy_main[ii] < box[3]) if only_centers else
(mx_main[ii] >= box[0] and
Mx_main[ii] < box[1] and
my_main[ii] >= box[2] and
My_main[ii] < box[3])):
arg_text_con_main[ii] = jj
if ((cx[ii] >= box[0] and
cx[ii] < box[1] and
cy[ii] >= box[2] and
cy[ii] < box[3]) if only_centers else
(mx[ii] >= box[0] and
Mx[ii] < box[1] and
my[ii] >= box[2] and
My[ii] < box[3])):
arg_text_con[ii] = jj
box_found = True
# print("main/matched ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", jj, box, only_centers)
# print(kind, "/matched ", ii, "\t", (mx[ii], Mx[ii], my[ii], My[ii]), "\tin", jj, box, only_centers)
break
if not box_found:
dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0)
pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) &
(boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1]))
assert pcontained_in_box.any(), (ii, cx_main[ii], cy_main[ii])
dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy[ii]], [cx[ii]]]), axis=0)
pcontained_in_box = ((boxes[:, 2] <= cy[ii]) & (cy[ii] < boxes[:, 3]) &
(boxes[:, 0] <= cx[ii]) & (cx[ii] < boxes[:, 1]))
assert pcontained_in_box.any(), (ii, cx[ii], cy[ii])
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
arg_text_con_main[ii] = ind_min
# print("main/fallback ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", ind_min, boxes[ind_min], only_centers)
arg_text_con[ii] = ind_min
# print(kind, "/fallback ", ii, "\t", (mx[ii], Mx[ii], my[ii], My[ii]), "\tin", ind_min, boxes[ind_min], only_centers)
return arg_text_con
def order_from_boxes(only_centers: bool):
arg_text_con_main = match_boxes(contours_only_text_parent, only_centers, "main")
arg_text_con_head = match_boxes(contours_only_text_parent_h, only_centers, "head")
arg_text_con_drop = match_boxes(polygons_of_drop_capitals, only_centers, "drop")
args_contours_main = np.arange(len(contours_only_text_parent))
order_by_con_main = np.zeros_like(arg_text_con_main)
arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int)
for ii in range(len(contours_only_text_parent_h)):
box_found = False
for jj, box in enumerate(boxes):
if ((cx_head[ii] >= box[0] and
cx_head[ii] < box[1] and
cy_head[ii] >= box[2] and
cy_head[ii] < box[3]) if only_centers else
(mx_head[ii] >= box[0] and
Mx_head[ii] < box[1] and
my_head[ii] >= box[2] and
My_head[ii] < box[3])):
arg_text_con_head[ii] = jj
box_found = True
# print("head/matched ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", jj, box, only_centers)
break
if not box_found:
dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0)
pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) &
(boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1]))
assert pcontained_in_box.any(), (ii, cx_head[ii], cy_head[ii])
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
arg_text_con_head[ii] = ind_min
# print("head/fallback ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", ind_min, boxes[ind_min], only_centers)
args_contours_head = np.arange(len(contours_only_text_parent_h))
args_contours_drop = np.arange(len(polygons_of_drop_capitals))
order_by_con_main = np.zeros_like(arg_text_con_main)
order_by_con_head = np.zeros_like(arg_text_con_head)
order_by_con_drop = np.zeros_like(arg_text_con_drop)
idx = 0
for iij, box in enumerate(boxes):
ys = slice(*box[2:4])
xs = slice(*box[0:2])
args_contours_box_main = args_contours_main[arg_text_con_main == iij]
args_contours_box_head = args_contours_head[arg_text_con_head == iij]
con_inter_box = contours_only_text_parent[args_contours_box_main]
con_inter_box_h = contours_only_text_parent_h[args_contours_box_head]
args_contours_box_drop = args_contours_drop[arg_text_con_drop == iij]
_, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(
textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0])
textline_mask_tot[ys, xs],
contours_only_text_parent[args_contours_box_main],
contours_only_text_parent_h[args_contours_box_head],
polygons_of_drop_capitals[args_contours_box_drop],
box[2], box[0])
for tidx, kind in zip(index_by_kind_sorted, kind_of_texts_sorted):
if kind == 1:
# print(iij, "main", args_contours_box_main[tidx], "becomes", idx)
order_by_con_main[args_contours_box_main[tidx]] = idx
else:
elif kind == 2:
# print(iij, "head", args_contours_box_head[tidx], "becomes", idx)
order_by_con_head[args_contours_box_head[tidx]] = idx
else:
# print(iij, "drop", args_contours_box_drop[tidx], "becomes", idx)
order_by_con_drop[args_contours_box_drop[tidx]] = idx
idx += 1
# xml writer will create region ids in order of
# - contours_only_text_parent (main text), followed by
# - contours_only_text_parent (headings),
# - contours_only_text_parent_h (headings), and then
# - polygons_of_drop_capitals,
# and then create regionrefs into these ordered by order_text_new
order_text_new = np.argsort(np.concatenate((order_by_con_main,
order_by_con_head)))
order_by_con_head,
order_by_con_drop)))
return order_text_new
try:
results = match_boxes(False)
results = order_from_boxes(False)
except Exception as why:
self.logger.exception(why)
results = match_boxes(True)
results = order_from_boxes(True)
self.logger.debug("exit do_order_of_regions")
return results
@ -1809,6 +1801,7 @@ class Eynollah:
text_regions_p[drops] = label_drop_fl
regions_without_separators = (text_regions_p == label_text) * 1
regions_without_separators[drops] = 1 # also cover in reading-order
# regions_without_separators = ( text_regions_p == 1 | text_regions_p == 2 ) * 1
#self.return_regions_without_separators_new(text_regions_p, img_only_regions)
if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
@ -2399,7 +2392,7 @@ class Eynollah:
order_of_texts=order_text_new,
all_found_textline_polygons=all_found_textline_polygons,
all_box_coord=page_coord,
found_polygons_text_region_img=[],
found_polygons_images=[],
found_polygons_marginals_left=[],
found_polygons_marginals_right=[],
all_found_textline_polygons_marginals_left=[],
@ -2466,7 +2459,7 @@ class Eynollah:
order_of_texts=[],
all_found_textline_polygons=[],
all_box_coord=[],
found_polygons_text_region_img=[],
found_polygons_images=[],
found_polygons_marginals_left=[],
found_polygons_marginals_right=[],
all_found_textline_polygons_marginals_left=[],
@ -2724,7 +2717,7 @@ class Eynollah:
all_found_textline_polygons_h=[],
all_box_coord=[],
all_box_coord_h=[],
found_polygons_text_region_img=polygons_of_images,
found_polygons_images=polygons_of_images,
found_polygons_tables=contours_tables,
found_polygons_drop_capitals=[],
found_polygons_marginals_left=polygons_of_marginals,
@ -2747,7 +2740,7 @@ class Eynollah:
order_of_texts=[],
all_found_textline_polygons=[],
all_box_coord=[],
found_polygons_text_region_img=polygons_of_images,
found_polygons_images=polygons_of_images,
found_polygons_marginals_left=polygons_of_marginals,
found_polygons_marginals_right=polygons_of_marginals,
all_found_textline_polygons_marginals_left=empty_marginals,
@ -2907,14 +2900,21 @@ class Eynollah:
if self.reading_order_machine_based:
order_text_new = self.do_order_of_regions_with_model(
contours_only_text_parent, contours_only_text_parent_h, text_regions_p)
contours_only_text_parent,
contours_only_text_parent_h,
text_regions_p)
else:
if np.abs(slope_deskew) < SLOPE_THRESHOLD:
order_text_new = self.do_order_of_regions(
contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot_ea)
contours_only_text_parent,
contours_only_text_parent_h,
polygons_of_drop_capitals,
boxes, textline_mask_tot_ea)
else:
order_text_new = self.do_order_of_regions(
contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered,
contours_only_text_parent_d_ordered,
contours_only_text_parent_h_d_ordered,
polygons_of_drop_capitals,
boxes_d, textline_mask_tot_ea_d)
self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s")
@ -2930,7 +2930,7 @@ class Eynollah:
all_found_textline_polygons_h=all_found_textline_polygons_h,
all_box_coord=all_box_coord,
all_box_coord_h=all_box_coord_h,
found_polygons_text_region_img=polygons_of_images,
found_polygons_images=polygons_of_images,
found_polygons_tables=contours_tables,
found_polygons_drop_capitals=polygons_of_drop_capitals,
found_polygons_marginals_left=polygons_of_marginals_left,
@ -2955,7 +2955,7 @@ class Eynollah:
order_of_texts=order_text_new,
all_found_textline_polygons=all_found_textline_polygons,
all_box_coord=all_box_coord,
found_polygons_text_region_img=polygons_of_images,
found_polygons_images=polygons_of_images,
found_polygons_marginals_left=polygons_of_marginals_left,
found_polygons_marginals_right=polygons_of_marginals_right,
all_found_textline_polygons_marginals_left=all_found_textline_polygons_marginals_left,

View file

@ -1106,7 +1106,7 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col)
textlines_con_changed.append(textlines_big_org_form)
return textlines_con_changed
def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
def order_of_regions(textline_mask, contours_main, contours_head, contours_drop, y_ref, x_ref):
"""
Order text region contours within a single column bbox in a top-down-left-right way.
@ -1118,13 +1118,17 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
* textline_mask: the mask of the textline segmentation, cropped for that box
* contours_main: the paragraph text region contours expected to be here
* contours_head: the heading text region contours expected to be here
* contours_drop: the drop-capital region contours expected to be here
* y_ref: the vertical offset of that box within the page
* x_ref: the horizontal offset of that box within the page
Returns: a tuple of
* the array of contour indexes overall within this box (i.e. into main+head)
* the array of types (1 for paragraph, 2 for heading)
* the array of contour indexes for the respective type (i.e. into contours_main or contours_head)
* the array of contour indexes overall within this box
(i.e. into main+head+drop)
* the array of types
(1 for paragraph, 2 for heading, 3 for drop-capital)
* the array of contour indexes for the respective type
(i.e. into contours_main or contours_head or contours_drop)
"""
##plt.imshow(textline_mask)
##plt.show()
@ -1156,19 +1160,31 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
cx_main, cy_main = find_center_of_contours(contours_main)
cx_head, cy_head = find_center_of_contours(contours_head)
cx_drop, cy_drop = find_center_of_contours(contours_drop)
# assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= np.max(peaks_neg_new)
# assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new)
# assert not len(cy_drop) or np.min(peaks_neg_new) <= np.min(cy_drop) and np.max(cy_drop) <= np.max(peaks_neg_new)
matrix_of_orders = np.zeros((len(contours_main) + len(contours_head), 5), dtype=int)
matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_head))
matrix_of_orders[: len(contours_main), 1] = 1
matrix_of_orders[len(contours_main) :, 1] = 2
matrix_of_orders[: len(contours_main), 2] = cx_main
matrix_of_orders[len(contours_main) :, 2] = cx_head
matrix_of_orders[: len(contours_main), 3] = cy_main
matrix_of_orders[len(contours_main) :, 3] = cy_head
matrix_of_orders[: len(contours_main), 4] = np.arange(len(contours_main))
matrix_of_orders[len(contours_main) :, 4] = np.arange(len(contours_head))
total = len(contours_main) + len(contours_head) + len(contours_drop)
slice_main = slice(0, len(contours_main))
slice_head = slice(len(contours_main),
len(contours_main) + len(contours_head))
slice_drop = slice(len(contours_main) + len(contours_head),
total)
matrix_of_orders = np.zeros((total, 5), dtype=int)
matrix_of_orders[:, 0] = np.arange(total)
matrix_of_orders[slice_main, 1] = 1
matrix_of_orders[slice_head, 1] = 2
matrix_of_orders[slice_drop, 1] = 3
matrix_of_orders[slice_main, 2] = cx_main
matrix_of_orders[slice_head, 2] = cx_head
matrix_of_orders[slice_drop, 2] = cx_drop
matrix_of_orders[slice_main, 3] = cy_main
matrix_of_orders[slice_head, 3] = cy_head
matrix_of_orders[slice_drop, 3] = cy_drop
matrix_of_orders[slice_main, 4] = np.arange(len(contours_main))
matrix_of_orders[slice_head, 4] = np.arange(len(contours_head))
matrix_of_orders[slice_drop, 4] = np.arange(len(contours_drop))
# print(peaks_neg_new,'peaks_neg_new')
# print(matrix_of_orders,'matrix_of_orders')
@ -1189,12 +1205,12 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
# plt.gca().set_xticks(xrange, xrange + x_ref)
# plt.gca().set_yticks(yrange, yrange + y_ref)
# for idx, type_, cx, cy in zip(typed_indexes_in, types_in, cxs_in, cys_in):
# cnt = (contours_main if type_ == 1 else contours_head)[idx]
# col = 'red' if type_ == 1 else 'blue'
# cnt = {1: contours_main, 2: contours_head, 3: contours_drop}[type_][idx]
# col = {1: 'red', 2: 'blue', 3: 'green'}[type_]
# plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o')
# plt.text(cx - x_ref, cy - y_ref, str(idx), c=col)
# plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col))
# plt.title("box contours centered in %d:%d (red=main / blue=heading)" % (top, bot))
# plt.title("box contours centered in %d:%d (red=main / blue=heading / green=drop-capital)" % (top, bot))
# plt.show()
sorted_inside = np.argsort(cxs_in)
@ -1204,8 +1220,11 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
##matrix_of_orders[:len_main,4]=final_indexers_sorted[:]
assert len(set(final_indexers_sorted)) == len(contours_main) + len(contours_head)
assert set(final_index_type) == set(range(len(contours_main))).union(range(len(contours_head)))
assert len(set(final_indexers_sorted)) == total
assert set(final_index_type) == (
set(range(len(contours_main)))
.union(range(len(contours_head)))
.union(range(len(contours_drop))))
return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type)

View file

@ -81,7 +81,7 @@ class EynollahXmlWriter:
order_of_texts,
all_found_textline_polygons,
all_box_coord,
found_polygons_text_region_img,
found_polygons_images,
found_polygons_marginals_left,
found_polygons_marginals_right,
all_found_textline_polygons_marginals_left,
@ -104,7 +104,7 @@ class EynollahXmlWriter:
all_found_textline_polygons_h=[],
all_box_coord=all_box_coord,
all_box_coord_h=[],
found_polygons_text_region_img=found_polygons_text_region_img,
found_polygons_images=found_polygons_images,
found_polygons_tables=found_polygons_tables,
found_polygons_drop_capitals=[],
found_polygons_marginals_left=found_polygons_marginals_left,
@ -132,7 +132,7 @@ class EynollahXmlWriter:
all_found_textline_polygons_h,
all_box_coord,
all_box_coord_h,
found_polygons_text_region_img,
found_polygons_images,
found_polygons_tables,
found_polygons_drop_capitals,
found_polygons_marginals_left,
@ -211,6 +211,21 @@ class EynollahXmlWriter:
self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord,
all_box_coord_h, slopes_h, counter, ocr_textlines)
for mm, region_contour in enumerate(found_polygons_drop_capitals):
dropcapital = TextRegionType(
id=counter.next_region_id, type_='drop-capital',
Coords=CoordsType(points=self.calculate_points(region_contour, offset))
)
page.add_TextRegion(dropcapital)
all_box_coord_drop = [[0, 0, 0, 0]]
slopes_drop = [0]
if ocr_all_textlines_drop:
ocr_textlines = ocr_all_textlines_drop[mm]
else:
ocr_textlines = None
self.serialize_lines_in_region(dropcapital, [[found_polygons_drop_capitals[mm]]], 0, page_coord,
all_box_coord_drop, slopes_drop, counter, ocr_textlines)
for mm, region_contour in enumerate(found_polygons_marginals_left):
marginal = TextRegionType(
id=counter.next_region_id, type_='marginalia',
@ -236,22 +251,7 @@ class EynollahXmlWriter:
self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_right, mm, page_coord,
all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines)
for mm, region_contour in enumerate(found_polygons_drop_capitals):
dropcapital = TextRegionType(
id=counter.next_region_id, type_='drop-capital',
Coords=CoordsType(points=self.calculate_points(region_contour, offset))
)
page.add_TextRegion(dropcapital)
all_box_coord_drop = [[0, 0, 0, 0]]
slopes_drop = [0]
if ocr_all_textlines_drop:
ocr_textlines = ocr_all_textlines_drop[mm]
else:
ocr_textlines = None
self.serialize_lines_in_region(dropcapital, [[found_polygons_drop_capitals[mm]]], 0, page_coord,
all_box_coord_drop, slopes_drop, counter, ocr_textlines)
for region_contour in found_polygons_text_region_img:
for region_contour in found_polygons_images:
page.add_ImageRegion(
ImageRegionType(id=counter.next_region_id,
Coords=CoordsType(points=self.calculate_points(region_contour, offset))))