mirror of
https://github.com/qurator-spk/eynollah.git
synced 2026-04-30 19:22:03 +02:00
also cover drop-capital in (heuristic) reading order
This commit is contained in:
parent
92e94753c7
commit
20dc5c3188
3 changed files with 129 additions and 110 deletions
|
|
@ -1219,112 +1219,104 @@ class Eynollah:
|
|||
confidence_matrix)
|
||||
|
||||
def do_order_of_regions(
|
||||
self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot):
|
||||
self,
|
||||
contours_only_text_parent,
|
||||
contours_only_text_parent_h,
|
||||
polygons_of_drop_capitals,
|
||||
boxes,
|
||||
textline_mask_tot
|
||||
):
|
||||
|
||||
self.logger.debug("enter do_order_of_regions")
|
||||
contours_only_text_parent = ensure_array(contours_only_text_parent)
|
||||
contours_only_text_parent_h = ensure_array(contours_only_text_parent_h)
|
||||
polygons_of_drop_capitals = ensure_array(polygons_of_drop_capitals)
|
||||
boxes = np.array(boxes, dtype=int) # to be on the safe side
|
||||
c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1),
|
||||
0.5 * boxes[:, 0:2].sum(axis=1)))
|
||||
cx_main, cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours(
|
||||
contours_only_text_parent)
|
||||
cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours(
|
||||
contours_only_text_parent_h)
|
||||
cx_main = np.array(cx_main, dtype=int)
|
||||
cy_main = np.array(cy_main, dtype=int)
|
||||
cx_head = np.array(cx_head, dtype=int)
|
||||
cy_head = np.array(cy_head, dtype=int)
|
||||
|
||||
def match_boxes(only_centers: bool):
|
||||
arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int)
|
||||
for ii in range(len(contours_only_text_parent)):
|
||||
def match_boxes(contours, only_centers: bool, kind: str):
|
||||
cx, cy, mx, Mx, my, My, mxy = find_new_features_of_contours(contours)
|
||||
cx = np.array(cx, dtype=int)
|
||||
cy = np.array(cy, dtype=int)
|
||||
arg_text_con = np.zeros(len(contours), dtype=int)
|
||||
for ii in range(len(contours)):
|
||||
box_found = False
|
||||
for jj, box in enumerate(boxes):
|
||||
if ((cx_main[ii] >= box[0] and
|
||||
cx_main[ii] < box[1] and
|
||||
cy_main[ii] >= box[2] and
|
||||
cy_main[ii] < box[3]) if only_centers else
|
||||
(mx_main[ii] >= box[0] and
|
||||
Mx_main[ii] < box[1] and
|
||||
my_main[ii] >= box[2] and
|
||||
My_main[ii] < box[3])):
|
||||
arg_text_con_main[ii] = jj
|
||||
if ((cx[ii] >= box[0] and
|
||||
cx[ii] < box[1] and
|
||||
cy[ii] >= box[2] and
|
||||
cy[ii] < box[3]) if only_centers else
|
||||
(mx[ii] >= box[0] and
|
||||
Mx[ii] < box[1] and
|
||||
my[ii] >= box[2] and
|
||||
My[ii] < box[3])):
|
||||
arg_text_con[ii] = jj
|
||||
box_found = True
|
||||
# print("main/matched ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", jj, box, only_centers)
|
||||
# print(kind, "/matched ", ii, "\t", (mx[ii], Mx[ii], my[ii], My[ii]), "\tin", jj, box, only_centers)
|
||||
break
|
||||
if not box_found:
|
||||
dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0)
|
||||
pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) &
|
||||
(boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1]))
|
||||
assert pcontained_in_box.any(), (ii, cx_main[ii], cy_main[ii])
|
||||
dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy[ii]], [cx[ii]]]), axis=0)
|
||||
pcontained_in_box = ((boxes[:, 2] <= cy[ii]) & (cy[ii] < boxes[:, 3]) &
|
||||
(boxes[:, 0] <= cx[ii]) & (cx[ii] < boxes[:, 1]))
|
||||
assert pcontained_in_box.any(), (ii, cx[ii], cy[ii])
|
||||
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
|
||||
arg_text_con_main[ii] = ind_min
|
||||
# print("main/fallback ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", ind_min, boxes[ind_min], only_centers)
|
||||
arg_text_con[ii] = ind_min
|
||||
# print(kind, "/fallback ", ii, "\t", (mx[ii], Mx[ii], my[ii], My[ii]), "\tin", ind_min, boxes[ind_min], only_centers)
|
||||
return arg_text_con
|
||||
|
||||
def order_from_boxes(only_centers: bool):
|
||||
arg_text_con_main = match_boxes(contours_only_text_parent, only_centers, "main")
|
||||
arg_text_con_head = match_boxes(contours_only_text_parent_h, only_centers, "head")
|
||||
arg_text_con_drop = match_boxes(polygons_of_drop_capitals, only_centers, "drop")
|
||||
args_contours_main = np.arange(len(contours_only_text_parent))
|
||||
order_by_con_main = np.zeros_like(arg_text_con_main)
|
||||
|
||||
arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int)
|
||||
for ii in range(len(contours_only_text_parent_h)):
|
||||
box_found = False
|
||||
for jj, box in enumerate(boxes):
|
||||
if ((cx_head[ii] >= box[0] and
|
||||
cx_head[ii] < box[1] and
|
||||
cy_head[ii] >= box[2] and
|
||||
cy_head[ii] < box[3]) if only_centers else
|
||||
(mx_head[ii] >= box[0] and
|
||||
Mx_head[ii] < box[1] and
|
||||
my_head[ii] >= box[2] and
|
||||
My_head[ii] < box[3])):
|
||||
arg_text_con_head[ii] = jj
|
||||
box_found = True
|
||||
# print("head/matched ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", jj, box, only_centers)
|
||||
break
|
||||
if not box_found:
|
||||
dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0)
|
||||
pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) &
|
||||
(boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1]))
|
||||
assert pcontained_in_box.any(), (ii, cx_head[ii], cy_head[ii])
|
||||
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
|
||||
arg_text_con_head[ii] = ind_min
|
||||
# print("head/fallback ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", ind_min, boxes[ind_min], only_centers)
|
||||
args_contours_head = np.arange(len(contours_only_text_parent_h))
|
||||
args_contours_drop = np.arange(len(polygons_of_drop_capitals))
|
||||
order_by_con_main = np.zeros_like(arg_text_con_main)
|
||||
order_by_con_head = np.zeros_like(arg_text_con_head)
|
||||
|
||||
order_by_con_drop = np.zeros_like(arg_text_con_drop)
|
||||
idx = 0
|
||||
for iij, box in enumerate(boxes):
|
||||
ys = slice(*box[2:4])
|
||||
xs = slice(*box[0:2])
|
||||
args_contours_box_main = args_contours_main[arg_text_con_main == iij]
|
||||
args_contours_box_head = args_contours_head[arg_text_con_head == iij]
|
||||
con_inter_box = contours_only_text_parent[args_contours_box_main]
|
||||
con_inter_box_h = contours_only_text_parent_h[args_contours_box_head]
|
||||
args_contours_box_drop = args_contours_drop[arg_text_con_drop == iij]
|
||||
|
||||
_, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(
|
||||
textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0])
|
||||
textline_mask_tot[ys, xs],
|
||||
contours_only_text_parent[args_contours_box_main],
|
||||
contours_only_text_parent_h[args_contours_box_head],
|
||||
polygons_of_drop_capitals[args_contours_box_drop],
|
||||
box[2], box[0])
|
||||
|
||||
for tidx, kind in zip(index_by_kind_sorted, kind_of_texts_sorted):
|
||||
if kind == 1:
|
||||
# print(iij, "main", args_contours_box_main[tidx], "becomes", idx)
|
||||
order_by_con_main[args_contours_box_main[tidx]] = idx
|
||||
else:
|
||||
elif kind == 2:
|
||||
# print(iij, "head", args_contours_box_head[tidx], "becomes", idx)
|
||||
order_by_con_head[args_contours_box_head[tidx]] = idx
|
||||
else:
|
||||
# print(iij, "drop", args_contours_box_drop[tidx], "becomes", idx)
|
||||
order_by_con_drop[args_contours_box_drop[tidx]] = idx
|
||||
idx += 1
|
||||
|
||||
# xml writer will create region ids in order of
|
||||
# - contours_only_text_parent (main text), followed by
|
||||
# - contours_only_text_parent (headings),
|
||||
# - contours_only_text_parent_h (headings), and then
|
||||
# - polygons_of_drop_capitals,
|
||||
# and then create regionrefs into these ordered by order_text_new
|
||||
order_text_new = np.argsort(np.concatenate((order_by_con_main,
|
||||
order_by_con_head)))
|
||||
order_by_con_head,
|
||||
order_by_con_drop)))
|
||||
return order_text_new
|
||||
|
||||
try:
|
||||
results = match_boxes(False)
|
||||
results = order_from_boxes(False)
|
||||
except Exception as why:
|
||||
self.logger.exception(why)
|
||||
results = match_boxes(True)
|
||||
results = order_from_boxes(True)
|
||||
|
||||
self.logger.debug("exit do_order_of_regions")
|
||||
return results
|
||||
|
|
@ -1809,6 +1801,7 @@ class Eynollah:
|
|||
text_regions_p[drops] = label_drop_fl
|
||||
|
||||
regions_without_separators = (text_regions_p == label_text) * 1
|
||||
regions_without_separators[drops] = 1 # also cover in reading-order
|
||||
# regions_without_separators = ( text_regions_p == 1 | text_regions_p == 2 ) * 1
|
||||
#self.return_regions_without_separators_new(text_regions_p, img_only_regions)
|
||||
if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
|
||||
|
|
@ -2399,7 +2392,7 @@ class Eynollah:
|
|||
order_of_texts=order_text_new,
|
||||
all_found_textline_polygons=all_found_textline_polygons,
|
||||
all_box_coord=page_coord,
|
||||
found_polygons_text_region_img=[],
|
||||
found_polygons_images=[],
|
||||
found_polygons_marginals_left=[],
|
||||
found_polygons_marginals_right=[],
|
||||
all_found_textline_polygons_marginals_left=[],
|
||||
|
|
@ -2466,7 +2459,7 @@ class Eynollah:
|
|||
order_of_texts=[],
|
||||
all_found_textline_polygons=[],
|
||||
all_box_coord=[],
|
||||
found_polygons_text_region_img=[],
|
||||
found_polygons_images=[],
|
||||
found_polygons_marginals_left=[],
|
||||
found_polygons_marginals_right=[],
|
||||
all_found_textline_polygons_marginals_left=[],
|
||||
|
|
@ -2724,7 +2717,7 @@ class Eynollah:
|
|||
all_found_textline_polygons_h=[],
|
||||
all_box_coord=[],
|
||||
all_box_coord_h=[],
|
||||
found_polygons_text_region_img=polygons_of_images,
|
||||
found_polygons_images=polygons_of_images,
|
||||
found_polygons_tables=contours_tables,
|
||||
found_polygons_drop_capitals=[],
|
||||
found_polygons_marginals_left=polygons_of_marginals,
|
||||
|
|
@ -2747,7 +2740,7 @@ class Eynollah:
|
|||
order_of_texts=[],
|
||||
all_found_textline_polygons=[],
|
||||
all_box_coord=[],
|
||||
found_polygons_text_region_img=polygons_of_images,
|
||||
found_polygons_images=polygons_of_images,
|
||||
found_polygons_marginals_left=polygons_of_marginals,
|
||||
found_polygons_marginals_right=polygons_of_marginals,
|
||||
all_found_textline_polygons_marginals_left=empty_marginals,
|
||||
|
|
@ -2907,14 +2900,21 @@ class Eynollah:
|
|||
|
||||
if self.reading_order_machine_based:
|
||||
order_text_new = self.do_order_of_regions_with_model(
|
||||
contours_only_text_parent, contours_only_text_parent_h, text_regions_p)
|
||||
contours_only_text_parent,
|
||||
contours_only_text_parent_h,
|
||||
text_regions_p)
|
||||
else:
|
||||
if np.abs(slope_deskew) < SLOPE_THRESHOLD:
|
||||
order_text_new = self.do_order_of_regions(
|
||||
contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot_ea)
|
||||
contours_only_text_parent,
|
||||
contours_only_text_parent_h,
|
||||
polygons_of_drop_capitals,
|
||||
boxes, textline_mask_tot_ea)
|
||||
else:
|
||||
order_text_new = self.do_order_of_regions(
|
||||
contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered,
|
||||
contours_only_text_parent_d_ordered,
|
||||
contours_only_text_parent_h_d_ordered,
|
||||
polygons_of_drop_capitals,
|
||||
boxes_d, textline_mask_tot_ea_d)
|
||||
self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s")
|
||||
|
||||
|
|
@ -2930,7 +2930,7 @@ class Eynollah:
|
|||
all_found_textline_polygons_h=all_found_textline_polygons_h,
|
||||
all_box_coord=all_box_coord,
|
||||
all_box_coord_h=all_box_coord_h,
|
||||
found_polygons_text_region_img=polygons_of_images,
|
||||
found_polygons_images=polygons_of_images,
|
||||
found_polygons_tables=contours_tables,
|
||||
found_polygons_drop_capitals=polygons_of_drop_capitals,
|
||||
found_polygons_marginals_left=polygons_of_marginals_left,
|
||||
|
|
@ -2955,7 +2955,7 @@ class Eynollah:
|
|||
order_of_texts=order_text_new,
|
||||
all_found_textline_polygons=all_found_textline_polygons,
|
||||
all_box_coord=all_box_coord,
|
||||
found_polygons_text_region_img=polygons_of_images,
|
||||
found_polygons_images=polygons_of_images,
|
||||
found_polygons_marginals_left=polygons_of_marginals_left,
|
||||
found_polygons_marginals_right=polygons_of_marginals_right,
|
||||
all_found_textline_polygons_marginals_left=all_found_textline_polygons_marginals_left,
|
||||
|
|
|
|||
|
|
@ -1106,7 +1106,7 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col)
|
|||
textlines_con_changed.append(textlines_big_org_form)
|
||||
return textlines_con_changed
|
||||
|
||||
def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
|
||||
def order_of_regions(textline_mask, contours_main, contours_head, contours_drop, y_ref, x_ref):
|
||||
"""
|
||||
Order text region contours within a single column bbox in a top-down-left-right way.
|
||||
|
||||
|
|
@ -1118,13 +1118,17 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
|
|||
* textline_mask: the mask of the textline segmentation, cropped for that box
|
||||
* contours_main: the paragraph text region contours expected to be here
|
||||
* contours_head: the heading text region contours expected to be here
|
||||
* contours_drop: the drop-capital region contours expected to be here
|
||||
* y_ref: the vertical offset of that box within the page
|
||||
* x_ref: the horizontal offset of that box within the page
|
||||
|
||||
Returns: a tuple of
|
||||
* the array of contour indexes overall within this box (i.e. into main+head)
|
||||
* the array of types (1 for paragraph, 2 for heading)
|
||||
* the array of contour indexes for the respective type (i.e. into contours_main or contours_head)
|
||||
* the array of contour indexes overall within this box
|
||||
(i.e. into main+head+drop)
|
||||
* the array of types
|
||||
(1 for paragraph, 2 for heading, 3 for drop-capital)
|
||||
* the array of contour indexes for the respective type
|
||||
(i.e. into contours_main or contours_head or contours_drop)
|
||||
"""
|
||||
##plt.imshow(textline_mask)
|
||||
##plt.show()
|
||||
|
|
@ -1156,19 +1160,31 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
|
|||
|
||||
cx_main, cy_main = find_center_of_contours(contours_main)
|
||||
cx_head, cy_head = find_center_of_contours(contours_head)
|
||||
cx_drop, cy_drop = find_center_of_contours(contours_drop)
|
||||
# assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= np.max(peaks_neg_new)
|
||||
# assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new)
|
||||
# assert not len(cy_drop) or np.min(peaks_neg_new) <= np.min(cy_drop) and np.max(cy_drop) <= np.max(peaks_neg_new)
|
||||
|
||||
matrix_of_orders = np.zeros((len(contours_main) + len(contours_head), 5), dtype=int)
|
||||
matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_head))
|
||||
matrix_of_orders[: len(contours_main), 1] = 1
|
||||
matrix_of_orders[len(contours_main) :, 1] = 2
|
||||
matrix_of_orders[: len(contours_main), 2] = cx_main
|
||||
matrix_of_orders[len(contours_main) :, 2] = cx_head
|
||||
matrix_of_orders[: len(contours_main), 3] = cy_main
|
||||
matrix_of_orders[len(contours_main) :, 3] = cy_head
|
||||
matrix_of_orders[: len(contours_main), 4] = np.arange(len(contours_main))
|
||||
matrix_of_orders[len(contours_main) :, 4] = np.arange(len(contours_head))
|
||||
total = len(contours_main) + len(contours_head) + len(contours_drop)
|
||||
slice_main = slice(0, len(contours_main))
|
||||
slice_head = slice(len(contours_main),
|
||||
len(contours_main) + len(contours_head))
|
||||
slice_drop = slice(len(contours_main) + len(contours_head),
|
||||
total)
|
||||
matrix_of_orders = np.zeros((total, 5), dtype=int)
|
||||
matrix_of_orders[:, 0] = np.arange(total)
|
||||
matrix_of_orders[slice_main, 1] = 1
|
||||
matrix_of_orders[slice_head, 1] = 2
|
||||
matrix_of_orders[slice_drop, 1] = 3
|
||||
matrix_of_orders[slice_main, 2] = cx_main
|
||||
matrix_of_orders[slice_head, 2] = cx_head
|
||||
matrix_of_orders[slice_drop, 2] = cx_drop
|
||||
matrix_of_orders[slice_main, 3] = cy_main
|
||||
matrix_of_orders[slice_head, 3] = cy_head
|
||||
matrix_of_orders[slice_drop, 3] = cy_drop
|
||||
matrix_of_orders[slice_main, 4] = np.arange(len(contours_main))
|
||||
matrix_of_orders[slice_head, 4] = np.arange(len(contours_head))
|
||||
matrix_of_orders[slice_drop, 4] = np.arange(len(contours_drop))
|
||||
|
||||
# print(peaks_neg_new,'peaks_neg_new')
|
||||
# print(matrix_of_orders,'matrix_of_orders')
|
||||
|
|
@ -1189,12 +1205,12 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
|
|||
# plt.gca().set_xticks(xrange, xrange + x_ref)
|
||||
# plt.gca().set_yticks(yrange, yrange + y_ref)
|
||||
# for idx, type_, cx, cy in zip(typed_indexes_in, types_in, cxs_in, cys_in):
|
||||
# cnt = (contours_main if type_ == 1 else contours_head)[idx]
|
||||
# col = 'red' if type_ == 1 else 'blue'
|
||||
# cnt = {1: contours_main, 2: contours_head, 3: contours_drop}[type_][idx]
|
||||
# col = {1: 'red', 2: 'blue', 3: 'green'}[type_]
|
||||
# plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o')
|
||||
# plt.text(cx - x_ref, cy - y_ref, str(idx), c=col)
|
||||
# plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col))
|
||||
# plt.title("box contours centered in %d:%d (red=main / blue=heading)" % (top, bot))
|
||||
# plt.title("box contours centered in %d:%d (red=main / blue=heading / green=drop-capital)" % (top, bot))
|
||||
# plt.show()
|
||||
|
||||
sorted_inside = np.argsort(cxs_in)
|
||||
|
|
@ -1204,8 +1220,11 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
|
|||
|
||||
##matrix_of_orders[:len_main,4]=final_indexers_sorted[:]
|
||||
|
||||
assert len(set(final_indexers_sorted)) == len(contours_main) + len(contours_head)
|
||||
assert set(final_index_type) == set(range(len(contours_main))).union(range(len(contours_head)))
|
||||
assert len(set(final_indexers_sorted)) == total
|
||||
assert set(final_index_type) == (
|
||||
set(range(len(contours_main)))
|
||||
.union(range(len(contours_head)))
|
||||
.union(range(len(contours_drop))))
|
||||
|
||||
return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type)
|
||||
|
||||
|
|
|
|||
|
|
@ -81,7 +81,7 @@ class EynollahXmlWriter:
|
|||
order_of_texts,
|
||||
all_found_textline_polygons,
|
||||
all_box_coord,
|
||||
found_polygons_text_region_img,
|
||||
found_polygons_images,
|
||||
found_polygons_marginals_left,
|
||||
found_polygons_marginals_right,
|
||||
all_found_textline_polygons_marginals_left,
|
||||
|
|
@ -104,7 +104,7 @@ class EynollahXmlWriter:
|
|||
all_found_textline_polygons_h=[],
|
||||
all_box_coord=all_box_coord,
|
||||
all_box_coord_h=[],
|
||||
found_polygons_text_region_img=found_polygons_text_region_img,
|
||||
found_polygons_images=found_polygons_images,
|
||||
found_polygons_tables=found_polygons_tables,
|
||||
found_polygons_drop_capitals=[],
|
||||
found_polygons_marginals_left=found_polygons_marginals_left,
|
||||
|
|
@ -132,7 +132,7 @@ class EynollahXmlWriter:
|
|||
all_found_textline_polygons_h,
|
||||
all_box_coord,
|
||||
all_box_coord_h,
|
||||
found_polygons_text_region_img,
|
||||
found_polygons_images,
|
||||
found_polygons_tables,
|
||||
found_polygons_drop_capitals,
|
||||
found_polygons_marginals_left,
|
||||
|
|
@ -211,6 +211,21 @@ class EynollahXmlWriter:
|
|||
self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord,
|
||||
all_box_coord_h, slopes_h, counter, ocr_textlines)
|
||||
|
||||
for mm, region_contour in enumerate(found_polygons_drop_capitals):
|
||||
dropcapital = TextRegionType(
|
||||
id=counter.next_region_id, type_='drop-capital',
|
||||
Coords=CoordsType(points=self.calculate_points(region_contour, offset))
|
||||
)
|
||||
page.add_TextRegion(dropcapital)
|
||||
all_box_coord_drop = [[0, 0, 0, 0]]
|
||||
slopes_drop = [0]
|
||||
if ocr_all_textlines_drop:
|
||||
ocr_textlines = ocr_all_textlines_drop[mm]
|
||||
else:
|
||||
ocr_textlines = None
|
||||
self.serialize_lines_in_region(dropcapital, [[found_polygons_drop_capitals[mm]]], 0, page_coord,
|
||||
all_box_coord_drop, slopes_drop, counter, ocr_textlines)
|
||||
|
||||
for mm, region_contour in enumerate(found_polygons_marginals_left):
|
||||
marginal = TextRegionType(
|
||||
id=counter.next_region_id, type_='marginalia',
|
||||
|
|
@ -236,22 +251,7 @@ class EynollahXmlWriter:
|
|||
self.serialize_lines_in_region(marginal, all_found_textline_polygons_marginals_right, mm, page_coord,
|
||||
all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines)
|
||||
|
||||
for mm, region_contour in enumerate(found_polygons_drop_capitals):
|
||||
dropcapital = TextRegionType(
|
||||
id=counter.next_region_id, type_='drop-capital',
|
||||
Coords=CoordsType(points=self.calculate_points(region_contour, offset))
|
||||
)
|
||||
page.add_TextRegion(dropcapital)
|
||||
all_box_coord_drop = [[0, 0, 0, 0]]
|
||||
slopes_drop = [0]
|
||||
if ocr_all_textlines_drop:
|
||||
ocr_textlines = ocr_all_textlines_drop[mm]
|
||||
else:
|
||||
ocr_textlines = None
|
||||
self.serialize_lines_in_region(dropcapital, [[found_polygons_drop_capitals[mm]]], 0, page_coord,
|
||||
all_box_coord_drop, slopes_drop, counter, ocr_textlines)
|
||||
|
||||
for region_contour in found_polygons_text_region_img:
|
||||
for region_contour in found_polygons_images:
|
||||
page.add_ImageRegion(
|
||||
ImageRegionType(id=counter.next_region_id,
|
||||
Coords=CoordsType(points=self.calculate_points(region_contour, offset))))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue