mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-06-08 19:59:56 +02:00
Write drop capitals to the XML output; this may also resolve issue #110
This commit is contained in:
parent
93005959e5
commit
0f87974b0c
2 changed files with 41 additions and 13 deletions
|
@ -3735,9 +3735,9 @@ class Eynollah:
|
|||
contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) if areas_cnt_text[jz] > MIN_AREA_REGION]
|
||||
areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION]
|
||||
index_con_parents = np.argsort(areas_cnt_text_parent)
|
||||
if len(contours_only_text_parent)>1:
|
||||
try:
|
||||
contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=object)[index_con_parents])
|
||||
else:
|
||||
except:
|
||||
contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=np.int32)[index_con_parents])
|
||||
areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents])
|
||||
|
||||
|
@ -3753,10 +3753,11 @@ class Eynollah:
|
|||
if len(areas_cnt_text_d)>0:
|
||||
contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)]
|
||||
index_con_parents_d = np.argsort(areas_cnt_text_d)
|
||||
if len(contours_only_text_parent_d)>1:
|
||||
try:
|
||||
contours_only_text_parent_d = list(np.array(contours_only_text_parent_d,dtype=object)[index_con_parents_d])
|
||||
else:
|
||||
except:
|
||||
contours_only_text_parent_d = list(np.array(contours_only_text_parent_d,dtype=np.int32)[index_con_parents_d])
|
||||
|
||||
areas_cnt_text_d = list(np.array(areas_cnt_text_d)[index_con_parents_d])
|
||||
|
||||
cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest_d])
|
||||
|
@ -3819,9 +3820,9 @@ class Eynollah:
|
|||
areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION]
|
||||
|
||||
index_con_parents = np.argsort(areas_cnt_text_parent)
|
||||
if len(contours_only_text_parent)>1:
|
||||
try:
|
||||
contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=object)[index_con_parents])
|
||||
else:
|
||||
except:
|
||||
contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=np.int32)[index_con_parents])
|
||||
areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents])
|
||||
|
||||
|
@ -3864,10 +3865,10 @@ class Eynollah:
|
|||
#print("text region early 6 in %.1fs", time.time() - t0)
|
||||
if self.full_layout:
|
||||
if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
|
||||
if len(contours_only_text_parent_d_ordered)>1:
|
||||
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con])
|
||||
else:
|
||||
try:
|
||||
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con])
|
||||
except:
|
||||
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con])
|
||||
if self.light_version:
|
||||
text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered)
|
||||
else:
|
||||
|
@ -3957,9 +3958,9 @@ class Eynollah:
|
|||
if np.abs(slope_deskew) < SLOPE_THRESHOLD:
|
||||
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot)
|
||||
else:
|
||||
if len(contours_only_text_parent_d_ordered)>1:
|
||||
try:
|
||||
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con])
|
||||
else:
|
||||
except:
|
||||
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con])
|
||||
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d)
|
||||
|
||||
|
|
|
@ -136,6 +136,29 @@ class EynollahXmlWriter():
|
|||
points_co += str(int((contour_textline[0][1] + region_bboxes[0]+page_coord[0])/self.scale_y))
|
||||
points_co += ' '
|
||||
coords.set_points(points_co[:-1])
|
||||
|
||||
def serialize_lines_in_dropcapital(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter, ocr_all_textlines_textregion):
    """Attach one text line to a drop-capital region and fill in its polygon.

    A new ``TextLineType`` with fresh ``CoordsType`` is created, added to
    ``text_region``, and its ``points`` string is built from the first
    polygon in ``all_found_textline_polygons``.

    Args:
        text_region: region object the new text line is added to (via
            ``add_TextLine``).
        all_found_textline_polygons: sequence whose first element is the
            polygon to serialize. Each polygon point is either a flat
            ``(x, y)`` pair or a nested ``[[x, y]]`` entry (presumably
            OpenCV-style contour points -- TODO confirm against callers).
        region_idx: unused here; kept for signature parity with callers.
        page_coord: indexable offsets; ``page_coord[2]`` is added to x and
            ``page_coord[0]`` is added to y before scaling.
        all_box_coord: unused here (no per-region box offset is applied).
        slopes: unused here.
        counter: object providing ``next_line_id`` for the text line id.
        ocr_all_textlines_textregion: optional sequence of recognized
            texts; when truthy, its first element is stored as the line's
            ``TextEquiv``.
    """
    self.logger.debug('enter serialize_lines_in_dropcapital')

    coords = CoordsType()
    textline = TextLineType(id=counter.next_line_id, Coords=coords)
    if ocr_all_textlines_textregion:
        # A drop capital carries exactly one line, so only the first OCR
        # result is relevant.
        textline.set_TextEquiv([TextEquivType(Unicode=ocr_all_textlines_textregion[0])])
    text_region.add_TextLine(textline)

    x_offset = page_coord[2]
    y_offset = page_coord[0]

    point_strings = []
    for contour_point in all_found_textline_polygons[0]:
        if len(contour_point) == 2:
            # Flat point: (x, y)
            x_raw, y_raw = contour_point[0], contour_point[1]
        else:
            # Nested point: [[x, y]]
            x_raw, y_raw = contour_point[0][0], contour_point[0][1]
        x_out = int((x_raw + x_offset) / self.scale_x)
        y_out = int((y_raw + y_offset) / self.scale_y)
        point_strings.append(str(x_out) + ',' + str(y_out))

    # Space-separated "x,y" pairs; an empty polygon yields an empty string.
    coords.set_points(' '.join(point_strings))
|
||||
|
||||
def write_pagexml(self, pcgts):
|
||||
out_fname = os.path.join(self.dir_out, self.image_filename_stem) + ".xml"
|
||||
|
@ -251,8 +274,12 @@ class EynollahXmlWriter():
|
|||
self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter)
|
||||
|
||||
for mm in range(len(found_polygons_drop_capitals)):
|
||||
page.add_TextRegion(TextRegionType(id=counter.next_region_id, type_='drop-capital',
|
||||
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))))
|
||||
dropcapital = TextRegionType(id=counter.next_region_id, type_='drop-capital',
|
||||
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)))
|
||||
page.add_TextRegion(dropcapital)
|
||||
all_box_coord_drop = None
|
||||
slopes_drop = None
|
||||
self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=None)
|
||||
|
||||
for mm in range(len(found_polygons_text_region_img)):
|
||||
page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord))))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue