|
|
|
@ -1255,166 +1255,6 @@ class eynollah:
|
|
|
|
|
#print(coords)
|
|
|
|
|
return coords
|
|
|
|
|
|
|
|
|
|
def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals):
# Build a PAGE-XML tree for the current image and write it to
# <dir_of_image>/<self.image_filename_stem>.xml.
#
# The tree comes from create_page_xml(). A Border element is added whose
# Coords come from self.calculate_page_coords(), followed by region elements:
#   TextRegion type 'paragraph'    <- contours
#   TextRegion type 'header'       <- contours_h
#   TextRegion type 'drop-capital' <- found_polygons_drop_capitals
#   TextRegion type 'marginalia'   <- found_polygons_marginals
#   ImageRegion                    <- found_polygons_text_region_img
#   TableRegion                    <- found_polygons_tables
#
# Parameters, as used in this function:
#   page_coord: forwarded to calculate_polygon_coords and
#       serialize_lines_in_region; its entries [2] and [0] are also added to
#       the x and y of marginal text-line points.
#   dir_of_image: directory the XML file is written to.
#   order_of_texts, id_of_texts: only forwarded to xml_reading_order.
#   all_found_texline_polygons(_h), all_box_coord(_h), slopes: only forwarded
#       to serialize_lines_in_region for the paragraph (header) regions.
#   all_found_texline_polygons_marginals, all_box_coord_marginals: text-line
#       points and box offsets of the marginal regions, serialized inline here.
#   slopes_marginals: not read anywhere in this function.
# There is no return statement; the result is the written file.
#
# NOTE(review): indentation is not visible in this view of the file, so the
# comments below deliberately do not state which statements are nested under
# which `if` / `try`.
|
|
|
|
|
self.logger.debug('enter write_into_page_xml_full')
|
|
|
|
|
|
|
|
|
|
# Local aliases; the region loops below iterate over these names.
found_polygons_text_region = contours
|
|
|
|
|
found_polygons_text_region_h = contours_h
|
|
|
|
|
|
|
|
|
|
# create the file structure
|
|
|
|
|
pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
|
|
|
|
|
page_print_sub = ET.SubElement(page, "Border")
|
|
|
|
|
coord_page = ET.SubElement(page_print_sub, "Coords")
|
|
|
|
|
coord_page.set('points', self.calculate_page_coords())
|
|
|
|
|
|
|
|
|
|
# Running counters: id_indexer builds region ids ('r<N>'), id_indexer_l
# builds text-line ids ('l<N>').
id_indexer = 0
|
|
|
|
|
id_indexer_l = 0
|
|
|
|
|
# Starts empty, is handed to xml_reading_order and is later indexed once per
# marginal region, so that call is presumably expected to fill it in place
# -- TODO confirm against xml_reading_order.
id_of_marginalia = []
|
|
|
|
|
|
|
|
|
|
if len(contours) > 0:
|
|
|
|
|
self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals)
|
|
|
|
|
# One TextRegion of type 'paragraph' per entry in contours.
for mm in range(len(found_polygons_text_region)):
|
|
|
|
|
textregion=ET.SubElement(page, 'TextRegion')
|
|
|
|
|
textregion.set('id', 'r%s' % id_indexer)
|
|
|
|
|
id_indexer += 1
|
|
|
|
|
textregion.set('type', 'paragraph')
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord))
|
|
|
|
|
# The line counter is passed in and the return value stored back into it;
# presumably the helper adds the TextLine children and returns the advanced
# counter -- verify against serialize_lines_in_region.
id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l)
|
|
|
|
|
# Region-level TextEquiv/Unicode placeholder holding a single space.
texteqreg = ET.SubElement(textregion, 'TextEquiv')
|
|
|
|
|
unireg = ET.SubElement(texteqreg, 'Unicode')
|
|
|
|
|
unireg.text = ' '
|
|
|
|
|
|
|
|
|
|
self.logger.debug('len(contours_h) %s', len(contours_h))
|
|
|
|
|
# One TextRegion of type 'header' per entry in contours_h, built like the
# paragraph regions but from the *_h inputs; region ids keep counting up.
if len(contours_h) > 0:
|
|
|
|
|
for mm in range(len(found_polygons_text_region_h)):
|
|
|
|
|
textregion=ET.SubElement(page, 'TextRegion')
|
|
|
|
|
textregion.set('id', 'r%s' % id_indexer)
|
|
|
|
|
id_indexer += 1
|
|
|
|
|
textregion.set('type','header')
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord))
|
|
|
|
|
# NOTE(review): the same `slopes` argument as for paragraphs is passed here;
# this function has no header-specific slopes input -- confirm intended.
id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l)
|
|
|
|
|
texteqreg = ET.SubElement(textregion, 'TextEquiv')
|
|
|
|
|
unireg = ET.SubElement(texteqreg, 'Unicode')
|
|
|
|
|
unireg.text = ' '
|
|
|
|
|
|
|
|
|
|
if len(found_polygons_drop_capitals) > 0:
|
|
|
|
|
# The region counter is recomputed from the input lengths instead of being
# continued; presumably this skips the ids used by the marginal regions
# -- TODO confirm.
id_indexer = len(contours_h) + len(contours) + len(found_polygons_marginals)
|
|
|
|
|
# One TextRegion of type 'drop-capital' per polygon; no TextLine children
# are added for these, only the placeholder TextEquiv.
for mm in range(len(found_polygons_drop_capitals)):
|
|
|
|
|
textregion=ET.SubElement(page, 'TextRegion')
|
|
|
|
|
# NOTE(review): the id literal here is ' r%s' with a leading space, unlike
# the 'r%s' used for every other region in this function -- looks unintended.
textregion.set('id',' r%s' % id_indexer)
|
|
|
|
|
id_indexer += 1
|
|
|
|
|
textregion.set('type','drop-capital')
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord))
|
|
|
|
|
texteqreg = ET.SubElement(textregion, 'TextEquiv')
|
|
|
|
|
unireg=ET.SubElement(texteqreg, 'Unicode')
|
|
|
|
|
unireg.text = ' '
|
|
|
|
|
# Marginalia section, guarded by a bare try/except (see the note at the
# matching `except` further down).
try:
|
|
|
|
|
# NOTE(review): this self-assignment can only fail if id_indexer_l is
# unbound; it is assigned 0 near the top of this function, so the fallback
# below looks unreachable.
try:
|
|
|
|
|
id_indexer_l=id_indexer_l
|
|
|
|
|
except:
|
|
|
|
|
id_indexer_l=0
|
|
|
|
|
# One TextRegion of type 'marginalia' per marginal polygon.
for mm in range(len(found_polygons_marginals)):
|
|
|
|
|
textregion=ET.SubElement(page, 'TextRegion')
|
|
|
|
|
|
|
|
|
|
# The id comes from id_of_marginalia, not from the running region counter.
textregion.set('id', id_of_marginalia[mm])
|
|
|
|
|
|
|
|
|
|
textregion.set('type','marginalia')
|
|
|
|
|
#if mm==0:
|
|
|
|
|
# textregion.set('type','header')
|
|
|
|
|
#else:
|
|
|
|
|
# textregion.set('type','paragraph')
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord))
|
|
|
|
|
|
|
|
|
|
# One TextLine per entry in all_found_texline_polygons_marginals[mm]; line
# ids continue the shared line counter.
for j in range(len(all_found_texline_polygons_marginals[mm])):
|
|
|
|
|
textline=ET.SubElement(textregion, 'TextLine')
|
|
|
|
|
textline.set('id','l'+str(id_indexer_l))
|
|
|
|
|
id_indexer_l+=1
|
|
|
|
|
coord = ET.SubElement(textline, 'Coords')
|
|
|
|
|
texteq=ET.SubElement(textline, 'TextEquiv')
|
|
|
|
|
uni=ET.SubElement(texteq, 'Unicode')
|
|
|
|
|
uni.text = ' '
|
|
|
|
|
# The 'points' attribute is accumulated as space-separated 'x,y' pairs.
points_co=''
|
|
|
|
|
# Four expression variants follow, selected by self.curved_line and by the
# point layout:
#  - when self.curved_line is false, the region box offsets
#    (all_box_coord_marginals[mm][2] for x, [mm][0] for y) are added on top
#    of page_coord[2] / page_coord[0]; otherwise only the page_coord terms
#    are added;
#  - a point of length 2 is read as [x, y] directly, any other point is read
#    from its first element ([0][0], [0][1]).
# Each sum is divided by self.scale_x / self.scale_y and truncated by int().
for l in range(len(all_found_texline_polygons_marginals[mm][j])):
|
|
|
|
|
if not self.curved_line:
|
|
|
|
|
if len(all_found_texline_polygons_marginals[mm][j][l])==2:
|
|
|
|
|
points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0]
|
|
|
|
|
+all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x) )
|
|
|
|
|
points_co=points_co+','
|
|
|
|
|
points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1]
|
|
|
|
|
+all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) )
|
|
|
|
|
else:
|
|
|
|
|
points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0]
|
|
|
|
|
+all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x ) )
|
|
|
|
|
points_co=points_co+','
|
|
|
|
|
points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1]
|
|
|
|
|
+all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) )
|
|
|
|
|
else:
|
|
|
|
|
if len(all_found_texline_polygons_marginals[mm][j][l])==2:
|
|
|
|
|
points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0]
|
|
|
|
|
+page_coord[2])/self.scale_x) )
|
|
|
|
|
points_co=points_co+','
|
|
|
|
|
points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1]
|
|
|
|
|
+page_coord[0])/self.scale_y) )
|
|
|
|
|
else:
|
|
|
|
|
points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0]
|
|
|
|
|
+page_coord[2])/self.scale_x ) )
|
|
|
|
|
points_co=points_co+','
|
|
|
|
|
points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1]
|
|
|
|
|
+page_coord[0])/self.scale_y) )
|
|
|
|
|
|
|
|
|
|
# A space separates points; none is appended after the last point.
if l<(len(all_found_texline_polygons_marginals[mm][j])-1):
|
|
|
|
|
points_co=points_co+' '
|
|
|
|
|
#print(points_co)
|
|
|
|
|
coord.set('points',points_co)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Region-level TextEquiv/Unicode placeholder for the marginal region.
texteqreg=ET.SubElement(textregion, 'TextEquiv')
|
|
|
|
|
|
|
|
|
|
unireg=ET.SubElement(texteqreg, 'Unicode')
|
|
|
|
|
unireg.text = ' '
|
|
|
|
|
# NOTE(review): bare except -- every error raised in the guarded statements
# (for example an IndexError from id_of_marginalia[mm]) is discarded without
# logging, and elements already attached to `page` stay in the tree.
except:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
# Image regions, again guarded by a bare try/except.
try:
|
|
|
|
|
# The region counter is recomputed once more, now also counting the drop
# capitals.
id_indexer=len(contours_h)+len(contours)+len(found_polygons_marginals)+len(found_polygons_drop_capitals)
|
|
|
|
|
# One ImageRegion per polygon, carrying only an id and Coords.
for mm in range(len(found_polygons_text_region_img)):
|
|
|
|
|
textregion=ET.SubElement(page, 'ImageRegion')
|
|
|
|
|
|
|
|
|
|
textregion.set('id','r'+str(id_indexer))
|
|
|
|
|
id_indexer+=1
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord))
|
|
|
|
|
# NOTE(review): bare except, errors are silently discarded.
except:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Table regions; id_indexer is not recomputed here, so ids continue from
# whatever value it currently holds.
try:
|
|
|
|
|
# One TableRegion per polygon, carrying only an id and Coords.
for mm in range(len(found_polygons_tables)):
|
|
|
|
|
textregion=ET.SubElement(page, 'TableRegion')
|
|
|
|
|
|
|
|
|
|
textregion.set('id','r'+str(id_indexer))
|
|
|
|
|
id_indexer+=1
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord))
|
|
|
|
|
# NOTE(review): bare except, errors are silently discarded.
except:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
##print(dir_of_image)
|
|
|
|
|
##print(self.f_name)
|
|
|
|
|
##print(os.path.join(dir_of_image, self.f_name) + ".xml")
|
|
|
|
|
##tree = ET.ElementTree(pcgts)
|
|
|
|
|
##tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml")
|
|
|
|
|
|
|
|
|
|
self.logger.info("filename stem: '%s'", self.image_filename_stem)
|
|
|
|
|
# print(os.path.join(dir_of_image, self.image_filename_stem) + ".xml")
|
|
|
|
|
# Serialize the whole tree; the file name is the image filename stem plus
# '.xml' inside dir_of_image.
tree = ET.ElementTree(pcgts)
|
|
|
|
|
tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_page_coords(self):
|
|
|
|
|
self.logger.debug('enter calculate_page_coords')
|
|
|
|
|
points_page_print = ""
|
|
|
|
@ -1457,7 +1297,7 @@ class eynollah:
|
|
|
|
|
|
|
|
|
|
def write_into_page_xml(self, contours, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals):
|
|
|
|
|
self.logger.debug('enter write_into_page_xml')
|
|
|
|
|
id_of_marginalia
|
|
|
|
|
id_of_marginalia = []
|
|
|
|
|
|
|
|
|
|
found_polygons_text_region = contours
|
|
|
|
|
|
|
|
|
@ -1599,6 +1439,166 @@ class eynollah:
|
|
|
|
|
tree = ET.ElementTree(pcgts)
|
|
|
|
|
tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml")
|
|
|
|
|
|
|
|
|
|
def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals):
# Build a PAGE-XML tree for the current image and write it to
# <dir_of_image>/<self.image_filename_stem>.xml.
#
# The tree comes from create_page_xml(). A Border element is added whose
# Coords come from self.calculate_page_coords(), followed by region elements:
#   TextRegion type 'paragraph'    <- contours
#   TextRegion type 'header'       <- contours_h
#   TextRegion type 'drop-capital' <- found_polygons_drop_capitals
#   TextRegion type 'marginalia'   <- found_polygons_marginals
#   ImageRegion                    <- found_polygons_text_region_img
#   TableRegion                    <- found_polygons_tables
#
# Parameters, as used in this function:
#   page_coord: forwarded to calculate_polygon_coords and
#       serialize_lines_in_region; its entries [2] and [0] are also added to
#       the x and y of marginal text-line points.
#   dir_of_image: directory the XML file is written to.
#   order_of_texts, id_of_texts: only forwarded to xml_reading_order.
#   all_found_texline_polygons(_h), all_box_coord(_h), slopes: only forwarded
#       to serialize_lines_in_region for the paragraph (header) regions.
#   all_found_texline_polygons_marginals, all_box_coord_marginals: text-line
#       points and box offsets of the marginal regions, serialized inline here.
#   slopes_marginals: not read anywhere in this function.
# There is no return statement; the result is the written file.
#
# NOTE(review): indentation is not visible in this view of the file, so the
# comments below deliberately do not state which statements are nested under
# which `if` / `try`.
|
|
|
|
|
self.logger.debug('enter write_into_page_xml_full')
|
|
|
|
|
|
|
|
|
|
# Local aliases; the region loops below iterate over these names.
found_polygons_text_region = contours
|
|
|
|
|
found_polygons_text_region_h = contours_h
|
|
|
|
|
|
|
|
|
|
# create the file structure
|
|
|
|
|
pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
|
|
|
|
|
page_print_sub = ET.SubElement(page, "Border")
|
|
|
|
|
coord_page = ET.SubElement(page_print_sub, "Coords")
|
|
|
|
|
coord_page.set('points', self.calculate_page_coords())
|
|
|
|
|
|
|
|
|
|
# Running counters: id_indexer builds region ids ('r<N>'), id_indexer_l
# builds text-line ids ('l<N>').
id_indexer = 0
|
|
|
|
|
id_indexer_l = 0
|
|
|
|
|
# Starts empty, is handed to xml_reading_order and is later indexed once per
# marginal region, so that call is presumably expected to fill it in place
# -- TODO confirm against xml_reading_order.
id_of_marginalia = []
|
|
|
|
|
|
|
|
|
|
if len(contours) > 0:
|
|
|
|
|
self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals)
|
|
|
|
|
# One TextRegion of type 'paragraph' per entry in contours.
for mm in range(len(found_polygons_text_region)):
|
|
|
|
|
textregion=ET.SubElement(page, 'TextRegion')
|
|
|
|
|
textregion.set('id', 'r%s' % id_indexer)
|
|
|
|
|
id_indexer += 1
|
|
|
|
|
textregion.set('type', 'paragraph')
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord))
|
|
|
|
|
# The line counter is passed in and the return value stored back into it;
# presumably the helper adds the TextLine children and returns the advanced
# counter -- verify against serialize_lines_in_region.
id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l)
|
|
|
|
|
# Region-level TextEquiv/Unicode placeholder holding a single space.
texteqreg = ET.SubElement(textregion, 'TextEquiv')
|
|
|
|
|
unireg = ET.SubElement(texteqreg, 'Unicode')
|
|
|
|
|
unireg.text = ' '
|
|
|
|
|
|
|
|
|
|
self.logger.debug('len(contours_h) %s', len(contours_h))
|
|
|
|
|
# One TextRegion of type 'header' per entry in contours_h, built like the
# paragraph regions but from the *_h inputs; region ids keep counting up.
if len(contours_h) > 0:
|
|
|
|
|
for mm in range(len(found_polygons_text_region_h)):
|
|
|
|
|
textregion=ET.SubElement(page, 'TextRegion')
|
|
|
|
|
textregion.set('id', 'r%s' % id_indexer)
|
|
|
|
|
id_indexer += 1
|
|
|
|
|
textregion.set('type','header')
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord))
|
|
|
|
|
# NOTE(review): the same `slopes` argument as for paragraphs is passed here;
# this function has no header-specific slopes input -- confirm intended.
id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l)
|
|
|
|
|
texteqreg = ET.SubElement(textregion, 'TextEquiv')
|
|
|
|
|
unireg = ET.SubElement(texteqreg, 'Unicode')
|
|
|
|
|
unireg.text = ' '
|
|
|
|
|
|
|
|
|
|
if len(found_polygons_drop_capitals) > 0:
|
|
|
|
|
# The region counter is recomputed from the input lengths instead of being
# continued; presumably this skips the ids used by the marginal regions
# -- TODO confirm.
id_indexer = len(contours_h) + len(contours) + len(found_polygons_marginals)
|
|
|
|
|
# One TextRegion of type 'drop-capital' per polygon; no TextLine children
# are added for these, only the placeholder TextEquiv.
for mm in range(len(found_polygons_drop_capitals)):
|
|
|
|
|
textregion=ET.SubElement(page, 'TextRegion')
|
|
|
|
|
# NOTE(review): the id literal here is ' r%s' with a leading space, unlike
# the 'r%s' used for every other region in this function -- looks unintended.
textregion.set('id',' r%s' % id_indexer)
|
|
|
|
|
id_indexer += 1
|
|
|
|
|
textregion.set('type','drop-capital')
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord))
|
|
|
|
|
texteqreg = ET.SubElement(textregion, 'TextEquiv')
|
|
|
|
|
unireg=ET.SubElement(texteqreg, 'Unicode')
|
|
|
|
|
unireg.text = ' '
|
|
|
|
|
# Marginalia section, guarded by a bare try/except (see the note at the
# matching `except` further down).
try:
|
|
|
|
|
# NOTE(review): this self-assignment can only fail if id_indexer_l is
# unbound; it is assigned 0 near the top of this function, so the fallback
# below looks unreachable.
try:
|
|
|
|
|
id_indexer_l=id_indexer_l
|
|
|
|
|
except:
|
|
|
|
|
id_indexer_l=0
|
|
|
|
|
# One TextRegion of type 'marginalia' per marginal polygon.
for mm in range(len(found_polygons_marginals)):
|
|
|
|
|
textregion=ET.SubElement(page, 'TextRegion')
|
|
|
|
|
|
|
|
|
|
# The id comes from id_of_marginalia, not from the running region counter.
textregion.set('id', id_of_marginalia[mm])
|
|
|
|
|
|
|
|
|
|
textregion.set('type','marginalia')
|
|
|
|
|
#if mm==0:
|
|
|
|
|
# textregion.set('type','header')
|
|
|
|
|
#else:
|
|
|
|
|
# textregion.set('type','paragraph')
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord))
|
|
|
|
|
|
|
|
|
|
# One TextLine per entry in all_found_texline_polygons_marginals[mm]; line
# ids continue the shared line counter.
for j in range(len(all_found_texline_polygons_marginals[mm])):
|
|
|
|
|
textline=ET.SubElement(textregion, 'TextLine')
|
|
|
|
|
textline.set('id','l'+str(id_indexer_l))
|
|
|
|
|
id_indexer_l+=1
|
|
|
|
|
coord = ET.SubElement(textline, 'Coords')
|
|
|
|
|
texteq=ET.SubElement(textline, 'TextEquiv')
|
|
|
|
|
uni=ET.SubElement(texteq, 'Unicode')
|
|
|
|
|
uni.text = ' '
|
|
|
|
|
# The 'points' attribute is accumulated as space-separated 'x,y' pairs.
points_co=''
|
|
|
|
|
# Four expression variants follow, selected by self.curved_line and by the
# point layout:
#  - when self.curved_line is false, the region box offsets
#    (all_box_coord_marginals[mm][2] for x, [mm][0] for y) are added on top
#    of page_coord[2] / page_coord[0]; otherwise only the page_coord terms
#    are added;
#  - a point of length 2 is read as [x, y] directly, any other point is read
#    from its first element ([0][0], [0][1]).
# Each sum is divided by self.scale_x / self.scale_y and truncated by int().
for l in range(len(all_found_texline_polygons_marginals[mm][j])):
|
|
|
|
|
if not self.curved_line:
|
|
|
|
|
if len(all_found_texline_polygons_marginals[mm][j][l])==2:
|
|
|
|
|
points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0]
|
|
|
|
|
+all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x) )
|
|
|
|
|
points_co=points_co+','
|
|
|
|
|
points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1]
|
|
|
|
|
+all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) )
|
|
|
|
|
else:
|
|
|
|
|
points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0]
|
|
|
|
|
+all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x ) )
|
|
|
|
|
points_co=points_co+','
|
|
|
|
|
points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1]
|
|
|
|
|
+all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) )
|
|
|
|
|
else:
|
|
|
|
|
if len(all_found_texline_polygons_marginals[mm][j][l])==2:
|
|
|
|
|
points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0]
|
|
|
|
|
+page_coord[2])/self.scale_x) )
|
|
|
|
|
points_co=points_co+','
|
|
|
|
|
points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1]
|
|
|
|
|
+page_coord[0])/self.scale_y) )
|
|
|
|
|
else:
|
|
|
|
|
points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0]
|
|
|
|
|
+page_coord[2])/self.scale_x ) )
|
|
|
|
|
points_co=points_co+','
|
|
|
|
|
points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1]
|
|
|
|
|
+page_coord[0])/self.scale_y) )
|
|
|
|
|
|
|
|
|
|
# A space separates points; none is appended after the last point.
if l<(len(all_found_texline_polygons_marginals[mm][j])-1):
|
|
|
|
|
points_co=points_co+' '
|
|
|
|
|
#print(points_co)
|
|
|
|
|
coord.set('points',points_co)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Region-level TextEquiv/Unicode placeholder for the marginal region.
texteqreg=ET.SubElement(textregion, 'TextEquiv')
|
|
|
|
|
|
|
|
|
|
unireg=ET.SubElement(texteqreg, 'Unicode')
|
|
|
|
|
unireg.text = ' '
|
|
|
|
|
# NOTE(review): bare except -- every error raised in the guarded statements
# (for example an IndexError from id_of_marginalia[mm]) is discarded without
# logging, and elements already attached to `page` stay in the tree.
except:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
# Image regions, again guarded by a bare try/except.
try:
|
|
|
|
|
# The region counter is recomputed once more, now also counting the drop
# capitals.
id_indexer=len(contours_h)+len(contours)+len(found_polygons_marginals)+len(found_polygons_drop_capitals)
|
|
|
|
|
# One ImageRegion per polygon, carrying only an id and Coords.
for mm in range(len(found_polygons_text_region_img)):
|
|
|
|
|
textregion=ET.SubElement(page, 'ImageRegion')
|
|
|
|
|
|
|
|
|
|
textregion.set('id','r'+str(id_indexer))
|
|
|
|
|
id_indexer+=1
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord))
|
|
|
|
|
# NOTE(review): bare except, errors are silently discarded.
except:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Table regions; id_indexer is not recomputed here, so ids continue from
# whatever value it currently holds.
try:
|
|
|
|
|
# One TableRegion per polygon, carrying only an id and Coords.
for mm in range(len(found_polygons_tables)):
|
|
|
|
|
textregion=ET.SubElement(page, 'TableRegion')
|
|
|
|
|
|
|
|
|
|
textregion.set('id','r'+str(id_indexer))
|
|
|
|
|
id_indexer+=1
|
|
|
|
|
coord_text = ET.SubElement(textregion, 'Coords')
|
|
|
|
|
coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord))
|
|
|
|
|
# NOTE(review): bare except, errors are silently discarded.
except:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
##print(dir_of_image)
|
|
|
|
|
##print(self.f_name)
|
|
|
|
|
##print(os.path.join(dir_of_image, self.f_name) + ".xml")
|
|
|
|
|
##tree = ET.ElementTree(pcgts)
|
|
|
|
|
##tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml")
|
|
|
|
|
|
|
|
|
|
self.logger.info("filename stem: '%s'", self.image_filename_stem)
|
|
|
|
|
# print(os.path.join(dir_of_image, self.image_filename_stem) + ".xml")
|
|
|
|
|
# Serialize the whole tree; the file name is the image filename stem plus
# '.xml' inside dir_of_image.
tree = ET.ElementTree(pcgts)
|
|
|
|
|
tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_regions_from_xy_2models(self,img,is_image_enhanced):
|
|
|
|
|
self.logger.debug("enter get_regions_from_xy_2models")
|
|
|
|
|
img_org = np.copy(img)
|
|
|
|
|