def write_into_page_xml_full(
    self,
    contours,
    contours_h,
    page_coord,
    dir_of_image,
    order_of_texts,
    id_of_texts,
    all_found_texline_polygons,
    all_found_texline_polygons_h,
    all_box_coord,
    all_box_coord_h,
    found_polygons_text_region_img,
    found_polygons_tables,
    found_polygons_drop_capitals,
    found_polygons_marginals,
    all_found_texline_polygons_marginals,
    all_box_coord_marginals,
    slopes,
    slopes_marginals,
):
    """Serialize the full layout analysis result and write it as an XML file.

    Regions are appended to the page element in this order: paragraph text
    regions, header text regions, drop capitals, marginalia, image regions
    and table regions.  The tree is written to
    ``<dir_of_image>/<self.image_filename_stem>.xml``.

    Args:
        contours: polygons of the paragraph text regions.
        contours_h: polygons of the header text regions.
        page_coord: page offsets; index 0 is added to y values and index 2
            to x values before scaling.
        dir_of_image: directory the XML file is written to.
        order_of_texts, id_of_texts: forwarded to ``self.xml_reading_order``.
        all_found_texline_polygons, all_box_coord: per paragraph region text
            lines and boxes, forwarded to ``self.serialize_lines_in_region``.
        all_found_texline_polygons_h, all_box_coord_h: the same for headers.
        found_polygons_text_region_img: polygons written as ``ImageRegion``.
        found_polygons_tables: polygons written as ``TableRegion``.
        found_polygons_drop_capitals: polygons written as ``TextRegion`` of
            type ``drop-capital``.
        found_polygons_marginals: polygons written as ``TextRegion`` of type
            ``marginalia``.
        all_found_texline_polygons_marginals: per marginal region, the list
            of text line polygons.
        all_box_coord_marginals: per marginal region box; index 0 is used as
            y offset and index 2 as x offset when ``self.curved_line`` is
            false.
        slopes: forwarded to ``self.serialize_lines_in_region``.
        slopes_marginals: accepted for interface compatibility; not used.

    Marginalia, image and table serialization is best effort: a failure in
    one of these sections is logged and the remaining sections are still
    written.
    """
    self.logger.debug('enter write_into_page_xml_full')

    # create the file structure
    pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
    border = ET.SubElement(page, "Border")
    border_coords = ET.SubElement(border, "Coords")
    border_coords.set('points', self.calculate_page_coords())

    def add_region(tag, region_id, polygons, idx, region_type=None):
        # Append one region element with its id, optional type and outline.
        region = ET.SubElement(page, tag)
        region.set('id', region_id)
        if region_type is not None:
            region.set('type', region_type)
        coords = ET.SubElement(region, 'Coords')
        coords.set('points', self.calculate_polygon_coords(polygons, idx, page_coord))
        return region

    def add_blank_text(parent):
        # Append the placeholder TextEquiv/Unicode pair holding a single space.
        unicode_el = ET.SubElement(ET.SubElement(parent, 'TextEquiv'), 'Unicode')
        unicode_el.text = ' '

    def marginal_line_points(polygon, region_idx):
        # Build the "x,y x,y ..." string for one marginal text line.
        if self.curved_line:
            # Curved lines are only shifted by the page offset.
            box_x, box_y = 0, 0
        else:
            box = all_box_coord_marginals[region_idx]
            box_x, box_y = box[2], box[0]
        pairs = []
        for point in polygon:
            # Points come either as (x, y) or wrapped once more as ((x, y),).
            if len(point) == 2:
                x, y = point[0], point[1]
            else:
                x, y = point[0][0], point[0][1]
            pairs.append('%s,%s' % (
                int((x + box_x + page_coord[2]) / self.scale_x),
                int((y + box_y + page_coord[0]) / self.scale_y),
            ))
        return ' '.join(pairs)

    id_indexer = 0
    id_indexer_l = 0
    # NOTE(review): presumably filled by xml_reading_order with one id per
    # marginal region -- confirm against that method.
    id_of_marginalia = []

    if len(contours) > 0:
        self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals)
        for mm in range(len(contours)):
            region = add_region('TextRegion', 'r%s' % id_indexer, contours, mm, 'paragraph')
            id_indexer += 1
            id_indexer_l = self.serialize_lines_in_region(region, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l)
            add_blank_text(region)

    self.logger.debug('len(contours_h) %s', len(contours_h))
    if len(contours_h) > 0:
        for mm in range(len(contours_h)):
            region = add_region('TextRegion', 'r%s' % id_indexer, contours_h, mm, 'header')
            id_indexer += 1
            # NOTE(review): header lines are serialized with the same `slopes`
            # as paragraphs -- confirm this is intended.
            id_indexer_l = self.serialize_lines_in_region(region, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l)
            add_blank_text(region)

    if len(found_polygons_drop_capitals) > 0:
        # Skip past the ids reserved for paragraphs, headers and marginalia.
        id_indexer = len(contours_h) + len(contours) + len(found_polygons_marginals)
        for mm in range(len(found_polygons_drop_capitals)):
            # The id must not carry a leading space (was ' r%s').
            region = add_region('TextRegion', 'r%s' % id_indexer, found_polygons_drop_capitals, mm, 'drop-capital')
            id_indexer += 1
            add_blank_text(region)

    try:
        for mm in range(len(found_polygons_marginals)):
            if mm >= len(id_of_marginalia):
                # Without an id the region would be written id-less; stop
                # instead of emitting an invalid element.
                self.logger.warning(
                    'no id available for marginal region %s; skipping remaining marginalia', mm)
                break
            region = add_region('TextRegion', id_of_marginalia[mm], found_polygons_marginals, mm, 'marginalia')
            for polygon in all_found_texline_polygons_marginals[mm]:
                textline = ET.SubElement(region, 'TextLine')
                textline.set('id', 'l%s' % id_indexer_l)
                id_indexer_l += 1
                line_coords = ET.SubElement(textline, 'Coords')
                add_blank_text(textline)
                line_coords.set('points', marginal_line_points(polygon, mm))
            add_blank_text(region)
    except Exception:
        self.logger.exception('failed to serialize marginalia regions')

    try:
        id_indexer = len(contours_h) + len(contours) + len(found_polygons_marginals) + len(found_polygons_drop_capitals)
        for mm in range(len(found_polygons_text_region_img)):
            region_id = 'r%s' % id_indexer
            # Advance before serializing so a failure never lets a later
            # region reuse this id.
            id_indexer += 1
            add_region('ImageRegion', region_id, found_polygons_text_region_img, mm)
    except Exception:
        self.logger.exception('failed to serialize image regions')

    try:
        # Table ids continue from wherever the image regions stopped.
        for mm in range(len(found_polygons_tables)):
            region_id = 'r%s' % id_indexer
            id_indexer += 1
            add_region('TableRegion', region_id, found_polygons_tables, mm)
    except Exception:
        self.logger.exception('failed to serialize table regions')

    self.logger.info("filename stem: '%s'", self.image_filename_stem)
    tree = ET.ElementTree(pcgts)
    tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml")