From 23e97a5e0bff7235d2121fb242663593c6eed5ad Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 15:45:38 +0100 Subject: [PATCH] factor out add_textequiv --- sbb_newspapers_org_image/eynollah.py | 43 +++++++-------------------- sbb_newspapers_org_image/utils/xml.py | 4 +++ 2 files changed, 15 insertions(+), 32 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 66405cb..e402e86 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -107,7 +107,7 @@ from .utils import ( return_boxes_of_images_by_order_of_reading_new, ) -from .utils.xml import create_page_xml +from .utils.xml import create_page_xml, add_textequiv from .utils.pil_cv2 import check_dpi from .plot import EynollahPlotter @@ -1164,11 +1164,7 @@ class eynollah: textline.set('id','l'+str(id_indexer_l)) id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') - texteq = ET.SubElement(textline, 'TextEquiv') - uni = ET.SubElement(texteq, 'Unicode') - uni.text = ' ' - - #points = ET.SubElement(coord, 'Points') + add_textequiv(textline) points_co='' for l in range(len(all_found_texline_polygons[region_idx][j])): @@ -1303,7 +1299,6 @@ class eynollah: coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords()) - id_of_marginalia = [] id_indexer = 0 id_indexer_l = 0 @@ -1322,9 +1317,7 @@ class eynollah: textline.set('id', 'l' + str(id_indexer_l)) id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') - texteq=ET.SubElement(textline, 'TextEquiv') - uni=ET.SubElement(texteq, 'Unicode') - uni.text = ' ' + add_textequiv(textline) points_co='' for l in range(len(all_found_texline_polygons[mm][j])): #point = ET.SubElement(coord, 'Point') @@ -1360,9 +1353,7 @@ class eynollah: points_co += ' ' coord.set('points', points_co) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + add_textequiv(textregion) try: #id_indexer_l=0 try: @@ -1381,10 +1372,8 @@ class eynollah: textline.set('id','l'+str(id_indexer_l)) id_indexer_l+=1 coord = ET.SubElement(textline, 'Coords') - texteq = ET.SubElement(textline, 'TextEquiv') - uni = ET.SubElement(texteq, 'Unicode') - uni.text = ' ' - points_co='' + add_textequiv(textline) + points_co = '' for l in range(len(all_found_texline_polygons_marginals[mm][j])): if not curved_line: if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: @@ -1453,9 +1442,7 @@ class eynollah: coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + add_textequiv(textregion) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) if len(found_polygons_text_region_h) > 0: @@ -1467,9 +1454,7 @@ class eynollah: coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + add_textequiv(textregion) if len(found_polygons_drop_capitals) > 0: id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) @@ -1480,9 +1465,7 @@ class eynollah: textregion.set('type','drop-capital') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg=ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + add_textequiv(textregion) try: for mm in range(len(found_polygons_marginals)): textregion = ET.SubElement(page, 'TextRegion') @@ -1496,9 +1479,7 @@ class eynollah: textline.set('id', 'l%s' % id_indexer_l) id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') - texteq = ET.SubElement(textline, 'TextEquiv') - uni = ET.SubElement(texteq, 'Unicode') - uni.text = ' ' + add_textequiv(textline) points_co='' for l in range(len(all_found_texline_polygons_marginals[mm][j])): if not self.curved_line: @@ -1523,9 +1504,7 @@ class eynollah: if l<(len(all_found_texline_polygons_marginals[mm][j])-1): points_co=points_co+' ' coord.set('points',points_co) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + add_textequiv(textregion) except: pass diff --git a/sbb_newspapers_org_image/utils/xml.py b/sbb_newspapers_org_image/utils/xml.py index 0eb10ec..072bca5 100644 --- a/sbb_newspapers_org_image/utils/xml.py +++ b/sbb_newspapers_org_image/utils/xml.py @@ -32,3 +32,7 @@ def create_page_xml(imageFilename, height, width): return pcgts, page +def add_textequiv(parent, text=''): + textequiv = ET.SubElement(parent, 'TextEquiv') + unireg = ET.SubElement(textequiv, 'Unicode') + unireg.text = text