From 11d9b00510c4b866ce934bfed9853c9df912e947 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 3 Mar 2022 12:21:40 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=A7=B9=20Don't=20produce=20spurious=20Tex?= =?UTF-8?q?tEquiv=20elements.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit eynollah produces spurious - and empty - pcGts TextEquiv elements. This is a. unnecessary, b. wrong and c. produces a lot of warning messages in subsequent OCR processing steps because the OCR processor warns about already existing text. Fix this by not generating any TextEquiv elements. Fixes gh-37. --- qurator/eynollah/utils/xml.py | 1 - qurator/eynollah/writer.py | 10 ++-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index ac02190..0386b25 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -21,7 +21,6 @@ from ocrd_models.ocrd_page import ( RegionRefType, SeparatorRegionType, TableRegionType, - TextEquivType, TextLineType, TextRegionType, UnorderedGroupIndexedType, diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 2bacb17..d36d3ab 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -10,7 +10,6 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( BorderType, CoordsType, - TextEquivType, PcGtsType, TextLineType, TextRegionType, @@ -59,7 +58,6 @@ class EynollahXmlWriter(): coords = CoordsType() textline = TextLineType(id=counter.next_line_id, Coords=coords) marginal_region.add_TextLine(textline) - textline.add_TextEquiv(TextEquivType(Unicode='')) points_co = '' for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])): if not self.curved_line: @@ -98,7 +96,7 @@ class EynollahXmlWriter(): self.logger.debug('enter serialize_lines_in_region') for j in range(len(all_found_texline_polygons[region_idx])): coords = CoordsType() - textline = 
TextLineType(id=counter.next_line_id, Coords=coords, TextEquiv=[TextEquivType(index=0, Unicode='')]) + textline = TextLineType(id=counter.next_line_id, Coords=coords) text_region.add_TextLine(textline) region_bboxes = all_box_coord[region_idx] points_co = '' @@ -158,7 +156,7 @@ class EynollahXmlWriter(): for mm in range(len(found_polygons_text_region)): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)), - TextEquiv=[TextEquivType(index=0, Unicode='')]) + ) page.add_TextRegion(textregion) self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) @@ -217,7 +215,6 @@ class EynollahXmlWriter(): for mm in range(len(found_polygons_text_region)): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', - TextEquiv=[TextEquivType(index=0, Unicode='')], Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord))) page.add_TextRegion(textregion) self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) @@ -225,21 +222,18 @@ class EynollahXmlWriter(): self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) for mm in range(len(found_polygons_text_region_h)): textregion = TextRegionType(id=counter.next_region_id, type_='header', - TextEquiv=[TextEquivType(index=0, Unicode='')], Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) page.add_TextRegion(textregion) self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter) for mm in range(len(found_polygons_marginals)): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - TextEquiv=[TextEquivType(index=0, Unicode='')], 
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) page.add_TextRegion(marginal) self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) for mm in range(len(found_polygons_drop_capitals)): page.add_TextRegion(TextRegionType(id=counter.next_region_id, type_='drop-capital', - TextEquiv=[TextEquivType(index=0, Unicode='')], Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)))) for mm in range(len(found_polygons_text_region_img)):