🧹 Don't produce spurious TextEquiv elements.

eynollah produces spurious - and empy - pcGts TextEquiv elements. This
is a. unnecessary, b. wrong and c. produces a lot of warning messages
in subsequent OCR processing steps because the OCR processor warns
about already existing text.

Fix this by not generating any TextEquiv elements.

Fixes gh-37.
pull/68/head
Gerber, Mike 3 years ago
parent f0ac0bb090
commit 11d9b00510

@ -21,7 +21,6 @@ from ocrd_models.ocrd_page import (
RegionRefType, RegionRefType,
SeparatorRegionType, SeparatorRegionType,
TableRegionType, TableRegionType,
TextEquivType,
TextLineType, TextLineType,
TextRegionType, TextRegionType,
UnorderedGroupIndexedType, UnorderedGroupIndexedType,

@ -10,7 +10,6 @@ from ocrd_utils import getLogger
from ocrd_models.ocrd_page import ( from ocrd_models.ocrd_page import (
BorderType, BorderType,
CoordsType, CoordsType,
TextEquivType,
PcGtsType, PcGtsType,
TextLineType, TextLineType,
TextRegionType, TextRegionType,
@ -59,7 +58,6 @@ class EynollahXmlWriter():
coords = CoordsType() coords = CoordsType()
textline = TextLineType(id=counter.next_line_id, Coords=coords) textline = TextLineType(id=counter.next_line_id, Coords=coords)
marginal_region.add_TextLine(textline) marginal_region.add_TextLine(textline)
textline.add_TextEquiv(TextEquivType(Unicode=''))
points_co = '' points_co = ''
for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])): for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])):
if not self.curved_line: if not self.curved_line:
@ -98,7 +96,7 @@ class EynollahXmlWriter():
self.logger.debug('enter serialize_lines_in_region') self.logger.debug('enter serialize_lines_in_region')
for j in range(len(all_found_texline_polygons[region_idx])): for j in range(len(all_found_texline_polygons[region_idx])):
coords = CoordsType() coords = CoordsType()
textline = TextLineType(id=counter.next_line_id, Coords=coords, TextEquiv=[TextEquivType(index=0, Unicode='')]) textline = TextLineType(id=counter.next_line_id, Coords=coords)
text_region.add_TextLine(textline) text_region.add_TextLine(textline)
region_bboxes = all_box_coord[region_idx] region_bboxes = all_box_coord[region_idx]
points_co = '' points_co = ''
@ -158,7 +156,7 @@ class EynollahXmlWriter():
for mm in range(len(found_polygons_text_region)): for mm in range(len(found_polygons_text_region)):
textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', textregion = TextRegionType(id=counter.next_region_id, type_='paragraph',
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)), Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)),
TextEquiv=[TextEquivType(index=0, Unicode='')]) )
page.add_TextRegion(textregion) page.add_TextRegion(textregion)
self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter)
@ -217,7 +215,6 @@ class EynollahXmlWriter():
for mm in range(len(found_polygons_text_region)): for mm in range(len(found_polygons_text_region)):
textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', textregion = TextRegionType(id=counter.next_region_id, type_='paragraph',
TextEquiv=[TextEquivType(index=0, Unicode='')],
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord))) Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)))
page.add_TextRegion(textregion) page.add_TextRegion(textregion)
self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter)
@ -225,21 +222,18 @@ class EynollahXmlWriter():
self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h))
for mm in range(len(found_polygons_text_region_h)): for mm in range(len(found_polygons_text_region_h)):
textregion = TextRegionType(id=counter.next_region_id, type_='header', textregion = TextRegionType(id=counter.next_region_id, type_='header',
TextEquiv=[TextEquivType(index=0, Unicode='')],
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord)))
page.add_TextRegion(textregion) page.add_TextRegion(textregion)
self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter) self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter)
for mm in range(len(found_polygons_marginals)): for mm in range(len(found_polygons_marginals)):
marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', marginal = TextRegionType(id=counter.next_region_id, type_='marginalia',
TextEquiv=[TextEquivType(index=0, Unicode='')],
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)))
page.add_TextRegion(marginal) page.add_TextRegion(marginal)
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter)
for mm in range(len(found_polygons_drop_capitals)): for mm in range(len(found_polygons_drop_capitals)):
page.add_TextRegion(TextRegionType(id=counter.next_region_id, type_='drop-capital', page.add_TextRegion(TextRegionType(id=counter.next_region_id, type_='drop-capital',
TextEquiv=[TextEquivType(index=0, Unicode='')],
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)))) Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))))
for mm in range(len(found_polygons_text_region_img)): for mm in range(len(found_polygons_text_region_img)):

Loading…
Cancel
Save