From 416a84e542329d685f90f0cd2046173311a17b88 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 12 Apr 2021 13:25:29 +0200 Subject: [PATCH] replace lxml with OCR-D/core PAGE API --- qurator/eynollah/eynollah.py | 2 +- qurator/eynollah/utils/xml.py | 93 ++++++++++----------- qurator/eynollah/writer.py | 147 ++++++++++++++++------------------ tests/test_xml.py | 17 ++-- 4 files changed, 126 insertions(+), 133 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 57384b6..7510654 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -1639,7 +1639,7 @@ class Eynollah: if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") - pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], cont_page) + pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], [], cont_page) self.logger.info("Job done in %ss", str(time.time() - t1)) return pcgts diff --git a/qurator/eynollah/utils/xml.py b/qurator/eynollah/utils/xml.py index e972218..ac02190 100644 --- a/qurator/eynollah/utils/xml.py +++ b/qurator/eynollah/utils/xml.py @@ -1,60 +1,63 @@ # pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member # pylint: disable=invalid-name -from lxml import etree as ET from .counter import EynollahIdCounter import numpy as np - -NAMESPACES = {} -NAMESPACES['page'] = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" -NAMESPACES['xsi'] = "http://www.w3.org/2001/XMLSchema-instance" -NAMESPACES[None] = NAMESPACES['page'] +from datetime import datetime + +from ocrd_models.ocrd_page import ( + CoordsType, + GlyphType, + ImageRegionType, + MathsRegionType, + MetadataType, + MetadataItemType, + NoiseRegionType, + OrderedGroupIndexedType, + OrderedGroupType, + PcGtsType, + PageType, + ReadingOrderType, + 
RegionRefIndexedType, + RegionRefType, + SeparatorRegionType, + TableRegionType, + TextEquivType, + TextLineType, + TextRegionType, + UnorderedGroupIndexedType, + UnorderedGroupType, + WordType, + + to_xml) def create_page_xml(imageFilename, height, width): - pcgts = ET.Element("PcGts", nsmap=NAMESPACES) - - pcgts.set("{%s}schemaLocation" % NAMESPACES['xsi'], NAMESPACES['page']) - - metadata = ET.SubElement(pcgts, "Metadata") - - author = ET.SubElement(metadata, "Creator") - author.text = "SBB_QURATOR" - - created = ET.SubElement(metadata, "Created") - created.text = "2019-06-17T18:15:12" - - changetime = ET.SubElement(metadata, "LastChange") - changetime.text = "2019-06-17T18:15:12" - - page = ET.SubElement(pcgts, "Page") - - page.set("imageFilename", imageFilename) - page.set("imageHeight", str(height)) - page.set("imageWidth", str(width)) - page.set("type", "content") - page.set("readingDirection", "left-to-right") - page.set("textLineOrder", "top-to-bottom") - - return pcgts, page - -def add_textequiv(parent, text=''): - textequiv = ET.SubElement(parent, 'TextEquiv') - unireg = ET.SubElement(textequiv, 'Unicode') - unireg.text = text + now = datetime.now() + pcgts = PcGtsType( + Metadata=MetadataType( + Creator='SBB_QURATOR', + Created=now, + LastChange=now + ), + Page=PageType( + imageWidth=str(width), + imageHeight=str(height), + imageFilename=imageFilename, + readingDirection='left-to-right', + textLineOrder='top-to-bottom' + )) + return pcgts def xml_reading_order(page, order_of_texts, id_of_marginalia): - region_order = ET.SubElement(page, 'ReadingOrder') - region_order_sub = ET.SubElement(region_order, 'OrderedGroup') - region_order_sub.set('id', "ro357564684568544579089") + region_order = ReadingOrderType() + og = OrderedGroupType(id="ro357564684568544579089") + page.set_ReadingOrder(region_order) + region_order.set_OrderedGroup(og) region_counter = EynollahIdCounter() for idx_textregion, _ in enumerate(order_of_texts): - name = 
ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index', str(region_counter.get('region'))) - name.set('regionRef', region_counter.region_id(order_of_texts[idx_textregion] + 1)) + og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=region_counter.region_id(order_of_texts[idx_textregion] + 1))) region_counter.inc('region') for id_marginal in id_of_marginalia: - name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index', str(region_counter.get('region'))) - name.set('regionRef', id_marginal) + og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) region_counter.inc('region') def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point): diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index a54103c..822d255 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -1,23 +1,36 @@ # pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member +# pylint: disable=import-error from pathlib import Path import os.path -from .utils.xml import create_page_xml, add_textequiv, xml_reading_order +from .utils.xml import create_page_xml, xml_reading_order from .utils.counter import EynollahIdCounter from ocrd_utils import getLogger -from lxml import etree as ET +from ocrd_models.ocrd_page import ( + BorderType, + CoordsType, + TextEquivType, + PcGtsType, + TextLineType, + TextRegionType, + ImageRegionType, + TableRegionType, + + to_xml + ) import numpy as np class EynollahXmlWriter(): - def __init__(self, *, dir_out, image_filename, curved_line): + def __init__(self, *, dir_out, image_filename, curved_line, pcgts=None): self.logger = getLogger('eynollah.writer') self.counter = EynollahIdCounter() self.dir_out = dir_out 
self.image_filename = image_filename self.image_filename_stem = Path(Path(image_filename).name).stem self.curved_line = curved_line + self.pcgts = pcgts if pcgts else PcGtsType() self.scale_x = None # XXX set outside __init__ self.scale_y = None # XXX set outside __init__ self.height_org = None # XXX set outside __init__ @@ -38,12 +51,12 @@ class EynollahXmlWriter(): points_page_print = points_page_print + ' ' return points_page_print[:-1] - def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter): + def serialize_lines_in_marginal(self, marginal_region, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter): for j in range(len(all_found_texline_polygons_marginals[marginal_idx])): - textline = ET.SubElement(marginal, 'TextLine') - textline.set('id', counter.next_line_id) - coord = ET.SubElement(textline, 'Coords') - add_textequiv(textline) + coords = CoordsType() + textline = TextLineType(id=counter.next_line_id, Coords=coords) + marginal_region.add_TextLine(textline) + textline.add_TextEquiv(TextEquivType(Unicode='')) points_co = '' for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])): if not self.curved_line: @@ -64,7 +77,7 @@ class EynollahXmlWriter(): points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y)) - + elif self.curved_line and np.abs(slopes_marginals[marginal_idx]) > 45: if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) @@ -74,20 +87,16 @@ class EynollahXmlWriter(): points_co +=
str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) + points_co += ' ' + coords.set_points(points_co[:-1]) - if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1: - points_co += ' ' - coord.set('points',points_co) - - def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, counter): + def serialize_lines_in_region(self, text_region, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, counter): self.logger.debug('enter serialize_lines_in_region') for j in range(len(all_found_texline_polygons[region_idx])): - textline = ET.SubElement(textregion, 'TextLine') - textline.set('id', counter.next_line_id) - coord = ET.SubElement(textline, 'Coords') - add_textequiv(textline) + coords = CoordsType() + textline = TextLineType(id=counter.next_line_id, Coords=coords, TextEquiv=[TextEquivType(index=0, Unicode='')]) + text_region.add_TextLine(textline) region_bboxes = all_box_coord[region_idx] - points_co = '' for idx_contour_textline, contour_textline in enumerate(all_found_texline_polygons[region_idx][j]): if not self.curved_line: @@ -119,23 +128,22 @@ class EynollahXmlWriter(): points_co += str(int((contour_textline[0][0] + region_bboxes[2]+page_coord[2])/self.scale_x)) points_co += ',' points_co += str(int((contour_textline[0][1] + region_bboxes[0]+page_coord[0])/self.scale_y)) - points_co += ' ' - coord.set('points', points_co[:-1]) + coords.set_points(points_co[:-1]) def write_pagexml(self, pcgts): - self.logger.info("filename stem: '%s'", self.image_filename_stem) - tree = ET.ElementTree(pcgts) - tree.write(os.path.join(self.dir_out, self.image_filename_stem) + ".xml") + out_fname =
os.path.join(self.dir_out, self.image_filename_stem) + ".xml" + self.logger.info("output filename: '%s'", out_fname) + with open(out_fname, 'w') as f: + f.write(to_xml(pcgts)) def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure - pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) - page_print_sub = ET.SubElement(page, "Border") - coord_page = ET.SubElement(page_print_sub, "Coords") - coord_page.set('points', self.calculate_page_coords(cont_page)) + pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org) + page = pcgts.get_Page() + page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) counter = EynollahIdCounter() if len(found_polygons_text_region) > 0: @@ -144,33 +152,28 @@ class EynollahXmlWriter(): xml_reading_order(page, order_of_texts, id_of_marginalia) for mm in range(len(found_polygons_text_region)): - textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', counter.next_region_id) - textregion.set('type', 'paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) + textregion = TextRegionType(id=counter.next_region_id, type='paragraph', + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)), + TextEquiv=[TextEquivType(index=0, Unicode='')]) + page.add_TextRegion(textregion) self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) - add_textequiv(textregion) for mm in range(len(found_polygons_marginals)): - 
marginal = ET.SubElement(page, 'TextRegion') - marginal.set('id', counter.next_region_id) - marginal.set('type', 'marginalia') - coord_text = ET.SubElement(marginal, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) + marginal = TextRegionType(id=counter.next_region_id, type='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) + page.add_TextRegion(marginal) self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) for mm in range(len(found_polygons_text_region_img)): - textregion = ET.SubElement(page, 'ImageRegion') - textregion.set('id', counter.next_region_id) - coord_text = ET.SubElement(textregion, 'Coords') + img_region = ImageRegionType(id=counter.next_region_id, Coords=CoordsType()) + page.add_ImageRegion(img_region) points_co = '' for lmm in range(len(found_polygons_text_region_img[mm])): points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) points_co += ' ' - coord_text.set('points', points_co[:-1]) + img_region.get_Coords().set_points(points_co[:-1]) return pcgts @@ -178,10 +181,9 @@ class EynollahXmlWriter(): self.logger.debug('enter build_pagexml_full_layout') # create the file structure - pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) - page_print_sub = ET.SubElement(page, "Border") - coord_page = ET.SubElement(page_print_sub, "Coords") - coord_page.set('points', self.calculate_page_coords(cont_page)) + pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org) + page = pcgts.get_Page() + page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) counter = EynollahIdCounter() _counter_marginals 
= EynollahIdCounter(region_idx=len(order_of_texts)) @@ -189,52 +191,37 @@ class EynollahXmlWriter(): xml_reading_order(page, order_of_texts, id_of_marginalia) for mm in range(len(found_polygons_text_region)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', counter.next_region_id) - textregion.set('type', 'paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)) + textregion = TextRegionType(id=counter.next_region_id, type='paragraph', + TextEquiv=[TextEquivType(index=0, Unicode='')], + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord))) + page.add_TextRegion(textregion) self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) - add_textequiv(textregion) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) for mm in range(len(found_polygons_text_region_h)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', counter.next_region_id) - textregion.set('type','header') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord)) + textregion = TextRegionType(id=counter.next_region_id, type='header', + TextEquiv=[TextEquivType(index=0, Unicode='')], + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) + page.add_TextRegion(textregion) self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, counter) - add_textequiv(textregion) for mm in range(len(found_polygons_marginals)): - marginal = ET.SubElement(page, 'TextRegion') - add_textequiv(textregion) - marginal.set('id', counter.next_region_id) - marginal.set('type', 'marginalia') - coord_text = ET.SubElement(marginal, 'Coords') - 
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)) + marginal = TextRegionType(id=counter.next_region_id, type='marginalia', + TextEquiv=[TextEquivType(index=0, Unicode='')], + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) + page.add_TextRegion(marginal) self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) - + for mm in range(len(found_polygons_drop_capitals)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', counter.next_region_id) - textregion.set('type', 'drop-capital') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)) - add_textequiv(textregion) + page.add_TextRegion(TextRegionType(id=counter.next_region_id, type='drop-capital', + TextEquiv=[TextEquivType(index=0, Unicode='')], + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)))) for mm in range(len(found_polygons_text_region_img)): - textregion=ET.SubElement(page, 'ImageRegion') - textregion.set('id', counter.next_region_id) - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)) + page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)))) for mm in range(len(found_polygons_tables)): - textregion = ET.SubElement(page, 'TableRegion') - textregion.set('id', counter.next_region_id) - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables[mm], page_coord)) + page.add_TableRegion(TableRegionType(id=counter.next_region_id, 
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_tables[mm], page_coord)))) return pcgts diff --git a/tests/test_xml.py b/tests/test_xml.py index 052f91e..8422fd1 100644 --- a/tests/test_xml.py +++ b/tests/test_xml.py @@ -1,11 +1,14 @@ -from lxml import etree as ET -from qurator.eynollah.utils.xml import create_page_xml, NAMESPACES +from pytest import main +from qurator.eynollah.utils.xml import create_page_xml +from ocrd_models.ocrd_page import to_xml -def tostring(el): - return ET.tostring(el).decode('utf-8') +PAGE_2019 = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15' def test_create_xml(): - pcgts, page = create_page_xml('/path/to/img.tif', 100, 100) - xmlstr = tostring(pcgts) - assert 'xmlns="%s"' % NAMESPACES[None] in xmlstr + pcgts = create_page_xml('/path/to/img.tif', 100, 100) + xmlstr = to_xml(pcgts) + assert 'xmlns:pc="%s"' % PAGE_2019 in xmlstr assert 'Metadata' in xmlstr + +if __name__ == '__main__': + main([__file__])