mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-06-08 19:59:56 +02:00
replace lxml with OCR-D/core PAGE API
This commit is contained in:
parent
7a859ffae4
commit
416a84e542
4 changed files with 125 additions and 132 deletions
|
@ -1639,7 +1639,7 @@ class Eynollah:
|
||||||
|
|
||||||
if not num_col:
|
if not num_col:
|
||||||
self.logger.info("No columns detected, outputting an empty PAGE-XML")
|
self.logger.info("No columns detected, outputting an empty PAGE-XML")
|
||||||
pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], cont_page)
|
pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], [], cont_page)
|
||||||
self.logger.info("Job done in %ss", str(time.time() - t1))
|
self.logger.info("Job done in %ss", str(time.time() - t1))
|
||||||
return pcgts
|
return pcgts
|
||||||
|
|
||||||
|
|
|
@ -1,60 +1,63 @@
|
||||||
# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member
|
# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member
|
||||||
# pylint: disable=invalid-name
|
# pylint: disable=invalid-name
|
||||||
from lxml import etree as ET
|
|
||||||
from .counter import EynollahIdCounter
|
from .counter import EynollahIdCounter
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
NAMESPACES = {}
|
from ocrd_models.ocrd_page import (
|
||||||
NAMESPACES['page'] = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
|
CoordsType,
|
||||||
NAMESPACES['xsi'] = "http://www.w3.org/2001/XMLSchema-instance"
|
GlyphType,
|
||||||
NAMESPACES[None] = NAMESPACES['page']
|
ImageRegionType,
|
||||||
|
MathsRegionType,
|
||||||
|
MetadataType,
|
||||||
|
MetadataItemType,
|
||||||
|
NoiseRegionType,
|
||||||
|
OrderedGroupIndexedType,
|
||||||
|
OrderedGroupType,
|
||||||
|
PcGtsType,
|
||||||
|
PageType,
|
||||||
|
ReadingOrderType,
|
||||||
|
RegionRefIndexedType,
|
||||||
|
RegionRefType,
|
||||||
|
SeparatorRegionType,
|
||||||
|
TableRegionType,
|
||||||
|
TextEquivType,
|
||||||
|
TextLineType,
|
||||||
|
TextRegionType,
|
||||||
|
UnorderedGroupIndexedType,
|
||||||
|
UnorderedGroupType,
|
||||||
|
WordType,
|
||||||
|
|
||||||
|
to_xml)
|
||||||
|
|
||||||
def create_page_xml(imageFilename, height, width):
    """Create the skeleton PAGE document for one image.

    The metadata names ``SBB_QURATOR`` as creator and stamps both ``Created``
    and ``LastChange`` with the same ``datetime.now()`` value. The page
    element records the image filename and dimensions (passed on as strings)
    with a left-to-right reading direction and top-to-bottom line order.

    Args:
        imageFilename: value stored in the page's ``imageFilename`` attribute.
        height: image height, stored as ``imageHeight``.
        width: image width, stored as ``imageWidth``.

    Returns:
        The new ``PcGtsType`` root object; the page is reachable through it.
    """
    # One timestamp for both fields so a new document has Created == LastChange.
    timestamp = datetime.now()
    metadata = MetadataType(
        Creator='SBB_QURATOR',
        Created=timestamp,
        LastChange=timestamp,
    )
    page = PageType(
        imageWidth=str(width),
        imageHeight=str(height),
        imageFilename=imageFilename,
        readingDirection='left-to-right',
        textLineOrder='top-to-bottom',
    )
    return PcGtsType(Metadata=metadata, Page=page)
|
|
||||||
|
|
||||||
def add_textequiv(parent, text=''):
|
|
||||||
textequiv = ET.SubElement(parent, 'TextEquiv')
|
|
||||||
unireg = ET.SubElement(textequiv, 'Unicode')
|
|
||||||
unireg.text = text
|
|
||||||
|
|
||||||
def xml_reading_order(page, order_of_texts, id_of_marginalia):
    """Attach a reading order made of one ordered group to ``page``.

    Each entry of ``order_of_texts`` becomes an indexed region reference whose
    target id is derived from the entry plus one via
    ``EynollahIdCounter.region_id``. The ids in ``id_of_marginalia`` are then
    appended as-is, continuing the same running index.

    Args:
        page: page object receiving the reading order (``set_ReadingOrder``).
        order_of_texts: sequence of numeric text-region order values.
        id_of_marginalia: sequence of region ids of the marginalia.
    """
    reading_order = ReadingOrderType()
    ordered_group = OrderedGroupType(id="ro357564684568544579089")
    page.set_ReadingOrder(reading_order)
    reading_order.set_OrderedGroup(ordered_group)

    # A fresh counter supplies the running index shared by both loops.
    region_counter = EynollahIdCounter()

    for text_order in order_of_texts:
        position = str(region_counter.get('region'))
        target_id = region_counter.region_id(text_order + 1)
        ordered_group.add_RegionRefIndexed(
            RegionRefIndexedType(index=position, regionRef=target_id))
        region_counter.inc('region')

    for marginal_id in id_of_marginalia:
        position = str(region_counter.get('region'))
        ordered_group.add_RegionRefIndexed(
            RegionRefIndexedType(index=position, regionRef=marginal_id))
        region_counter.inc('region')
|
||||||
|
|
||||||
def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point):
|
def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point):
|
||||||
|
|
|
@ -1,23 +1,36 @@
|
||||||
# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member
|
# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member
|
||||||
|
# pylint: disable=import-error
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import os.path
|
import os.path
|
||||||
|
|
||||||
from .utils.xml import create_page_xml, add_textequiv, xml_reading_order
|
from .utils.xml import create_page_xml, xml_reading_order
|
||||||
from .utils.counter import EynollahIdCounter
|
from .utils.counter import EynollahIdCounter
|
||||||
|
|
||||||
from ocrd_utils import getLogger
|
from ocrd_utils import getLogger
|
||||||
from lxml import etree as ET
|
from ocrd_models.ocrd_page import (
|
||||||
|
BorderType,
|
||||||
|
CoordsType,
|
||||||
|
TextEquivType,
|
||||||
|
PcGtsType,
|
||||||
|
TextLineType,
|
||||||
|
TextRegionType,
|
||||||
|
ImageRegionType,
|
||||||
|
TableRegionType,
|
||||||
|
|
||||||
|
to_xml
|
||||||
|
)
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
class EynollahXmlWriter():
|
class EynollahXmlWriter():
|
||||||
|
|
||||||
def __init__(self, *, dir_out, image_filename, curved_line):
|
def __init__(self, *, dir_out, image_filename, curved_line, pcgts=None):
|
||||||
self.logger = getLogger('eynollah.writer')
|
self.logger = getLogger('eynollah.writer')
|
||||||
self.counter = EynollahIdCounter()
|
self.counter = EynollahIdCounter()
|
||||||
self.dir_out = dir_out
|
self.dir_out = dir_out
|
||||||
self.image_filename = image_filename
|
self.image_filename = image_filename
|
||||||
self.image_filename_stem = Path(Path(image_filename).name).stem
|
self.image_filename_stem = Path(Path(image_filename).name).stem
|
||||||
self.curved_line = curved_line
|
self.curved_line = curved_line
|
||||||
|
self.pcgts = pcgts if pcgts else PcGtsType()
|
||||||
self.scale_x = None # XXX set outside __init__
|
self.scale_x = None # XXX set outside __init__
|
||||||
self.scale_y = None # XXX set outside __init__
|
self.scale_y = None # XXX set outside __init__
|
||||||
self.height_org = None # XXX set outside __init__
|
self.height_org = None # XXX set outside __init__
|
||||||
|
@ -38,12 +51,12 @@ class EynollahXmlWriter():
|
||||||
points_page_print = points_page_print + ' '
|
points_page_print = points_page_print + ' '
|
||||||
return points_page_print[:-1]
|
return points_page_print[:-1]
|
||||||
|
|
||||||
def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter):
|
def serialize_lines_in_marginal(self, marginal_region, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter):
|
||||||
for j in range(len(all_found_texline_polygons_marginals[marginal_idx])):
|
for j in range(len(all_found_texline_polygons_marginals[marginal_idx])):
|
||||||
textline = ET.SubElement(marginal, 'TextLine')
|
coords = CoordsType()
|
||||||
textline.set('id', counter.next_line_id)
|
textline = TextLineType(id=counter.next_line_id, coords=coords)
|
||||||
coord = ET.SubElement(textline, 'Coords')
|
marginal_region.add_TextLine(textline)
|
||||||
add_textequiv(textline)
|
textline.add_TextEquiv(TextEquivType(Unicode=''))
|
||||||
points_co = ''
|
points_co = ''
|
||||||
for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])):
|
for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])):
|
||||||
if not self.curved_line:
|
if not self.curved_line:
|
||||||
|
@ -74,20 +87,16 @@ class EynollahXmlWriter():
|
||||||
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
|
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
|
||||||
points_co += ','
|
points_co += ','
|
||||||
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))
|
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))
|
||||||
|
points += ' '
|
||||||
|
coords.set_points(points_co[:-1])
|
||||||
|
|
||||||
if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1:
|
def serialize_lines_in_region(self, text_region, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, counter):
|
||||||
points_co += ' '
|
|
||||||
coord.set('points',points_co)
|
|
||||||
|
|
||||||
def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, counter):
|
|
||||||
self.logger.debug('enter serialize_lines_in_region')
|
self.logger.debug('enter serialize_lines_in_region')
|
||||||
for j in range(len(all_found_texline_polygons[region_idx])):
|
for j in range(len(all_found_texline_polygons[region_idx])):
|
||||||
textline = ET.SubElement(textregion, 'TextLine')
|
coords = CoordsType()
|
||||||
textline.set('id', counter.next_line_id)
|
textline = TextLineType(id=counter.next_line_id, coords=coords, TextEquiv=[TextEquivType(index=0, Unicode='')])
|
||||||
coord = ET.SubElement(textline, 'Coords')
|
text_region.add_TextLine(textline)
|
||||||
add_textequiv(textline)
|
|
||||||
region_bboxes = all_box_coord[region_idx]
|
region_bboxes = all_box_coord[region_idx]
|
||||||
|
|
||||||
points_co = ''
|
points_co = ''
|
||||||
for idx_contour_textline, contour_textline in enumerate(all_found_texline_polygons[region_idx][j]):
|
for idx_contour_textline, contour_textline in enumerate(all_found_texline_polygons[region_idx][j]):
|
||||||
if not self.curved_line:
|
if not self.curved_line:
|
||||||
|
@ -119,23 +128,22 @@ class EynollahXmlWriter():
|
||||||
points_co += str(int((contour_textline[0][0] + region_bboxes[2]+page_coord[2])/self.scale_x))
|
points_co += str(int((contour_textline[0][0] + region_bboxes[2]+page_coord[2])/self.scale_x))
|
||||||
points_co += ','
|
points_co += ','
|
||||||
points_co += str(int((contour_textline[0][1] + region_bboxes[0]+page_coord[0])/self.scale_y))
|
points_co += str(int((contour_textline[0][1] + region_bboxes[0]+page_coord[0])/self.scale_y))
|
||||||
|
|
||||||
points_co += ' '
|
points_co += ' '
|
||||||
coord.set('points', points_co[:-1])
|
coords.set_points(points_co[:-1])
|
||||||
|
|
||||||
def write_pagexml(self, pcgts):
    """Serialize ``pcgts`` with ``to_xml`` and write it into ``self.dir_out``.

    The output path is ``<self.dir_out>/<self.image_filename_stem>.xml`` and
    is logged before the file is written.

    Args:
        pcgts: PAGE document object accepted by ``to_xml``.
    """
    out_fname = os.path.join(self.dir_out, self.image_filename_stem) + ".xml"
    self.logger.info("output filename: '%s'", out_fname)
    # Serialize before opening the file: opening in 'w' mode truncates it, so
    # a failure inside to_xml would otherwise leave an empty output file.
    xml_text = to_xml(pcgts)
    # Write explicitly as UTF-8 rather than the locale's default encoding, so
    # the bytes on disk do not depend on the platform the tool runs on.
    with open(out_fname, 'w', encoding='utf-8') as f:
        f.write(xml_text)
|
||||||
|
|
||||||
def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page):
|
def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page):
|
||||||
self.logger.debug('enter build_pagexml_no_full_layout')
|
self.logger.debug('enter build_pagexml_no_full_layout')
|
||||||
|
|
||||||
# create the file structure
|
# create the file structure
|
||||||
pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
|
pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org)
|
||||||
page_print_sub = ET.SubElement(page, "Border")
|
page = pcgts.get_Page()
|
||||||
coord_page = ET.SubElement(page_print_sub, "Coords")
|
page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page))))
|
||||||
coord_page.set('points', self.calculate_page_coords(cont_page))
|
|
||||||
|
|
||||||
counter = EynollahIdCounter()
|
counter = EynollahIdCounter()
|
||||||
if len(found_polygons_text_region) > 0:
|
if len(found_polygons_text_region) > 0:
|
||||||
|
@ -144,33 +152,28 @@ class EynollahXmlWriter():
|
||||||
xml_reading_order(page, order_of_texts, id_of_marginalia)
|
xml_reading_order(page, order_of_texts, id_of_marginalia)
|
||||||
|
|
||||||
for mm in range(len(found_polygons_text_region)):
|
for mm in range(len(found_polygons_text_region)):
|
||||||
textregion = ET.SubElement(page, 'TextRegion')
|
textregion = TextRegionType(id=counter.next_region_id, type='paragraph',
|
||||||
textregion.set('id', counter.next_region_id)
|
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)),
|
||||||
textregion.set('type', 'paragraph')
|
TextEquiv=[TextEquivType(index=0, Unicode='')])
|
||||||
coord_text = ET.SubElement(textregion, 'Coords')
|
page.add_TextRegion(textregion)
|
||||||
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord))
|
|
||||||
self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter)
|
self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter)
|
||||||
add_textequiv(textregion)
|
|
||||||
|
|
||||||
for mm in range(len(found_polygons_marginals)):
|
for mm in range(len(found_polygons_marginals)):
|
||||||
marginal = ET.SubElement(page, 'TextRegion')
|
marginal = TextRegionType(id=counter.next_region_id, type='marginalia',
|
||||||
marginal.set('id', counter.next_region_id)
|
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)))
|
||||||
marginal.set('type', 'marginalia')
|
page.add_TextRegion(marginal)
|
||||||
coord_text = ET.SubElement(marginal, 'Coords')
|
|
||||||
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))
|
|
||||||
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter)
|
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter)
|
||||||
|
|
||||||
for mm in range(len(found_polygons_text_region_img)):
|
for mm in range(len(found_polygons_text_region_img)):
|
||||||
textregion = ET.SubElement(page, 'ImageRegion')
|
img_region = ImageRegionType(id=counter.next_region_id, Coords=CoordsType())
|
||||||
textregion.set('id', counter.next_region_id)
|
page.add_ImageRegion(img_region)
|
||||||
coord_text = ET.SubElement(textregion, 'Coords')
|
|
||||||
points_co = ''
|
points_co = ''
|
||||||
for lmm in range(len(found_polygons_text_region_img[mm])):
|
for lmm in range(len(found_polygons_text_region_img[mm])):
|
||||||
points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x))
|
points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x))
|
||||||
points_co += ','
|
points_co += ','
|
||||||
points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y))
|
points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y))
|
||||||
points_co += ' '
|
points_co += ' '
|
||||||
coord_text.set('points', points_co[:-1])
|
img_region.get_Coords().set_points(points_co[:-1])
|
||||||
|
|
||||||
return pcgts
|
return pcgts
|
||||||
|
|
||||||
|
@ -178,10 +181,9 @@ class EynollahXmlWriter():
|
||||||
self.logger.debug('enter build_pagexml_full_layout')
|
self.logger.debug('enter build_pagexml_full_layout')
|
||||||
|
|
||||||
# create the file structure
|
# create the file structure
|
||||||
pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org)
|
pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org)
|
||||||
page_print_sub = ET.SubElement(page, "Border")
|
page = pcgts.get_Page()
|
||||||
coord_page = ET.SubElement(page_print_sub, "Coords")
|
page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page))))
|
||||||
coord_page.set('points', self.calculate_page_coords(cont_page))
|
|
||||||
|
|
||||||
counter = EynollahIdCounter()
|
counter = EynollahIdCounter()
|
||||||
_counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts))
|
_counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts))
|
||||||
|
@ -189,52 +191,37 @@ class EynollahXmlWriter():
|
||||||
xml_reading_order(page, order_of_texts, id_of_marginalia)
|
xml_reading_order(page, order_of_texts, id_of_marginalia)
|
||||||
|
|
||||||
for mm in range(len(found_polygons_text_region)):
|
for mm in range(len(found_polygons_text_region)):
|
||||||
textregion=ET.SubElement(page, 'TextRegion')
|
textregion = TextRegionType(id=counter.next_region_id, type='paragraph',
|
||||||
textregion.set('id', counter.next_region_id)
|
TextEquiv=[TextEquivType(index=0, Unicode='')],
|
||||||
textregion.set('type', 'paragraph')
|
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)))
|
||||||
coord_text = ET.SubElement(textregion, 'Coords')
|
page.add_TextRegion(textregion)
|
||||||
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord))
|
|
||||||
self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter)
|
self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter)
|
||||||
add_textequiv(textregion)
|
|
||||||
|
|
||||||
self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h))
|
self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h))
|
||||||
for mm in range(len(found_polygons_text_region_h)):
|
for mm in range(len(found_polygons_text_region_h)):
|
||||||
textregion=ET.SubElement(page, 'TextRegion')
|
textregion = TextRegionType(id=counter.next_region_id, type='header',
|
||||||
textregion.set('id', counter.next_region_id)
|
TextEquiv=[TextEquivType(index=0, Unicode='')],
|
||||||
textregion.set('type','header')
|
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord)))
|
||||||
coord_text = ET.SubElement(textregion, 'Coords')
|
page.add_TextRegion(textregion)
|
||||||
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))
|
|
||||||
self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, counter)
|
self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, counter)
|
||||||
add_textequiv(textregion)
|
|
||||||
|
|
||||||
for mm in range(len(found_polygons_marginals)):
|
for mm in range(len(found_polygons_marginals)):
|
||||||
marginal = ET.SubElement(page, 'TextRegion')
|
marginal = TextRegionType(id=counter.next_region_id, type='marginalia',
|
||||||
add_textequiv(textregion)
|
TextEquiv=[TextEquivType(index=0, Unicode='')],
|
||||||
marginal.set('id', counter.next_region_id)
|
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)))
|
||||||
marginal.set('type', 'marginalia')
|
page.add_TextRegion(marginal)
|
||||||
coord_text = ET.SubElement(marginal, 'Coords')
|
|
||||||
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))
|
|
||||||
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter)
|
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter)
|
||||||
|
|
||||||
for mm in range(len(found_polygons_drop_capitals)):
|
for mm in range(len(found_polygons_drop_capitals)):
|
||||||
textregion=ET.SubElement(page, 'TextRegion')
|
page.add_TextRegion(TextRegionType(id=counter.next_region_id, type='drop-capital',
|
||||||
textregion.set('id', counter.next_region_id)
|
TextEquiv=[TextEquivType(index=0, Unicode='')],
|
||||||
textregion.set('type', 'drop-capital')
|
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))))
|
||||||
coord_text = ET.SubElement(textregion, 'Coords')
|
|
||||||
coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))
|
|
||||||
add_textequiv(textregion)
|
|
||||||
|
|
||||||
for mm in range(len(found_polygons_text_region_img)):
|
for mm in range(len(found_polygons_text_region_img)):
|
||||||
textregion=ET.SubElement(page, 'ImageRegion')
|
page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord))))
|
||||||
textregion.set('id', counter.next_region_id)
|
|
||||||
coord_text = ET.SubElement(textregion, 'Coords')
|
|
||||||
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord))
|
|
||||||
|
|
||||||
for mm in range(len(found_polygons_tables)):
|
for mm in range(len(found_polygons_tables)):
|
||||||
textregion = ET.SubElement(page, 'TableRegion')
|
page.add_TableRegion(TableRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_tables[mm], page_coord))))
|
||||||
textregion.set('id', counter.next_region_id)
|
|
||||||
coord_text = ET.SubElement(textregion, 'Coords')
|
|
||||||
coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables[mm], page_coord))
|
|
||||||
|
|
||||||
return pcgts
|
return pcgts
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,14 @@
|
||||||
from lxml import etree as ET
|
from pytest import main
|
||||||
from qurator.eynollah.utils.xml import create_page_xml, NAMESPACES
|
from qurator.eynollah.utils.xml import create_page_xml
|
||||||
|
from ocrd_models.ocrd_page import to_xml
|
||||||
|
|
||||||
def tostring(el):
|
PAGE_2019 = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'
|
||||||
return ET.tostring(el).decode('utf-8')
|
|
||||||
|
|
||||||
def test_create_xml():
    """A freshly created PAGE document serializes with the PAGE 2019 namespace and a Metadata element."""
    serialized = to_xml(create_page_xml('/path/to/img.tif', 100, 100))
    expected_ns_decl = 'xmlns:pc="%s"' % PAGE_2019
    assert expected_ns_decl in serialized
    assert 'Metadata' in serialized
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main([__file__])
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue