|
|
|
from lxml import etree as ET
|
|
|
|
|
|
|
|
# Prefix -> namespace URI map, passed as ``nsmap`` when the PcGts root
# element is created.
NAMESPACES = {
    'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
    'xsi': "http://www.w3.org/2001/XMLSchema-instance",
}
# In an lxml nsmap the ``None`` key denotes the default (unprefixed)
# namespace; it is the PAGE namespace here.
NAMESPACES[None] = NAMESPACES['page']
|
|
|
|
|
|
|
|
def create_page_xml(imageFilename, height, width, creator="SBB_QURATOR",
                    created="2019-06-17T18:15:12",
                    last_change="2019-06-17T18:15:12"):
    """
    Build the skeleton of a PAGE-XML document for one image.

    The tree consists of a ``PcGts`` root (declared with the module-level
    ``NAMESPACES`` map), a ``Metadata`` child holding ``Creator``, ``Created``
    and ``LastChange``, and an empty ``Page`` child describing the image.

    Args:
        imageFilename: value of the ``imageFilename`` attribute of ``Page``.
        height: image height; stored as ``str(height)`` in ``imageHeight``.
        width: image width; stored as ``str(width)`` in ``imageWidth``.
        creator: text of the ``Creator`` element.
        created: text of the ``Created`` element. The default is the fixed
            placeholder timestamp this function has always written; pass a
            real ISO-8601 timestamp to record the actual creation time.
        last_change: text of the ``LastChange`` element (same placeholder
            default as ``created``).

    Returns:
        A ``(pcgts, page)`` tuple: the root element and its ``Page`` child.
    """
    page_ns = NAMESPACES['page']
    pcgts = ET.Element("PcGts", nsmap=NAMESPACES)
    # xsi:schemaLocation must be a whitespace-separated list of
    # "<namespace> <schema document location>" pairs; a lone namespace URI
    # is not a valid value.
    pcgts.set(
        "{%s}schemaLocation" % NAMESPACES['xsi'],
        "%s %s/pagecontent.xsd" % (page_ns, page_ns),
    )

    metadata = ET.SubElement(pcgts, "Metadata")
    ET.SubElement(metadata, "Creator").text = creator
    ET.SubElement(metadata, "Created").text = created
    ET.SubElement(metadata, "LastChange").text = last_change

    page = ET.SubElement(pcgts, "Page")
    page.set("imageFilename", imageFilename)
    page.set("imageHeight", str(height))
    page.set("imageWidth", str(width))
    page.set("type", "content")
    page.set("readingDirection", "left-to-right")
    page.set("textLineOrder", "top-to-bottom")

    return pcgts, page
|
|
|
|
|
|
|
|
def add_textequiv(parent, text=''):
    """
    Append ``<TextEquiv><Unicode>text</Unicode></TextEquiv>`` to ``parent``.

    Nothing is returned; the new elements are reachable through ``parent``.
    """
    wrapper = ET.SubElement(parent, 'TextEquiv')
    ET.SubElement(wrapper, 'Unicode').text = text
|
|
|
|
|
|
|
|
def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals):
    """
    Append a ``ReadingOrder`` element with a single ``OrderedGroup`` to ``page``.

    The group first lists the text regions in the sequence given by
    ``order_of_texts`` (each entry is used as an index into ``id_of_texts``),
    then one entry per item of ``found_polygons_marginals``. Marginals get a
    generated id ``r<index>``, where ``<index>`` is their position in the
    group.

    XXX side-effect: extends id_of_marginalia with the generated marginal ids.

    Returns:
        None. ``page`` and ``id_of_marginalia`` are modified in place.
    """
    # Resolve every region id before touching the tree, so that a bad index
    # in order_of_texts cannot leave a half-populated ReadingOrder behind.
    region_refs = [id_of_texts[text_index] for text_index in order_of_texts]
    first_marginal = len(region_refs)
    marginal_refs = [
        'r%s' % index
        for index in range(first_marginal, first_marginal + len(found_polygons_marginals))
    ]
    id_of_marginalia.extend(marginal_refs)

    region_order = ET.SubElement(page, 'ReadingOrder')
    region_order_sub = ET.SubElement(region_order, 'OrderedGroup')
    region_order_sub.set('id', "ro357564684568544579089")

    for index, region_ref in enumerate(region_refs + marginal_refs):
        region_ref_indexed = ET.SubElement(region_order_sub, 'RegionRefIndexed')
        region_ref_indexed.set('index', str(index))
        region_ref_indexed.set('regionRef', region_ref)
|
|
|
|
|