You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
eynollah/sbb_newspapers_org_image/utils/xml.py

39 lines
1.2 KiB
Python

from lxml import etree as ET
# Prefix -> namespace URI table handed to lxml as ``nsmap`` when the PAGE
# root element is created.
NAMESPACES = {
    'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
    'xsi': "http://www.w3.org/2001/XMLSchema-instance",
}
# The ``None`` key is lxml's way of declaring the default (unprefixed)
# namespace; it is the same URI as the 'page' prefix.
NAMESPACES[None] = NAMESPACES['page']
def create_page_xml(imageFilename, height, width, created=None):
    """Build the skeleton of a PAGE-XML document for a single image.

    Creates a ``PcGts`` root element declared with the namespaces in
    ``NAMESPACES``, a ``Metadata`` child (``Creator``, ``Created``,
    ``LastChange``) and a ``Page`` child describing the image.

    Args:
        imageFilename: Value of the ``imageFilename`` attribute on ``Page``.
        height: Image height; written as ``str(height)`` to ``imageHeight``.
        width: Image width; written as ``str(width)`` to ``imageWidth``.
        created: Optional timestamp string used for both ``Created`` and
            ``LastChange``. When omitted, the current local time formatted
            as ``YYYY-MM-DDTHH:MM:SS`` is used.

    Returns:
        A ``(pcgts, page)`` tuple: the root element and its ``Page`` child.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    from datetime import datetime

    if created is None:
        # Previously a fixed "2019-06-17T18:15:12" was stamped on every
        # document; record the real creation time in the same format.
        created = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

    page_ns = NAMESPACES['page']
    pcgts = ET.Element("PcGts", nsmap=NAMESPACES)
    # xsi:schemaLocation is a whitespace-separated list of
    # "<namespace URI> <schema document URL>" pairs; the bare namespace
    # alone is not a usable hint for validators.
    pcgts.set(
        "{%s}schemaLocation" % NAMESPACES['xsi'],
        "%s %s/pagecontent.xsd" % (page_ns, page_ns),
    )

    metadata = ET.SubElement(pcgts, "Metadata")
    ET.SubElement(metadata, "Creator").text = "SBB_QURATOR"
    ET.SubElement(metadata, "Created").text = created
    ET.SubElement(metadata, "LastChange").text = created

    page = ET.SubElement(pcgts, "Page")
    page.set("imageFilename", imageFilename)
    page.set("imageHeight", str(height))
    page.set("imageWidth", str(width))
    page.set("type", "content")
    page.set("readingDirection", "left-to-right")
    page.set("textLineOrder", "top-to-bottom")

    return pcgts, page
def add_textequiv(parent, text=''):
    """Append a ``TextEquiv`` element holding a ``Unicode`` child to *parent*.

    Args:
        parent: Element that receives the new ``TextEquiv`` sub-element.
        text: Value assigned to the ``Unicode`` child's text (default ``''``).

    Returns:
        None; *parent* is modified in place.
    """
    equiv_node = ET.SubElement(parent, 'TextEquiv')
    ET.SubElement(equiv_node, 'Unicode').text = text