mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-06-08 11:49:55 +02:00
Merge branch 'lxml' into main
This commit is contained in:
commit
0d47f28655
3 changed files with 52 additions and 52 deletions
|
@ -113,6 +113,8 @@ from .utils import (
|
||||||
return_boxes_of_images_by_order_of_reading_new,
|
return_boxes_of_images_by_order_of_reading_new,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from utils.xml import create_page_xml
|
||||||
|
|
||||||
|
|
||||||
SLOPE_THRESHOLD = 0.13
|
SLOPE_THRESHOLD = 0.13
|
||||||
|
|
||||||
|
@ -1444,37 +1446,15 @@ class eynollah:
|
||||||
poly.put(poly_sub)
|
poly.put(poly_sub)
|
||||||
box_sub.put(boxes_sub_new)
|
box_sub.put(boxes_sub_new)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals):
|
def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals):
|
||||||
|
|
||||||
found_polygons_text_region = contours
|
found_polygons_text_region = contours
|
||||||
found_polygons_text_region_h = contours_h
|
found_polygons_text_region_h = contours_h
|
||||||
|
|
||||||
# create the file structure
|
# create the file structure
|
||||||
data = ET.Element("PcGts")
|
pcgts, page = create_page_xml(self.image_dir, self.height_org, self.width_org)
|
||||||
|
|
||||||
data.set("xmlns", "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15")
|
|
||||||
data.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
|
|
||||||
data.set("xsi:schemaLocation", "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15")
|
|
||||||
|
|
||||||
metadata = ET.SubElement(data, "Metadata")
|
|
||||||
|
|
||||||
author = ET.SubElement(metadata, "Creator")
|
|
||||||
author.text = "SBB_QURATOR"
|
|
||||||
|
|
||||||
created = ET.SubElement(metadata, "Created")
|
|
||||||
created.text = "2019-06-17T18:15:12"
|
|
||||||
|
|
||||||
changetime = ET.SubElement(metadata, "LastChange")
|
|
||||||
changetime.text = "2019-06-17T18:15:12"
|
|
||||||
|
|
||||||
page = ET.SubElement(data, "Page")
|
|
||||||
|
|
||||||
page.set("imageFilename", self.image_dir)
|
|
||||||
page.set("imageHeight", str(self.height_org))
|
|
||||||
page.set("imageWidth", str(self.width_org))
|
|
||||||
page.set("type", "content")
|
|
||||||
page.set("readingDirection", "left-to-right")
|
|
||||||
page.set("textLineOrder", "top-to-bottom")
|
|
||||||
|
|
||||||
page_print_sub = ET.SubElement(page, "PrintSpace")
|
page_print_sub = ET.SubElement(page, "PrintSpace")
|
||||||
coord_page = ET.SubElement(page_print_sub, "Coords")
|
coord_page = ET.SubElement(page_print_sub, "Coords")
|
||||||
|
@ -1948,7 +1928,7 @@ class eynollah:
|
||||||
print(dir_of_image)
|
print(dir_of_image)
|
||||||
print(self.f_name)
|
print(self.f_name)
|
||||||
print(os.path.join(dir_of_image, self.f_name) + ".xml")
|
print(os.path.join(dir_of_image, self.f_name) + ".xml")
|
||||||
tree = ET.ElementTree(data)
|
tree = ET.ElementTree(pcgts)
|
||||||
tree.write(os.path.join(dir_of_image, self.f_name) + ".xml")
|
tree.write(os.path.join(dir_of_image, self.f_name) + ".xml")
|
||||||
|
|
||||||
def write_into_page_xml(self, contours, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals):
|
def write_into_page_xml(self, contours, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals):
|
||||||
|
@ -1957,32 +1937,7 @@ class eynollah:
|
||||||
##found_polygons_text_region_h=contours_h
|
##found_polygons_text_region_h=contours_h
|
||||||
|
|
||||||
# create the file structure
|
# create the file structure
|
||||||
data = ET.Element("PcGts")
|
pcgts, page = create_page_xml(self.image_dir, self.height_org, self.width_org)
|
||||||
|
|
||||||
data.set("xmlns", "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15")
|
|
||||||
data.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
|
|
||||||
data.set("xsi:schemaLocation", "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15")
|
|
||||||
|
|
||||||
metadata = ET.SubElement(data, "Metadata")
|
|
||||||
|
|
||||||
author = ET.SubElement(metadata, "Creator")
|
|
||||||
author.text = "SBB_QURATOR"
|
|
||||||
|
|
||||||
created = ET.SubElement(metadata, "Created")
|
|
||||||
created.text = "2019-06-17T18:15:12"
|
|
||||||
|
|
||||||
changetime = ET.SubElement(metadata, "LastChange")
|
|
||||||
changetime.text = "2019-06-17T18:15:12"
|
|
||||||
|
|
||||||
page = ET.SubElement(data, "Page")
|
|
||||||
|
|
||||||
page.set("imageFilename", self.image_dir)
|
|
||||||
page.set("imageHeight", str(self.height_org))
|
|
||||||
page.set("imageWidth", str(self.width_org))
|
|
||||||
page.set("type", "content")
|
|
||||||
page.set("readingDirection", "left-to-right")
|
|
||||||
page.set("textLineOrder", "top-to-bottom")
|
|
||||||
|
|
||||||
page_print_sub = ET.SubElement(page, "PrintSpace")
|
page_print_sub = ET.SubElement(page, "PrintSpace")
|
||||||
coord_page = ET.SubElement(page_print_sub, "Coords")
|
coord_page = ET.SubElement(page_print_sub, "Coords")
|
||||||
points_page_print = ""
|
points_page_print = ""
|
||||||
|
|
34
sbb_newspapers_org_image/utils/xml.py
Normal file
34
sbb_newspapers_org_image/utils/xml.py
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
# Prefix -> namespace URI table handed to lxml as ``nsmap`` when the PAGE
# root element is created.
NAMESPACES = {
    'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
    'xsi': "http://www.w3.org/2001/XMLSchema-instance",
}
# The ``None`` key is the un-prefixed (default) namespace; it aliases the
# PAGE namespace so the root is serialized with a plain ``xmlns="..."``.
NAMESPACES[None] = NAMESPACES['page']
|
||||||
|
|
||||||
|
def create_page_xml(imageFilename, height, width):
    """Build the skeleton of a PAGE-XML document.

    Creates a ``PcGts`` root carrying the namespace declarations from
    ``NAMESPACES``, a ``Metadata`` child (``Creator``, ``Created``,
    ``LastChange``) and an otherwise empty ``Page`` child describing the
    image.

    Args:
        imageFilename: value stored in the ``imageFilename`` attribute of
            ``Page``.
        height: image height; stored as ``str(height)`` in ``imageHeight``.
        width: image width; stored as ``str(width)`` in ``imageWidth``.

    Returns:
        A ``(pcgts, page)`` tuple: the root element and its ``Page`` child,
        so that callers can append further content to ``page``.
    """
    # Local import keeps this block self-contained; only needed for the
    # metadata timestamps below.
    from datetime import datetime

    page_ns = NAMESPACES['page']
    pcgts = ET.Element("PcGts", nsmap=NAMESPACES)

    # xsi:schemaLocation is a whitespace-separated list of
    # "<namespace URI> <schema document URL>" pairs; a lone namespace URI
    # is not a valid value, so pair the namespace with its published XSD.
    pcgts.set(
        "{%s}schemaLocation" % NAMESPACES['xsi'],
        "%s %s/pagecontent.xsd" % (page_ns, page_ns),
    )

    metadata = ET.SubElement(pcgts, "Metadata")

    author = ET.SubElement(metadata, "Creator")
    author.text = "SBB_QURATOR"

    # Record when the document was actually generated instead of a fixed
    # date; same "YYYY-MM-DDTHH:MM:SS" layout as before.
    timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

    created = ET.SubElement(metadata, "Created")
    created.text = timestamp

    changetime = ET.SubElement(metadata, "LastChange")
    changetime.text = timestamp

    page = ET.SubElement(pcgts, "Page")
    page.set("imageFilename", imageFilename)
    page.set("imageHeight", str(height))
    page.set("imageWidth", str(width))
    page.set("type", "content")
    page.set("readingDirection", "left-to-right")
    page.set("textLineOrder", "top-to-bottom")

    return pcgts, page
|
||||||
|
|
11
tests/test_xml.py
Normal file
11
tests/test_xml.py
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
from lxml import etree as ET
|
||||||
|
from sbb_newspapers_org_image.utils.xml import create_page_xml, NAMESPACES
|
||||||
|
|
||||||
|
def tostring(el):
    """Serialize element ``el`` with lxml and return the markup as ``str``."""
    raw_bytes = ET.tostring(el)
    return raw_bytes.decode('utf-8')
|
||||||
|
|
||||||
|
def test_create_xml():
    """The generated root declares the default namespace and has Metadata."""
    root, _page = create_page_xml('/path/to/img.tif', 100, 100)
    serialized = tostring(root)
    default_ns_decl = 'xmlns="%s"' % NAMESPACES[None]
    assert default_ns_decl in serialized
    assert 'Metadata' in serialized
|
Loading…
Add table
Add a link
Reference in a new issue