You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
eynollah/qurator/eynollah/utils/xml.py

89 lines
3.1 KiB
Python

# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member
# pylint: disable=invalid-name
from .counter import EynollahIdCounter
import numpy as np
from datetime import datetime
from ocrd_models.ocrd_page import (
CoordsType,
GlyphType,
ImageRegionType,
MathsRegionType,
MetadataType,
MetadataItemType,
NoiseRegionType,
OrderedGroupIndexedType,
OrderedGroupType,
PcGtsType,
PageType,
ReadingOrderType,
RegionRefIndexedType,
RegionRefType,
SeparatorRegionType,
TableRegionType,
TextLineType,
TextRegionType,
UnorderedGroupIndexedType,
UnorderedGroupType,
WordType,
to_xml)
def create_page_xml(imageFilename, height, width):
    """Build a new PAGE document root for a single image.

    Args:
        imageFilename: Value stored as the page's ``imageFilename``.
        height: Image height; stored as ``str(height)`` in ``imageHeight``.
        width: Image width; stored as ``str(width)`` in ``imageWidth``.

    Returns:
        A ``PcGtsType`` whose ``Metadata`` names ``'SBB_QURATOR'`` as creator
        and whose ``Page`` is set to left-to-right reading direction and
        top-to-bottom text line order.
    """
    # One timestamp is taken so that Created and LastChange are identical.
    timestamp = datetime.now()
    metadata = MetadataType(
        Creator='SBB_QURATOR',
        Created=timestamp,
        LastChange=timestamp,
    )
    page = PageType(
        imageWidth=str(width),
        imageHeight=str(height),
        imageFilename=imageFilename,
        readingDirection='left-to-right',
        textLineOrder='top-to-bottom',
    )
    return PcGtsType(Metadata=metadata, Page=page)
def xml_reading_order(page, order_of_texts, id_of_marginalia):
    """Attach a reading order with one ordered group to ``page``.

    A new ``ReadingOrderType`` holding a single ``OrderedGroupType`` is set
    on ``page``. The group receives one ``RegionRefIndexedType`` per entry of
    ``order_of_texts``, followed by one per entry of ``id_of_marginalia``.
    The ``index`` of each reference is the current ``'region'`` count of a
    fresh ``EynollahIdCounter``, which is incremented after every reference.

    Args:
        page: Object exposing ``set_ReadingOrder``; it is modified in place.
        order_of_texts: Sequence of numeric region orders. Each value plus
            one is passed to ``EynollahIdCounter.region_id`` to obtain the
            ``regionRef``.
        id_of_marginalia: Iterable of region ids used verbatim as
            ``regionRef``.

    Returns:
        None.
    """
    reading_order = ReadingOrderType()
    ordered_group = OrderedGroupType(id="ro357564684568544579089")
    page.set_ReadingOrder(reading_order)
    reading_order.set_OrderedGroup(ordered_group)

    id_counter = EynollahIdCounter()

    # Text regions first: the regionRef is derived from the order value + 1.
    for text_order in order_of_texts:
        ref_index = str(id_counter.get('region'))
        region_ref = id_counter.region_id(text_order + 1)
        ordered_group.add_RegionRefIndexed(
            RegionRefIndexedType(index=ref_index, regionRef=region_ref))
        id_counter.inc('region')

    # Marginalia follow, continuing the same running index.
    for marginal_id in id_of_marginalia:
        ref_index = str(id_counter.get('region'))
        ordered_group.add_RegionRefIndexed(
            RegionRefIndexedType(index=ref_index, regionRef=marginal_id))
        id_counter.inc('region')
def order_and_id_of_texts(
        found_polygons_text_region,
        found_polygons_text_region_h,
        matrix_of_orders,
        indexes_sorted,
        index_of_types,
        kind_of_texts,
        ref_point):
    """Collect region ids and reading-order values for two groups of regions.

    ``indexes_sorted`` and ``index_of_types`` are converted to arrays and
    split by ``kind_of_texts``: entries with kind ``1`` are used for
    ``found_polygons_text_region`` and entries with kind ``2`` for
    ``found_polygons_text_region_h``.

    Args:
        found_polygons_text_region: Iterable of regions; only its length is
            used (one id is drawn per element).
        found_polygons_text_region_h: Iterable of regions handled after the
            first group; only its length is used.
        matrix_of_orders: Not used by this function.
        indexes_sorted: Array-like, filtered by ``kind_of_texts``.
        index_of_types: Array-like, filtered by ``kind_of_texts``.
        kind_of_texts: Array-like of kind codes (``1`` or ``2`` are read).
        ref_point: Passed as ``region_idx`` to ``EynollahIdCounter``.

    Returns:
        A tuple ``(order_of_texts, id_of_texts)`` of two lists.
    """
    sorted_arr = np.array(indexes_sorted)
    types_arr = np.array(index_of_types)
    kinds_arr = np.array(kind_of_texts)

    mask_kind_1 = kinds_arr == 1
    types_kind_1 = types_arr[mask_kind_1]
    sorted_kind_1 = sorted_arr[mask_kind_1]

    mask_kind_2 = kinds_arr == 2
    types_kind_2 = types_arr[mask_kind_2]
    sorted_kind_2 = sorted_arr[mask_kind_2]

    region_ids = []
    region_orders = []
    # presumably `next_region_id` advances the counter on each access —
    # confirm against EynollahIdCounter
    id_counter = EynollahIdCounter(region_idx=ref_point)

    for pos, _ in enumerate(found_polygons_text_region):
        region_ids.append(id_counter.next_region_id)
        # Keep the sorted entries equal to this region's type index; an
        # order is recorded only when at least one entry matches.
        matches = sorted_kind_1[sorted_kind_1 == types_kind_1[pos]]
        if len(matches) > 0:
            region_orders.append(matches[0])

    for pos, _ in enumerate(found_polygons_text_region_h):
        region_ids.append(id_counter.next_region_id)
        # NOTE(review): unlike the loop above, this uses the type index as a
        # position into the sorted array (no equality mask) and appends
        # unconditionally — confirm this asymmetry is intended.
        region_orders.append(sorted_kind_2[types_kind_2[pos]])

    return region_orders, region_ids