|
|
@ -4,6 +4,7 @@ from warnings import warn
|
|
|
|
|
|
|
|
|
|
|
|
from lxml import etree as ET
|
|
|
|
from lxml import etree as ET
|
|
|
|
from lxml.etree import XMLSyntaxError
|
|
|
|
from lxml.etree import XMLSyntaxError
|
|
|
|
|
|
|
|
from contextlib import suppress
|
|
|
|
import sys
|
|
|
|
import sys
|
|
|
|
import attr
|
|
|
|
import attr
|
|
|
|
import enum
|
|
|
|
import enum
|
|
|
@ -51,10 +52,20 @@ class ExtractedTextSegment:
|
|
|
|
text = attr.ib(type=str)
|
|
|
|
text = attr.ib(type=str)
|
|
|
|
@text.validator
|
|
|
|
@text.validator
|
|
|
|
def check(self, attribute, value):
|
|
|
|
def check(self, attribute, value):
|
|
|
|
if normalize(value, self.normalization) != value:
|
|
|
|
if value is not None and normalize(value, self.normalization) != value:
|
|
|
|
raise ValueError('String "{}" is not normalized.'.format(value))
|
|
|
|
raise ValueError('String "{}" is not normalized.'.format(value))
|
|
|
|
normalization = attr.ib(converter=Normalization, default=Normalization.NFC)
|
|
|
|
normalization = attr.ib(converter=Normalization, default=Normalization.NFC)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
|
|
|
def from_text_segment(cls, text_segment, nsmap):
|
|
|
|
|
|
|
|
"""Build an ExtractedTextSegment from a PAGE content text element"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
segment_id = text_segment.attrib['id']
|
|
|
|
|
|
|
|
segment_text = None
|
|
|
|
|
|
|
|
with suppress(AttributeError):
|
|
|
|
|
|
|
|
segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
|
|
|
|
|
|
|
|
return cls(segment_id, segment_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def alto_namespace(tree):
|
|
|
|
def alto_namespace(tree):
|
|
|
|
"""Return the ALTO namespace used in the given ElementTree.
|
|
|
|
"""Return the ALTO namespace used in the given ElementTree.
|
|
|
@ -106,13 +117,7 @@ def page_extract(tree):
|
|
|
|
|
|
|
|
|
|
|
|
nsmap = {'page': page_namespace(tree)}
|
|
|
|
nsmap = {'page': page_namespace(tree)}
|
|
|
|
|
|
|
|
|
|
|
|
def region_text(region):
|
|
|
|
regions = []
|
|
|
|
try:
|
|
|
|
|
|
|
|
return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
|
|
|
|
|
|
|
|
except AttributeError:
|
|
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
region_texts = []
|
|
|
|
|
|
|
|
reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
|
|
|
|
reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
|
|
|
|
if reading_order is not None:
|
|
|
|
if reading_order is not None:
|
|
|
|
for group in reading_order.iterfind('./*', namespaces=nsmap):
|
|
|
|
for group in reading_order.iterfind('./*', namespaces=nsmap):
|
|
|
@ -122,20 +127,20 @@ def page_extract(tree):
|
|
|
|
region_id = region_ref_indexed.attrib['regionRef']
|
|
|
|
region_id = region_ref_indexed.attrib['regionRef']
|
|
|
|
region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
|
|
|
|
region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
|
|
|
|
if region is not None:
|
|
|
|
if region is not None:
|
|
|
|
region_texts.append(region_text(region))
|
|
|
|
regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
warn('Not a TextRegion: "%s"' % region_id)
|
|
|
|
warn('Not a TextRegion: "%s"' % region_id)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
raise NotImplementedError
|
|
|
|
raise NotImplementedError
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
|
|
|
|
for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
|
|
|
|
region_texts.append(region_text(region))
|
|
|
|
regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
|
|
|
|
|
|
|
|
|
|
|
|
# XXX Does a file have to have regions etc.? region vs lines etc.
|
|
|
|
# XXX Does a file have to have regions etc.? region vs lines etc.
|
|
|
|
# Filter empty region texts
|
|
|
|
# Filter empty region texts
|
|
|
|
region_texts = (t for t in region_texts if t)
|
|
|
|
regions = (r for r in regions if r.text is not None)
|
|
|
|
return ExtractedText((ExtractedTextSegment(None, region_text) for region_text in region_texts), '\n')
|
|
|
|
|
|
|
|
# TODO This currently does not extract any segment id
|
|
|
|
return ExtractedText(regions, '\n')
|
|
|
|
# FIXME needs to handle normalization
|
|
|
|
# FIXME needs to handle normalization
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|