🚧 dinglehopper: Extract text while retaining segment id info

pull/38/head
Gerber, Mike 4 years ago
parent a09c1eae7e
commit 6d0db229fa

@ -4,6 +4,7 @@ from warnings import warn
from lxml import etree as ET from lxml import etree as ET
from lxml.etree import XMLSyntaxError from lxml.etree import XMLSyntaxError
from contextlib import suppress
import sys import sys
import attr import attr
import enum import enum
@ -51,10 +52,20 @@ class ExtractedTextSegment:
text = attr.ib(type=str) text = attr.ib(type=str)
@text.validator @text.validator
def check(self, attribute, value): def check(self, attribute, value):
if normalize(value, self.normalization) != value: if value is not None and normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value)) raise ValueError('String "{}" is not normalized.'.format(value))
normalization = attr.ib(converter=Normalization, default=Normalization.NFC) normalization = attr.ib(converter=Normalization, default=Normalization.NFC)
@classmethod
def from_text_segment(cls, text_segment, nsmap):
    """Create an ExtractedTextSegment from a PAGE text segment element.

    The segment id is read from the element's ``id`` attribute. The text
    is read from the ``page:TextEquiv/page:Unicode`` child, looked up
    with the namespace mapping ``nsmap``; it is None when that lookup
    raises AttributeError.
    """
    identifier = text_segment.attrib['id']
    try:
        content = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
    except AttributeError:
        # e.g. find() returned None because the segment has no Unicode child
        content = None
    return cls(identifier, content)
def alto_namespace(tree): def alto_namespace(tree):
"""Return the ALTO namespace used in the given ElementTree. """Return the ALTO namespace used in the given ElementTree.
@ -106,13 +117,7 @@ def page_extract(tree):
nsmap = {'page': page_namespace(tree)} nsmap = {'page': page_namespace(tree)}
def region_text(region): regions = []
try:
return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
except AttributeError:
return None
region_texts = []
reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap) reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
if reading_order is not None: if reading_order is not None:
for group in reading_order.iterfind('./*', namespaces=nsmap): for group in reading_order.iterfind('./*', namespaces=nsmap):
@ -122,20 +127,20 @@ def page_extract(tree):
region_id = region_ref_indexed.attrib['regionRef'] region_id = region_ref_indexed.attrib['regionRef']
region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
if region is not None: if region is not None:
region_texts.append(region_text(region)) regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
else: else:
warn('Not a TextRegion: "%s"' % region_id) warn('Not a TextRegion: "%s"' % region_id)
else: else:
raise NotImplementedError raise NotImplementedError
else: else:
for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
region_texts.append(region_text(region)) regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
# XXX Does a file have to have regions etc.? region vs lines etc. # XXX Does a file have to have regions etc.? region vs lines etc.
# Filter empty region texts # Filter empty region texts
region_texts = (t for t in region_texts if t) regions = (r for r in regions if r.text is not None)
return ExtractedText((ExtractedTextSegment(None, region_text) for region_text in region_texts), '\n')
# TODO This currently does not extract any segment id return ExtractedText(regions, '\n')
# FIXME needs to handle normalization # FIXME needs to handle normalization

Loading…
Cancel
Save