mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 20:00:01 +02:00
🚧 dinglehopper: Hierarchical text representation
This commit is contained in:
parent
db6292611f
commit
96b55f1806
2 changed files with 90 additions and 67 deletions
|
@@ -15,14 +15,52 @@ import unicodedata
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class Normalization(enum.Enum):
|
||||||
|
NFC = 1
|
||||||
|
NFC_MUFI = 2 # TODO
|
||||||
|
NFC_SBB = 3
|
||||||
|
|
||||||
|
|
||||||
@attr.s(frozen=True)
|
@attr.s(frozen=True)
|
||||||
class ExtractedText:
|
class ExtractedText:
|
||||||
segments = attr.ib(converter=list)
|
segment_id = attr.ib(type=Optional[str])
|
||||||
joiner = attr.ib(type=str)
|
|
||||||
# TODO Types are not validated (attr does not do this yet)
|
@segment_id.validator
|
||||||
|
def check(self, _, value):
|
||||||
|
if value is None:
|
||||||
|
return
|
||||||
|
if not re.match(r'[\w\d_-]+', value):
|
||||||
|
raise ValueError('Malformed segment id "{}"'.format(value))
|
||||||
|
|
||||||
|
# An object contains either
|
||||||
|
# a. _text itself
|
||||||
|
# b. or segments (ExtractedText) and a joiner
|
||||||
|
# TODO validator
|
||||||
|
|
||||||
|
segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
|
||||||
|
joiner = attr.ib(type=Optional[str])
|
||||||
|
_text = attr.ib(type=Optional[str])
|
||||||
|
|
||||||
|
@segments.validator
|
||||||
|
def check(self, _, value):
|
||||||
|
if value is not None and self._text is not None:
|
||||||
|
raise ValueError("Can't have both segments and text")
|
||||||
|
|
||||||
|
@_text.validator
|
||||||
|
def check(self, _, value):
|
||||||
|
if value is not None and normalize(value, self.normalization) != value:
|
||||||
|
raise ValueError('String "{}" is not normalized.'.format(value))
|
||||||
|
|
||||||
|
normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text(self):
|
def text(self):
|
||||||
|
if self._text is not None:
|
||||||
|
if self._text == '':
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
return self._text
|
||||||
|
else:
|
||||||
return self.joiner.join(s.text for s in self.segments)
|
return self.joiner.join(s.text for s in self.segments)
|
||||||
|
|
||||||
_segment_id_for_pos = None
|
_segment_id_for_pos = None
|
||||||
|
@@ -34,17 +72,30 @@ class ExtractedText:
|
||||||
for s in self.segments:
|
for s in self.segments:
|
||||||
segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
|
segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
|
||||||
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
|
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
|
||||||
|
segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
|
||||||
# This is frozen, so we have to jump through the hoop:
|
# This is frozen, so we have to jump through the hoop:
|
||||||
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
|
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
|
||||||
assert self._segment_id_for_pos
|
assert self._segment_id_for_pos
|
||||||
|
|
||||||
return self._segment_id_for_pos[pos]
|
return self._segment_id_for_pos[pos]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_text_segment(cls, text_segment, nsmap):
|
||||||
|
"""Build an ExtractedText from a PAGE content text element"""
|
||||||
|
|
||||||
|
segment_id = text_segment.attrib['id']
|
||||||
|
segment_text = None
|
||||||
|
with suppress(AttributeError):
|
||||||
|
segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
|
||||||
|
segment_text = segment_text or ''
|
||||||
|
segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization
|
||||||
|
segment_text = segment_text or ''
|
||||||
|
return cls(segment_id, None, None, segment_text)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_text(cls, text):
|
||||||
|
return cls(None, None, None, text)
|
||||||
|
|
||||||
class Normalization(enum.Enum):
|
|
||||||
NFC = 1
|
|
||||||
NFC_MUFI = 2 # TODO
|
|
||||||
NFC_SBB = 3
|
|
||||||
|
|
||||||
|
|
||||||
def normalize(text, normalization):
|
def normalize(text, normalization):
|
||||||
|
@@ -63,37 +114,6 @@ def normalize_sbb(t):
|
||||||
return normalize(t, Normalization.NFC_SBB)
|
return normalize(t, Normalization.NFC_SBB)
|
||||||
|
|
||||||
|
|
||||||
@attr.s(frozen=True)
|
|
||||||
class ExtractedTextSegment:
|
|
||||||
segment_id = attr.ib(type=Optional[str])
|
|
||||||
|
|
||||||
@segment_id.validator
|
|
||||||
def check(self, _, value):
|
|
||||||
if value is None:
|
|
||||||
return
|
|
||||||
if not re.match(r'[\w\d_-]+', value):
|
|
||||||
raise ValueError('Malformed segment id "{}"'.format(value))
|
|
||||||
text = attr.ib(type=str)
|
|
||||||
|
|
||||||
@text.validator
|
|
||||||
def check(self, _, value):
|
|
||||||
if value is not None and normalize(value, self.normalization) != value:
|
|
||||||
raise ValueError('String "{}" is not normalized.'.format(value))
|
|
||||||
normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_text_segment(cls, text_segment, nsmap):
|
|
||||||
"""Build an ExtractedTextSegment from a PAGE content text element"""
|
|
||||||
|
|
||||||
segment_id = text_segment.attrib['id']
|
|
||||||
segment_text = None
|
|
||||||
with suppress(AttributeError):
|
|
||||||
segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
|
|
||||||
segment_text = segment_text or ''
|
|
||||||
segment_text = normalize_sbb(segment_text)
|
|
||||||
return cls(segment_id, segment_text)
|
|
||||||
|
|
||||||
|
|
||||||
def alto_namespace(tree):
|
def alto_namespace(tree):
|
||||||
"""Return the ALTO namespace used in the given ElementTree.
|
"""Return the ALTO namespace used in the given ElementTree.
|
||||||
|
|
||||||
|
@@ -117,12 +137,14 @@ def alto_extract(tree):
|
||||||
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
|
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
|
||||||
|
|
||||||
return ExtractedText(
|
return ExtractedText(
|
||||||
(ExtractedTextSegment(None, normalize_sbb(line_text)) for line_text in lines),
|
None,
|
||||||
'\n'
|
(ExtractedText.from_text(normalize_sbb(line_text)) for line_text in lines),
|
||||||
|
'\n',
|
||||||
|
None
|
||||||
)
|
)
|
||||||
|
# FIXME hardcoded SBB normalization
|
||||||
# TODO This currently does not extract any segment id, because we are
|
# TODO This currently does not extract any segment id, because we are
|
||||||
# clueless about the ALTO format.
|
# clueless about the ALTO format.
|
||||||
# FIXME needs to handle normalization
|
|
||||||
|
|
||||||
|
|
||||||
def alto_text(tree):
|
def alto_text(tree):
|
||||||
|
@@ -157,20 +179,19 @@ def page_extract(tree):
|
||||||
region_id = region_ref_indexed.attrib['regionRef']
|
region_id = region_ref_indexed.attrib['regionRef']
|
||||||
region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
|
region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
|
||||||
if region is not None:
|
if region is not None:
|
||||||
regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
|
regions.append(ExtractedText.from_text_segment(region, nsmap))
|
||||||
else:
|
else:
|
||||||
warn('Not a TextRegion: "%s"' % region_id)
|
warn('Not a TextRegion: "%s"' % region_id)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
else:
|
else:
|
||||||
for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
|
for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
|
||||||
regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
|
regions.append(ExtractedText.from_text_segment(region, nsmap))
|
||||||
|
|
||||||
# Filter empty region texts
|
# Filter empty region texts
|
||||||
regions = (r for r in regions if r.text is not None)
|
regions = (r for r in regions if r.text is not None)
|
||||||
|
|
||||||
return ExtractedText(regions, '\n')
|
return ExtractedText(None, regions, '\n', None)
|
||||||
# FIXME needs to handle normalization
|
|
||||||
|
|
||||||
|
|
||||||
def page_text(tree):
|
def page_text(tree):
|
||||||
|
@@ -180,8 +201,10 @@ def page_text(tree):
|
||||||
def plain_extract(filename):
|
def plain_extract(filename):
|
||||||
with open(filename, 'r') as f:
|
with open(filename, 'r') as f:
|
||||||
return ExtractedText(
|
return ExtractedText(
|
||||||
(ExtractedTextSegment('line %d' % no, line) for no, line in enumerate(f.readlines())),
|
None,
|
||||||
'\n'
|
[ExtractedText('line %d' % no, None, None, line) for no, line in enumerate(f.readlines())],
|
||||||
|
'\n',
|
||||||
|
None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@@ -1,17 +1,17 @@
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import pytest
|
import pytest
|
||||||
from qurator.dinglehopper import ExtractedText, ExtractedTextSegment
|
from qurator.dinglehopper import ExtractedText
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
from qurator.dinglehopper import seq_align
|
from qurator.dinglehopper import seq_align
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
|
|
||||||
|
|
||||||
def test_text():
|
def test_text():
|
||||||
test1 = ExtractedText([
|
test1 = ExtractedText(None, [
|
||||||
ExtractedTextSegment('s0', 'foo'),
|
ExtractedText('s0', None, None, 'foo'),
|
||||||
ExtractedTextSegment('s1', 'bar'),
|
ExtractedText('s1', None, None, 'bar'),
|
||||||
ExtractedTextSegment('s2', 'bazinga')
|
ExtractedText('s2', None, None, 'bazinga')
|
||||||
], ' ')
|
], ' ', None)
|
||||||
|
|
||||||
assert test1.text == 'foo bar bazinga'
|
assert test1.text == 'foo bar bazinga'
|
||||||
assert test1.segment_id_for_pos(0) == 's0'
|
assert test1.segment_id_for_pos(0) == 's0'
|
||||||
|
@@ -21,8 +21,8 @@ def test_text():
|
||||||
|
|
||||||
def test_normalization_check():
|
def test_normalization_check():
|
||||||
with pytest.raises(ValueError, match=r'.*is not normalized.*'):
|
with pytest.raises(ValueError, match=r'.*is not normalized.*'):
|
||||||
ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ'))
|
ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
|
||||||
assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ'))
|
assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))
|
||||||
|
|
||||||
|
|
||||||
AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
|
AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
|
||||||
|
@@ -36,17 +36,17 @@ def test_align():
|
||||||
not Python characters.
|
not Python characters.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
test1 = ExtractedText([
|
test1 = ExtractedText(None, [
|
||||||
ExtractedTextSegment('s0', 'foo'),
|
ExtractedText('s0', None, None, 'foo'),
|
||||||
ExtractedTextSegment('s1', 'bar'),
|
ExtractedText('s1', None, None, 'bar'),
|
||||||
ExtractedTextSegment('s2', 'batzinga')
|
ExtractedText('s2', None, None, 'batzinga')
|
||||||
], ' ')
|
], ' ', None)
|
||||||
test2 = ExtractedText([
|
test2 = ExtractedText(None, [
|
||||||
ExtractedTextSegment('x0', 'foo'),
|
ExtractedText('x0', None, None, 'foo'),
|
||||||
ExtractedTextSegment('x1', 'bar'),
|
ExtractedText('x1', None, None, 'bar'),
|
||||||
ExtractedTextSegment('x2', '.'), # extra .
|
ExtractedText('x2', None, None, '.'), # extra .
|
||||||
ExtractedTextSegment('x3', 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters
|
ExtractedText('x3', None, None, 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters
|
||||||
], ' ')
|
], ' ', None)
|
||||||
|
|
||||||
left_pos = 0; right_pos = 0; alignment = []
|
left_pos = 0; right_pos = 0; alignment = []
|
||||||
for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)):
|
for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue