mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 20:00:01 +02:00
🚧 dinglehopper: Re-introduce "substitute_equivalences" as Normalization.NFC_SBB
This commit is contained in:
parent
2579e0220c
commit
a320d5fd8f
1 changed file with 19 additions and 3 deletions
|
@@ -5,6 +5,7 @@ from warnings import warn
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from lxml.etree import XMLSyntaxError
|
from lxml.etree import XMLSyntaxError
|
||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
|
from .substitute_equivalences import substitute_equivalences
|
||||||
import sys
|
import sys
|
||||||
import attr
|
import attr
|
||||||
import enum
|
import enum
|
||||||
|
@@ -36,16 +37,27 @@ class ExtractedText:
|
||||||
|
|
||||||
class Normalization(enum.Enum):
|
class Normalization(enum.Enum):
|
||||||
NFC = 1
|
NFC = 1
|
||||||
NFC_MUFI = 2
|
NFC_MUFI = 2 # TODO
|
||||||
|
NFC_SBB = 3
|
||||||
|
|
||||||
|
|
||||||
def normalize(text, normalization):
|
def normalize(text, normalization):
|
||||||
if normalization == Normalization.NFC:
|
if normalization == Normalization.NFC:
|
||||||
return unicodedata.normalize('NFC', text)
|
return unicodedata.normalize('NFC', text)
|
||||||
|
if normalization == Normalization.NFC_MUFI:
|
||||||
|
raise NotImplementedError()
|
||||||
|
if normalization == Normalization.NFC_SBB:
|
||||||
|
# XXX This needs to be redone
|
||||||
|
# https://github.com/qurator-spk/dinglehopper/issues/11
|
||||||
|
return substitute_equivalences(text)
|
||||||
else:
|
else:
|
||||||
raise ValueError()
|
raise ValueError()
|
||||||
|
|
||||||
|
|
||||||
|
# XXX hack
|
||||||
|
normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB)
|
||||||
|
|
||||||
|
|
||||||
@attr.s(frozen=True)
|
@attr.s(frozen=True)
|
||||||
class ExtractedTextSegment:
|
class ExtractedTextSegment:
|
||||||
id = attr.ib(type=str)
|
id = attr.ib(type=str)
|
||||||
|
@@ -54,7 +66,7 @@ class ExtractedTextSegment:
|
||||||
def check(self, attribute, value):
|
def check(self, attribute, value):
|
||||||
if value is not None and normalize(value, self.normalization) != value:
|
if value is not None and normalize(value, self.normalization) != value:
|
||||||
raise ValueError('String "{}" is not normalized.'.format(value))
|
raise ValueError('String "{}" is not normalized.'.format(value))
|
||||||
normalization = attr.ib(converter=Normalization, default=Normalization.NFC)
|
normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_text_segment(cls, text_segment, nsmap):
|
def from_text_segment(cls, text_segment, nsmap):
|
||||||
|
@@ -64,6 +76,7 @@ class ExtractedTextSegment:
|
||||||
segment_text = None
|
segment_text = None
|
||||||
with suppress(AttributeError):
|
with suppress(AttributeError):
|
||||||
segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
|
segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
|
||||||
|
segment_text = normalize_sbb(segment_text)
|
||||||
return cls(segment_id, segment_text)
|
return cls(segment_id, segment_text)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -89,7 +102,10 @@ def alto_extract(tree):
|
||||||
' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
|
' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
|
||||||
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
|
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
|
||||||
|
|
||||||
return ExtractedText((ExtractedTextSegment(None, line_text) for line_text in lines), '\n')
|
return ExtractedText(
|
||||||
|
(ExtractedTextSegment(None, normalize_sbb(line_text)) for line_text in lines),
|
||||||
|
'\n'
|
||||||
|
)
|
||||||
# TODO This currently does not extract any segment id, because we are
|
# TODO This currently does not extract any segment id, because we are
|
||||||
# clueless about the ALTO format.
|
# clueless about the ALTO format.
|
||||||
# FIXME needs to handle normalization
|
# FIXME needs to handle normalization
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue