➡️ dinglehopper: Move ExtractedText to its own file

pull/38/head
Gerber, Mike 4 years ago
parent 9dd4ff0aae
commit 1077dc64ce

@ -7,7 +7,8 @@ from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from qurator.dinglehopper.edit_distance import distance
from qurator.dinglehopper.ocr_files import ExtractedText
from qurator.dinglehopper.extracted_text import ExtractedText
@multimethod
def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:

@ -6,6 +6,7 @@ from markupsafe import escape
from qurator.dinglehopper import *
from qurator.dinglehopper import ExtractedText
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):

@ -8,7 +8,8 @@ import numpy as np
from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from .ocr_files import ExtractedText
from . import ExtractedText
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
"""Compute the matrix commonly computed to produce the Levenshtein distance.

@ -0,0 +1,118 @@
import enum
import re
import unicodedata
from contextlib import suppress
from itertools import repeat
from typing import Optional
import attr
from .substitute_equivalences import substitute_equivalences
class Normalization(enum.Enum):
NFC = 1
NFC_MUFI = 2 # TODO
NFC_SBB = 3
def normalize(text, normalization):
if normalization == Normalization.NFC:
return unicodedata.normalize('NFC', text)
if normalization == Normalization.NFC_MUFI:
raise NotImplementedError()
if normalization == Normalization.NFC_SBB:
return substitute_equivalences(text)
else:
raise ValueError()
# XXX hack
def normalize_sbb(t):
return normalize(t, Normalization.NFC_SBB)
@attr.s(frozen=True)
class ExtractedText:
"""
Extracted text
Objects of this class are guaranteed to be a. always in their normalization and
b. in NFC.
"""
segment_id = attr.ib(type=Optional[str])
@segment_id.validator
def check(self, _, value):
if value is None:
return
if not re.match(r'[\w\d_-]+', value):
raise ValueError('Malformed segment id "{}"'.format(value))
# An object contains either
# a. _text itself
# b. or segments (ExtractedText) and a joiner
segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
joiner = attr.ib(type=Optional[str])
_text = attr.ib(type=Optional[str])
@segments.validator
def check(self, _, value):
if value is not None and self._text is not None:
raise ValueError("Can't have both segments and text")
@_text.validator
def check(self, _, value):
if value is not None and self.segments is not None:
raise ValueError("Can't have both segments and text")
if value is not None and unicodedata.normalize('NFC', value) != value:
raise ValueError('String "{}" is not in NFC.'.format(value))
if value is not None and normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value))
normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
@property
def text(self):
if self._text is not None:
if self._text == '':
return None
else:
return self._text
else:
return self.joiner.join(s.text for s in self.segments)
_segment_id_for_pos = None
def segment_id_for_pos(self, pos):
# Calculate segment ids once, on the first call
if not self._segment_id_for_pos:
segment_id_for_pos = []
for s in self.segments:
segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
# This is frozen, so we have to jump through the hoop:
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
assert self._segment_id_for_pos
return self._segment_id_for_pos[pos]
@classmethod
def from_text_segment(cls, text_segment, nsmap):
"""Build an ExtractedText from a PAGE content text element"""
segment_id = text_segment.attrib['id']
segment_text = None
with suppress(AttributeError):
segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
segment_text = segment_text or ''
segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization
segment_text = segment_text or ''
return cls(segment_id, None, None, segment_text)
@classmethod
def from_str(cls, text, normalization=Normalization.NFC_SBB):
normalized_text = normalize(text, normalization)
return cls(None, None, None, normalized_text, normalization=normalization)

@ -1,126 +1,13 @@
from __future__ import division, print_function
from typing import Optional, Generator
from typing import Generator
from warnings import warn
import sys
from lxml import etree as ET
from lxml.etree import XMLSyntaxError
from contextlib import suppress
from itertools import repeat
from .substitute_equivalences import substitute_equivalences
import sys
import attr
import enum
import unicodedata
import re
class Normalization(enum.Enum):
NFC = 1
NFC_MUFI = 2 # TODO
NFC_SBB = 3
@attr.s(frozen=True)
class ExtractedText:
"""
Extracted text
Objects of this class are guaranteed to be a. always in their normalization and
b. in NFC.
"""
segment_id = attr.ib(type=Optional[str])
@segment_id.validator
def check(self, _, value):
if value is None:
return
if not re.match(r'[\w\d_-]+', value):
raise ValueError('Malformed segment id "{}"'.format(value))
# An object contains either
# a. _text itself
# b. or segments (ExtractedText) and a joiner
segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
joiner = attr.ib(type=Optional[str])
_text = attr.ib(type=Optional[str])
@segments.validator
def check(self, _, value):
if value is not None and self._text is not None:
raise ValueError("Can't have both segments and text")
@_text.validator
def check(self, _, value):
if value is not None and self.segments is not None:
raise ValueError("Can't have both segments and text")
if value is not None and unicodedata.normalize('NFC', value) != value:
raise ValueError('String "{}" is not in NFC.'.format(value))
if value is not None and normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value))
normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
@property
def text(self):
if self._text is not None:
if self._text == '':
return None
else:
return self._text
else:
return self.joiner.join(s.text for s in self.segments)
_segment_id_for_pos = None
def segment_id_for_pos(self, pos):
# Calculate segment ids once, on the first call
if not self._segment_id_for_pos:
segment_id_for_pos = []
for s in self.segments:
segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
# This is frozen, so we have to jump through the hoop:
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
assert self._segment_id_for_pos
return self._segment_id_for_pos[pos]
@classmethod
def from_text_segment(cls, text_segment, nsmap):
"""Build an ExtractedText from a PAGE content text element"""
segment_id = text_segment.attrib['id']
segment_text = None
with suppress(AttributeError):
segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
segment_text = segment_text or ''
segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization
segment_text = segment_text or ''
return cls(segment_id, None, None, segment_text)
@classmethod
def from_str(cls, text, normalization=Normalization.NFC_SBB):
normalized_text = normalize(text, normalization)
return cls(None, None, None, normalized_text, normalization=normalization)
def normalize(text, normalization):
if normalization == Normalization.NFC:
return unicodedata.normalize('NFC', text)
if normalization == Normalization.NFC_MUFI:
raise NotImplementedError()
if normalization == Normalization.NFC_SBB:
return substitute_equivalences(text)
else:
raise ValueError()
# XXX hack
def normalize_sbb(t):
return normalize(t, Normalization.NFC_SBB)
from .extracted_text import ExtractedText, normalize_sbb
def alto_namespace(tree: ET.ElementTree) -> str:
@ -192,7 +79,7 @@ def page_extract(tree):
regions.append(ExtractedText.from_text_segment(region, nsmap))
# Filter empty region texts
regions = (r for r in regions if r.text is not None)
regions = [r for r in regions if r.text is not None]
return ExtractedText(None, regions, '\n', None)

@ -7,7 +7,7 @@ from multimethod import multimethod
import uniseg.wordbreak
from .edit_distance import levenshtein
from .ocr_files import ExtractedText
from . import ExtractedText
@multimethod

Loading…
Cancel
Save