From 1077dc64ce918315ccafa49a26196296d86f32ab Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Oct 2020 13:25:20 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9E=A1=EF=B8=8F=20dinglehopper:=20Move=20Ext?= =?UTF-8?q?ractedText=20to=20its=20own=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/character_error_rate.py | 3 +- qurator/dinglehopper/cli.py | 1 + qurator/dinglehopper/edit_distance.py | 3 +- qurator/dinglehopper/extracted_text.py | 118 ++++++++++++++++++ qurator/dinglehopper/ocr_files.py | 121 +------------------ qurator/dinglehopper/word_error_rate.py | 2 +- 6 files changed, 128 insertions(+), 120 deletions(-) create mode 100644 qurator/dinglehopper/extracted_text.py diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 998a3c2..055d6de 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -7,7 +7,8 @@ from multimethod import multimethod from uniseg.graphemecluster import grapheme_clusters from qurator.dinglehopper.edit_distance import distance -from qurator.dinglehopper.ocr_files import ExtractedText +from qurator.dinglehopper.extracted_text import ExtractedText + @multimethod def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]: diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 9c963c1..f568399 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -6,6 +6,7 @@ from markupsafe import escape from qurator.dinglehopper import * +from qurator.dinglehopper import ExtractedText def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index ed91443..e91d063 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -8,7 +8,8 @@ import numpy as np from multimethod import multimethod from uniseg.graphemecluster import grapheme_clusters -from .ocr_files import ExtractedText +from . import ExtractedText + def levenshtein_matrix(seq1: Sequence, seq2: Sequence): """Compute the matrix commonly computed to produce the Levenshtein distance. diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py new file mode 100644 index 0000000..6dcd921 --- /dev/null +++ b/qurator/dinglehopper/extracted_text.py @@ -0,0 +1,118 @@ +import enum +import re +import unicodedata +from contextlib import suppress +from itertools import repeat +from typing import Optional + +import attr + +from .substitute_equivalences import substitute_equivalences + + +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 # TODO + NFC_SBB = 3 + + +def normalize(text, normalization): + if normalization == Normalization.NFC: + return unicodedata.normalize('NFC', text) + if normalization == Normalization.NFC_MUFI: + raise NotImplementedError() + if normalization == Normalization.NFC_SBB: + return substitute_equivalences(text) + else: + raise ValueError() + + +# XXX hack +def normalize_sbb(t): + return normalize(t, Normalization.NFC_SBB) + + +@attr.s(frozen=True) +class ExtractedText: + """ + Extracted text + + Objects of this class are guaranteed to be a. always in their normalization and + b. in NFC. + """ + segment_id = attr.ib(type=Optional[str]) + + @segment_id.validator + def check(self, _, value): + if value is None: + return + if not re.match(r'[\w\d_-]+', value): + raise ValueError('Malformed segment id "{}"'.format(value)) + + # An object contains either + # a. _text itself + # b. or segments (ExtractedText) and a joiner + + segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list)) + joiner = attr.ib(type=Optional[str]) + _text = attr.ib(type=Optional[str]) + + @segments.validator + def check(self, _, value): + if value is not None and self._text is not None: + raise ValueError("Can't have both segments and text") + + @_text.validator + def check(self, _, value): + if value is not None and self.segments is not None: + raise ValueError("Can't have both segments and text") + if value is not None and unicodedata.normalize('NFC', value) != value: + raise ValueError('String "{}" is not in NFC.'.format(value)) + if value is not None and normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + + normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) + + @property + def text(self): + if self._text is not None: + if self._text == '': + return None + else: + return self._text + else: + return self.joiner.join(s.text for s in self.segments) + + _segment_id_for_pos = None + + def segment_id_for_pos(self, pos): + # Calculate segment ids once, on the first call + if not self._segment_id_for_pos: + segment_id_for_pos = [] + for s in self.segments: + segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) + segment_id_for_pos.extend(repeat(None, len(self.joiner))) + segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)] + # This is frozen, so we have to jump through the hoop: + object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) + assert self._segment_id_for_pos + + return self._segment_id_for_pos[pos] + + @classmethod + def from_text_segment(cls, text_segment, nsmap): + """Build an ExtractedText from a PAGE content text element""" + + segment_id = text_segment.attrib['id'] + segment_text = None + with suppress(AttributeError): + segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = segment_text or '' + segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization + segment_text = segment_text or '' + return cls(segment_id, None, None, segment_text) + + @classmethod + def from_str(cls, text, normalization=Normalization.NFC_SBB): + normalized_text = normalize(text, normalization) + return cls(None, None, None, normalized_text, normalization=normalization) \ No newline at end of file diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 11a9836..78648eb 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,126 +1,13 @@ from __future__ import division, print_function -from typing import Optional, Generator +from typing import Generator from warnings import warn +import sys from lxml import etree as ET from lxml.etree import XMLSyntaxError -from contextlib import suppress -from itertools import repeat -from .substitute_equivalences import substitute_equivalences -import sys -import attr -import enum -import unicodedata -import re - - -class Normalization(enum.Enum): - NFC = 1 - NFC_MUFI = 2 # TODO - NFC_SBB = 3 - - -@attr.s(frozen=True) -class ExtractedText: - """ - Extracted text - - Objects of this class are guaranteed to be a. always in their normalization and - b. in NFC. - """ - segment_id = attr.ib(type=Optional[str]) - - @segment_id.validator - def check(self, _, value): - if value is None: - return - if not re.match(r'[\w\d_-]+', value): - raise ValueError('Malformed segment id "{}"'.format(value)) - - # An object contains either - # a. _text itself - # b. or segments (ExtractedText) and a joiner - - segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list)) - joiner = attr.ib(type=Optional[str]) - _text = attr.ib(type=Optional[str]) - - @segments.validator - def check(self, _, value): - if value is not None and self._text is not None: - raise ValueError("Can't have both segments and text") - - @_text.validator - def check(self, _, value): - if value is not None and self.segments is not None: - raise ValueError("Can't have both segments and text") - if value is not None and unicodedata.normalize('NFC', value) != value: - raise ValueError('String "{}" is not in NFC.'.format(value)) - if value is not None and normalize(value, self.normalization) != value: - raise ValueError('String "{}" is not normalized.'.format(value)) - - normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) - - @property - def text(self): - if self._text is not None: - if self._text == '': - return None - else: - return self._text - else: - return self.joiner.join(s.text for s in self.segments) - - _segment_id_for_pos = None - - def segment_id_for_pos(self, pos): - # Calculate segment ids once, on the first call - if not self._segment_id_for_pos: - segment_id_for_pos = [] - for s in self.segments: - segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) - segment_id_for_pos.extend(repeat(None, len(self.joiner))) - segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)] - # This is frozen, so we have to jump through the hoop: - object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) - assert self._segment_id_for_pos - - return self._segment_id_for_pos[pos] - - @classmethod - def from_text_segment(cls, text_segment, nsmap): - """Build an ExtractedText from a PAGE content text element""" - - segment_id = text_segment.attrib['id'] - segment_text = None - with suppress(AttributeError): - segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text - segment_text = segment_text or '' - segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization - segment_text = segment_text or '' - return cls(segment_id, None, None, segment_text) - - @classmethod - def from_str(cls, text, normalization=Normalization.NFC_SBB): - normalized_text = normalize(text, normalization) - return cls(None, None, None, normalized_text, normalization=normalization) - - -def normalize(text, normalization): - if normalization == Normalization.NFC: - return unicodedata.normalize('NFC', text) - if normalization == Normalization.NFC_MUFI: - raise NotImplementedError() - if normalization == Normalization.NFC_SBB: - return substitute_equivalences(text) - else: - raise ValueError() - -# XXX hack -def normalize_sbb(t): - return normalize(t, Normalization.NFC_SBB) +from .extracted_text import ExtractedText, normalize_sbb def alto_namespace(tree: ET.ElementTree) -> str: @@ -192,7 +79,7 @@ def page_extract(tree): regions.append(ExtractedText.from_text_segment(region, nsmap)) # Filter empty region texts - regions = (r for r in regions if r.text is not None) + regions = [r for r in regions if r.text is not None] return ExtractedText(None, regions, '\n', None) diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index 95ea7f8..2f5a1f6 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -7,7 +7,7 @@ from multimethod import multimethod import uniseg.wordbreak from .edit_distance import levenshtein -from .ocr_files import ExtractedText +from . import ExtractedText @multimethod