mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-13 04:19:59 +02:00
➡️ dinglehopper: Move ExtractedText to its own file
This commit is contained in:
parent
9dd4ff0aae
commit
1077dc64ce
6 changed files with 128 additions and 120 deletions
|
@ -7,7 +7,8 @@ from multimethod import multimethod
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
from qurator.dinglehopper.edit_distance import distance
|
from qurator.dinglehopper.edit_distance import distance
|
||||||
from qurator.dinglehopper.ocr_files import ExtractedText
|
from qurator.dinglehopper.extracted_text import ExtractedText
|
||||||
|
|
||||||
|
|
||||||
@multimethod
|
@multimethod
|
||||||
def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
|
def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
|
||||||
|
|
|
@ -6,6 +6,7 @@ from markupsafe import escape
|
||||||
|
|
||||||
|
|
||||||
from qurator.dinglehopper import *
|
from qurator.dinglehopper import *
|
||||||
|
from qurator.dinglehopper import ExtractedText
|
||||||
|
|
||||||
|
|
||||||
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
||||||
|
|
|
@ -8,7 +8,8 @@ import numpy as np
|
||||||
from multimethod import multimethod
|
from multimethod import multimethod
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
from .ocr_files import ExtractedText
|
from . import ExtractedText
|
||||||
|
|
||||||
|
|
||||||
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
|
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
|
||||||
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
||||||
|
|
118
qurator/dinglehopper/extracted_text.py
Normal file
118
qurator/dinglehopper/extracted_text.py
Normal file
|
@ -0,0 +1,118 @@
|
||||||
|
import enum
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
from contextlib import suppress
|
||||||
|
from itertools import repeat
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import attr
|
||||||
|
|
||||||
|
from .substitute_equivalences import substitute_equivalences
|
||||||
|
|
||||||
|
|
||||||
|
class Normalization(enum.Enum):
    """Text normalization schemes supported by normalize()."""
    NFC = 1
    NFC_MUFI = 2  # TODO
    NFC_SBB = 3


def normalize(text, normalization):
    """Return *text* normalized according to *normalization*.

    Raises NotImplementedError for NFC_MUFI (not implemented yet) and
    ValueError for any unknown normalization.
    """
    if normalization == Normalization.NFC:
        return unicodedata.normalize('NFC', text)
    elif normalization == Normalization.NFC_MUFI:
        raise NotImplementedError()
    elif normalization == Normalization.NFC_SBB:
        return substitute_equivalences(text)
    else:
        raise ValueError()
|
||||||
|
|
||||||
|
|
||||||
|
# XXX hack
def normalize_sbb(t):
    """Shortcut: normalize *t* with the (hardcoded) SBB normalization."""
    return normalize(t, Normalization.NFC_SBB)
|
||||||
|
|
||||||
|
|
||||||
|
@attr.s(frozen=True)
class ExtractedText:
    """
    Extracted text

    Objects of this class are guaranteed to be a. always in their normalization and
    b. in NFC.
    """
    # Id of the segment this text was extracted from (e.g. a PAGE element id);
    # None is allowed, e.g. for the aggregate object built from segments.
    segment_id = attr.ib(type=Optional[str])

    # NOTE(review): all three validators below share the name "check". attrs
    # registers each function at decoration time, so validation still runs for
    # every attribute even though the later defs shadow the class attribute.
    @segment_id.validator
    def check(self, _, value):
        if value is None:
            return
        if not re.match(r'[\w\d_-]+', value):
            raise ValueError('Malformed segment id "{}"'.format(value))

    # An object contains either
    # a. _text itself
    # b. or segments (ExtractedText) and a joiner

    segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
    joiner = attr.ib(type=Optional[str])
    _text = attr.ib(type=Optional[str])

    @segments.validator
    def check(self, _, value):
        # segments and _text are mutually exclusive (see comment above).
        if value is not None and self._text is not None:
            raise ValueError("Can't have both segments and text")

    @_text.validator
    def check(self, _, value):
        if value is not None and self.segments is not None:
            raise ValueError("Can't have both segments and text")
        # Enforce the class guarantee: _text must already be in NFC ...
        if value is not None and unicodedata.normalize('NFC', value) != value:
            raise ValueError('String "{}" is not in NFC.'.format(value))
        # ... and must be a fixed point of the chosen normalization.
        if value is not None and normalize(value, self.normalization) != value:
            raise ValueError('String "{}" is not normalized.'.format(value))

    normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)

    @property
    def text(self):
        # Own text wins; an empty own text is reported as None. Otherwise the
        # text is assembled by joining the segments' texts with the joiner.
        if self._text is not None:
            if self._text == '':
                return None
            else:
                return self._text
        else:
            return self.joiner.join(s.text for s in self.segments)

    # Lazily built cache: maps a character position in text to the owning
    # segment's id (class-level default; filled per instance on first use).
    _segment_id_for_pos = None

    def segment_id_for_pos(self, pos):
        # Returns the segment id owning character position pos; positions that
        # fall inside a joiner between segments map to None.
        # Calculate segment ids once, on the first call
        if not self._segment_id_for_pos:
            segment_id_for_pos = []
            for s in self.segments:
                segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
                segment_id_for_pos.extend(repeat(None, len(self.joiner)))
            # Drop the joiner entries appended after the last segment.
            segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
            # This is frozen, so we have to jump through the hoop:
            object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
            assert self._segment_id_for_pos

        return self._segment_id_for_pos[pos]

    @classmethod
    def from_text_segment(cls, text_segment, nsmap):
        """Build an ExtractedText from a PAGE content text element"""

        segment_id = text_segment.attrib['id']
        segment_text = None
        # find() returns None when there is no TextEquiv/Unicode child; the
        # .text access then raises AttributeError and segment_text stays None.
        with suppress(AttributeError):
            segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
            segment_text = segment_text or ''
            segment_text = normalize_sbb(segment_text)  # FIXME hardcoded SBB normalization
        segment_text = segment_text or ''
        return cls(segment_id, None, None, segment_text)

    @classmethod
    def from_str(cls, text, normalization=Normalization.NFC_SBB):
        """Build an ExtractedText from a plain string, normalizing it first."""
        normalized_text = normalize(text, normalization)
        return cls(None, None, None, normalized_text, normalization=normalization)
|
|
@ -1,126 +1,13 @@
|
||||||
from __future__ import division, print_function
|
from __future__ import division, print_function
|
||||||
|
|
||||||
from typing import Optional, Generator
|
from typing import Generator
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
|
import sys
|
||||||
|
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from lxml.etree import XMLSyntaxError
|
from lxml.etree import XMLSyntaxError
|
||||||
from contextlib import suppress
|
|
||||||
from itertools import repeat
|
|
||||||
from .substitute_equivalences import substitute_equivalences
|
|
||||||
import sys
|
|
||||||
import attr
|
|
||||||
import enum
|
|
||||||
import unicodedata
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
from .extracted_text import ExtractedText, normalize_sbb
|
||||||
class Normalization(enum.Enum):
    """Text normalization schemes supported by normalize()."""
    NFC = 1
    NFC_MUFI = 2  # TODO
    NFC_SBB = 3
|
|
||||||
|
|
||||||
|
|
||||||
@attr.s(frozen=True)
class ExtractedText:
    """
    Extracted text

    Objects of this class are guaranteed to be a. always in their normalization and
    b. in NFC.
    """
    # Id of the segment this text was extracted from (e.g. a PAGE element id);
    # None is allowed, e.g. for the aggregate object built from segments.
    segment_id = attr.ib(type=Optional[str])

    # NOTE(review): all three validators below share the name "check". attrs
    # registers each function at decoration time, so validation still runs for
    # every attribute even though the later defs shadow the class attribute.
    @segment_id.validator
    def check(self, _, value):
        if value is None:
            return
        if not re.match(r'[\w\d_-]+', value):
            raise ValueError('Malformed segment id "{}"'.format(value))

    # An object contains either
    # a. _text itself
    # b. or segments (ExtractedText) and a joiner

    segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
    joiner = attr.ib(type=Optional[str])
    _text = attr.ib(type=Optional[str])

    @segments.validator
    def check(self, _, value):
        # segments and _text are mutually exclusive (see comment above).
        if value is not None and self._text is not None:
            raise ValueError("Can't have both segments and text")

    @_text.validator
    def check(self, _, value):
        if value is not None and self.segments is not None:
            raise ValueError("Can't have both segments and text")
        # Enforce the class guarantee: _text must already be in NFC ...
        if value is not None and unicodedata.normalize('NFC', value) != value:
            raise ValueError('String "{}" is not in NFC.'.format(value))
        # ... and must be a fixed point of the chosen normalization.
        if value is not None and normalize(value, self.normalization) != value:
            raise ValueError('String "{}" is not normalized.'.format(value))

    normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)

    @property
    def text(self):
        # Own text wins; an empty own text is reported as None. Otherwise the
        # text is assembled by joining the segments' texts with the joiner.
        if self._text is not None:
            if self._text == '':
                return None
            else:
                return self._text
        else:
            return self.joiner.join(s.text for s in self.segments)

    # Lazily built cache: maps a character position in text to the owning
    # segment's id (class-level default; filled per instance on first use).
    _segment_id_for_pos = None

    def segment_id_for_pos(self, pos):
        # Returns the segment id owning character position pos; positions that
        # fall inside a joiner between segments map to None.
        # Calculate segment ids once, on the first call
        if not self._segment_id_for_pos:
            segment_id_for_pos = []
            for s in self.segments:
                segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
                segment_id_for_pos.extend(repeat(None, len(self.joiner)))
            # Drop the joiner entries appended after the last segment.
            segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
            # This is frozen, so we have to jump through the hoop:
            object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
            assert self._segment_id_for_pos

        return self._segment_id_for_pos[pos]

    @classmethod
    def from_text_segment(cls, text_segment, nsmap):
        """Build an ExtractedText from a PAGE content text element"""

        segment_id = text_segment.attrib['id']
        segment_text = None
        # find() returns None when there is no TextEquiv/Unicode child; the
        # .text access then raises AttributeError and segment_text stays None.
        with suppress(AttributeError):
            segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
            segment_text = segment_text or ''
            segment_text = normalize_sbb(segment_text)  # FIXME hardcoded SBB normalization
        segment_text = segment_text or ''
        return cls(segment_id, None, None, segment_text)

    @classmethod
    def from_str(cls, text, normalization=Normalization.NFC_SBB):
        """Build an ExtractedText from a plain string, normalizing it first."""
        normalized_text = normalize(text, normalization)
        return cls(None, None, None, normalized_text, normalization=normalization)
|
|
||||||
|
|
||||||
|
|
||||||
def normalize(text, normalization):
    """Return *text* normalized according to *normalization*.

    Raises NotImplementedError for NFC_MUFI (not implemented yet) and
    ValueError for any unknown normalization.
    """
    if normalization == Normalization.NFC:
        return unicodedata.normalize('NFC', text)
    elif normalization == Normalization.NFC_MUFI:
        raise NotImplementedError()
    elif normalization == Normalization.NFC_SBB:
        return substitute_equivalences(text)
    else:
        raise ValueError()
|
|
||||||
|
|
||||||
|
|
||||||
# XXX hack
def normalize_sbb(t):
    """Shortcut: normalize *t* with the (hardcoded) SBB normalization."""
    return normalize(t, Normalization.NFC_SBB)
|
|
||||||
|
|
||||||
|
|
||||||
def alto_namespace(tree: ET.ElementTree) -> str:
|
def alto_namespace(tree: ET.ElementTree) -> str:
|
||||||
|
@ -192,7 +79,7 @@ def page_extract(tree):
|
||||||
regions.append(ExtractedText.from_text_segment(region, nsmap))
|
regions.append(ExtractedText.from_text_segment(region, nsmap))
|
||||||
|
|
||||||
# Filter empty region texts
|
# Filter empty region texts
|
||||||
regions = (r for r in regions if r.text is not None)
|
regions = [r for r in regions if r.text is not None]
|
||||||
|
|
||||||
return ExtractedText(None, regions, '\n', None)
|
return ExtractedText(None, regions, '\n', None)
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@ from multimethod import multimethod
|
||||||
import uniseg.wordbreak
|
import uniseg.wordbreak
|
||||||
|
|
||||||
from .edit_distance import levenshtein
|
from .edit_distance import levenshtein
|
||||||
from .ocr_files import ExtractedText
|
from . import ExtractedText
|
||||||
|
|
||||||
|
|
||||||
@multimethod
|
@multimethod
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue