mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-01 14:40:00 +02:00
Merge pull request #37 from b2m/fix-sort-with-none
Sort textlines with missing indices
This commit is contained in:
commit
089f6d299e
2 changed files with 134 additions and 70 deletions
|
@ -4,9 +4,13 @@ import unicodedata
|
||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
from itertools import repeat
|
from itertools import repeat
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from lxml import etree as ET
|
|
||||||
|
|
||||||
import attr
|
import attr
|
||||||
|
import numpy as np
|
||||||
|
from lxml import etree as ET
|
||||||
|
from ocrd_utils import getLogger
|
||||||
|
|
||||||
|
LOG = getLogger('processor.OcrdDinglehopperEvaluate')
|
||||||
|
|
||||||
|
|
||||||
class Normalization(enum.Enum):
|
class Normalization(enum.Enum):
|
||||||
|
@ -53,7 +57,9 @@ def unjoin_ligatures(s):
|
||||||
'\uf534': 'us', # eMOP: Latin small ligature us
|
'\uf534': 'us', # eMOP: Latin small ligature us
|
||||||
'\uf535': 'Qu', # eMOP: Latin ligature capital Q small u
|
'\uf535': 'Qu', # eMOP: Latin ligature capital Q small u
|
||||||
'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
|
'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
|
||||||
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
|
'\uE8BF': 'q&',
|
||||||
|
# MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
|
||||||
|
# XXX How to replace this correctly?
|
||||||
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
|
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
|
||||||
'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST
|
'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST
|
||||||
}
|
}
|
||||||
|
@ -178,27 +184,6 @@ class ExtractedText:
|
||||||
def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
|
def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
|
||||||
"""Build an ExtractedText from a PAGE content text element"""
|
"""Build an ExtractedText from a PAGE content text element"""
|
||||||
|
|
||||||
def invert_dict(d):
|
|
||||||
"""Invert the given dict"""
|
|
||||||
return {v: k for k, v in d.items()}
|
|
||||||
|
|
||||||
def get_textequiv_unicode(s):
|
|
||||||
"""Get the TextEquiv/Unicode text of the given PAGE text element"""
|
|
||||||
textequivs = s.findall('./page:TextEquiv', namespaces=nsmap)
|
|
||||||
|
|
||||||
if not textequivs:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_index(te):
|
|
||||||
index = te.attrib.get('index')
|
|
||||||
try:
|
|
||||||
return int(index)
|
|
||||||
except TypeError:
|
|
||||||
return None
|
|
||||||
textequivs = sorted(textequivs, key=get_index)
|
|
||||||
|
|
||||||
return textequivs[0].find('./page:Unicode', namespaces=nsmap).text
|
|
||||||
|
|
||||||
localname_for_textequiv_level = {
|
localname_for_textequiv_level = {
|
||||||
'region': 'TextRegion',
|
'region': 'TextRegion',
|
||||||
'line': 'TextLine'
|
'line': 'TextLine'
|
||||||
|
@ -216,9 +201,9 @@ class ExtractedText:
|
||||||
if localname == localname_for_textequiv_level[textequiv_level]:
|
if localname == localname_for_textequiv_level[textequiv_level]:
|
||||||
segment_text = None
|
segment_text = None
|
||||||
with suppress(AttributeError):
|
with suppress(AttributeError):
|
||||||
segment_text = get_textequiv_unicode(text_segment)
|
segment_text = get_textequiv_unicode(text_segment, nsmap)
|
||||||
segment_text = segment_text or ''
|
# FIXME hardcoded SBB normalization
|
||||||
segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization
|
segment_text = normalize_sbb(segment_text)
|
||||||
segment_text = segment_text or ''
|
segment_text = segment_text or ''
|
||||||
return cls(segment_id, None, None, segment_text)
|
return cls(segment_id, None, None, segment_text)
|
||||||
else:
|
else:
|
||||||
|
@ -226,7 +211,8 @@ class ExtractedText:
|
||||||
sub_localname = children_for_localname[localname]
|
sub_localname = children_for_localname[localname]
|
||||||
sub_textequiv_level = textequiv_level_for_localname[sub_localname]
|
sub_textequiv_level = textequiv_level_for_localname[sub_localname]
|
||||||
segments = []
|
segments = []
|
||||||
for sub_segment in text_segment.iterfind('./page:%s' % sub_localname, namespaces=nsmap):
|
for sub_segment in text_segment.iterfind('./page:%s' % sub_localname,
|
||||||
|
namespaces=nsmap):
|
||||||
segments.append(
|
segments.append(
|
||||||
ExtractedText.from_text_segment(
|
ExtractedText.from_text_segment(
|
||||||
sub_segment, nsmap,
|
sub_segment, nsmap,
|
||||||
|
@ -235,8 +221,63 @@ class ExtractedText:
|
||||||
joiner = joiner_for_textequiv_level[sub_textequiv_level]
|
joiner = joiner_for_textequiv_level[sub_textequiv_level]
|
||||||
return cls(segment_id, segments, joiner, None)
|
return cls(segment_id, segments, joiner, None)
|
||||||
|
|
||||||
|
|
||||||
@classmethod
def from_str(cls, text, normalization=Normalization.NFC_SBB):
    """Build an ExtractedText from a plain string.

    The text is normalized (default: NFC with SBB-specific rules)
    before being stored on the new instance.
    """
    return cls(None, None, None, normalize(text, normalization),
               normalization=normalization)
|
||||||
|
|
||||||
|
|
||||||
|
def invert_dict(d):
    """Invert the given dict: values become keys and keys become values.

    When several keys share a value, the last one wins (same semantics
    as the equivalent dict comprehension).
    """
    inverted = {}
    for key, value in d.items():
        inverted[value] = key
    return inverted
||||||
|
|
||||||
|
|
||||||
|
def get_textequiv_unicode(text_segment, nsmap) -> str:
    """Get the TextEquiv/Unicode text of the given PAGE text element.

    Falls back to the empty string when the element has no TextEquiv
    children or when the selected TextEquiv carries no text content.
    """
    segment_id = text_segment.attrib['id']
    candidates = text_segment.findall('./page:TextEquiv', namespaces=nsmap)
    if not candidates:
        return ''
    chosen = get_first_textequiv(candidates, segment_id)
    return chosen.find('./page:Unicode', namespaces=nsmap).text or ''
|
||||||
|
|
||||||
|
|
||||||
|
def get_first_textequiv(textequivs, segment_id):
    """Pick the TextEquiv to use for a segment.

    Preference order: lowest 'index' attribute, then highest 'conf'
    attribute, then plain document order.  Logs whenever the preferred
    ordering information is missing or only partially present.
    """
    if len(textequivs) == 1:
        return textequivs[0]

    # Prefer the explicit 'index' attribute when at least one is present.
    indices = np.array([get_attr(te, 'index') for te in textequivs], dtype=float)
    missing_index = np.isnan(indices)
    if not np.all(missing_index):
        if missing_index.any():
            LOG.warning("TextEquiv without index in %s.", segment_id)
        return textequivs[np.nanargmin(indices)]

    # No index anywhere: fall back to the highest 'conf' value.
    confidences = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float)
    if not np.all(np.isnan(confidences)):
        LOG.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
                 segment_id)
        return textequivs[np.nanargmax(confidences)]

    # Neither index nor conf present: take the first element in document order.
    LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
    return textequivs[0]
|
||||||
|
|
||||||
|
|
||||||
|
def get_attr(te, attr_name) -> float:
    """Extract the named attribute of the given element as a float.

    Note: currently only handles numeric values!
    Other or non existent values are encoded as np.nan, so callers can
    use np.isnan() to detect "no usable value".
    """
    attr_value = te.attrib.get(attr_name)
    try:
        return float(attr_value)
    except (TypeError, ValueError):
        # TypeError: attribute missing (None); ValueError: present but
        # non-numeric text.  Both count as "no usable value" per the
        # documented contract — the original caught only TypeError and
        # crashed on e.g. conf="high".
        return np.nan
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
|
import logging
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import pytest
|
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
|
|
||||||
|
import pytest
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
from .. import seq_align, ExtractedText
|
from .. import seq_align, ExtractedText
|
||||||
|
|
||||||
|
@ -45,12 +47,17 @@ def test_align():
|
||||||
test2 = ExtractedText(None, [
|
test2 = ExtractedText(None, [
|
||||||
ExtractedText('x0', None, None, 'foo'),
|
ExtractedText('x0', None, None, 'foo'),
|
||||||
ExtractedText('x1', None, None, 'bar'),
|
ExtractedText('x1', None, None, 'bar'),
|
||||||
ExtractedText('x2', None, None, '.'), # extra .
|
# extra .
|
||||||
ExtractedText('x3', None, None, 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters
|
ExtractedText('x2', None, None, '.'),
|
||||||
|
# deletion + different grapheme cluster, m̃ also is two Python characters
|
||||||
|
ExtractedText('x3', None, None, 'bazim̃ga'),
|
||||||
], ' ', None)
|
], ' ', None)
|
||||||
|
|
||||||
left_pos = 0; right_pos = 0; alignment = []
|
left_pos = 0
|
||||||
for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)):
|
right_pos = 0
|
||||||
|
alignment = []
|
||||||
|
for left, right in seq_align(grapheme_clusters(test1.text),
|
||||||
|
grapheme_clusters(test2.text)):
|
||||||
left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
|
left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
|
||||||
right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
|
right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
|
||||||
el = AlignmentElement(left, right, left_id, right_id)
|
el = AlignmentElement(left, right, left_id, right_id)
|
||||||
|
@ -69,27 +76,43 @@ def test_align():
|
||||||
assert alignment[15] == ('n', 'm̃', 's2', 'x3')
|
assert alignment[15] == ('n', 'm̃', 's2', 'x3')
|
||||||
|
|
||||||
|
|
||||||
def test_textequiv_index():
|
@pytest.mark.parametrize("attributes,expected_index,expected_log", [
    ([], None, None),
    (['index="0"'], 0, None),
    ([''], 0, None),
    (['conf="0.5"'], 0, None),
    (['index="1"', 'index="0"'], 1, None),
    (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
    (['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
     "No index attributes, use 'conf' attribute to sort TextEquiv"),
    (['index="0"', ''], 0, "TextEquiv without index"),
    (['', 'conf="0.4"'], 1,
     "No index attributes, use 'conf' attribute to sort TextEquiv"),
    (['', ''], 0, "No index attributes, use first TextEquiv"),
])
def test_textequiv(attributes, expected_index, expected_log, caplog):
    """Test that extracting text from a PAGE TextEquiv is working without index attr."""
    caplog.set_level(logging.INFO)
    xml = "<?xml version=\"1.0\"?>"
    ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
    text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]

    # Assemble a synthetic <TextLine> carrying one TextEquiv per attribute set.
    equivs = ["<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
              for i, attr in enumerate(attributes)]
    textline = "{0}<TextLine id=\"l3\" xmlns=\"{1}\">{2}</TextLine>"
    textline = textline.format(xml, ns, ''.join(equivs))

    root = ET.fromstring(textline)
    result = ExtractedText.from_text_segment(root,
                                             {'page': ns},
                                             textequiv_level='line').text

    # Without any TextEquiv the extracted text must be empty; otherwise the
    # entry at the expected position must win.
    if expected_index is None:
        assert not result
    else:
        assert result == text[expected_index]

    # The selection strategy is reflected in the log output.
    if expected_log is None:
        assert "no_index" not in caplog.text
    else:
        assert expected_log in caplog.text
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue