Merge pull request #37 from b2m/fix-sort-with-none

Sort textlines with missing indices
pull/42/head
Mike Gerber 4 years ago committed by GitHub
commit 089f6d299e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -4,9 +4,13 @@ import unicodedata
from contextlib import suppress from contextlib import suppress
from itertools import repeat from itertools import repeat
from typing import Optional from typing import Optional
from lxml import etree as ET
import attr import attr
import numpy as np
from lxml import etree as ET
from ocrd_utils import getLogger
LOG = getLogger('processor.OcrdDinglehopperEvaluate')
class Normalization(enum.Enum): class Normalization(enum.Enum):
@ -53,7 +57,9 @@ def unjoin_ligatures(s):
'\uf534': 'us', # eMOP: Latin small ligature us '\uf534': 'us', # eMOP: Latin small ligature us
'\uf535': 'Qu', # eMOP: Latin ligature capital Q small u '\uf535': 'Qu', # eMOP: Latin ligature capital Q small u
'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ 'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? '\uE8BF': 'q&',
# MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
# XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
'': 'st', # U+FB06 LATIN SMALL LIGATURE ST '': 'st', # U+FB06 LATIN SMALL LIGATURE ST
} }
@ -178,27 +184,6 @@ class ExtractedText:
def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'): def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
"""Build an ExtractedText from a PAGE content text element""" """Build an ExtractedText from a PAGE content text element"""
def invert_dict(d):
"""Invert the given dict"""
return {v: k for k, v in d.items()}
def get_textequiv_unicode(s):
"""Get the TextEquiv/Unicode text of the given PAGE text element"""
textequivs = s.findall('./page:TextEquiv', namespaces=nsmap)
if not textequivs:
return None
def get_index(te):
index = te.attrib.get('index')
try:
return int(index)
except TypeError:
return None
textequivs = sorted(textequivs, key=get_index)
return textequivs[0].find('./page:Unicode', namespaces=nsmap).text
localname_for_textequiv_level = { localname_for_textequiv_level = {
'region': 'TextRegion', 'region': 'TextRegion',
'line': 'TextLine' 'line': 'TextLine'
@ -216,9 +201,9 @@ class ExtractedText:
if localname == localname_for_textequiv_level[textequiv_level]: if localname == localname_for_textequiv_level[textequiv_level]:
segment_text = None segment_text = None
with suppress(AttributeError): with suppress(AttributeError):
segment_text = get_textequiv_unicode(text_segment) segment_text = get_textequiv_unicode(text_segment, nsmap)
segment_text = segment_text or '' # FIXME hardcoded SBB normalization
segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization segment_text = normalize_sbb(segment_text)
segment_text = segment_text or '' segment_text = segment_text or ''
return cls(segment_id, None, None, segment_text) return cls(segment_id, None, None, segment_text)
else: else:
@ -226,7 +211,8 @@ class ExtractedText:
sub_localname = children_for_localname[localname] sub_localname = children_for_localname[localname]
sub_textequiv_level = textequiv_level_for_localname[sub_localname] sub_textequiv_level = textequiv_level_for_localname[sub_localname]
segments = [] segments = []
for sub_segment in text_segment.iterfind('./page:%s' % sub_localname, namespaces=nsmap): for sub_segment in text_segment.iterfind('./page:%s' % sub_localname,
namespaces=nsmap):
segments.append( segments.append(
ExtractedText.from_text_segment( ExtractedText.from_text_segment(
sub_segment, nsmap, sub_segment, nsmap,
@ -235,8 +221,63 @@ class ExtractedText:
joiner = joiner_for_textequiv_level[sub_textequiv_level] joiner = joiner_for_textequiv_level[sub_textequiv_level]
return cls(segment_id, segments, joiner, None) return cls(segment_id, segments, joiner, None)
@classmethod @classmethod
def from_str(cls, text, normalization=Normalization.NFC_SBB): def from_str(cls, text, normalization=Normalization.NFC_SBB):
normalized_text = normalize(text, normalization) normalized_text = normalize(text, normalization)
return cls(None, None, None, normalized_text, normalization=normalization) return cls(None, None, None, normalized_text, normalization=normalization)
def invert_dict(d):
    """Return a new dict that maps each value of ``d`` back to its key.

    When several keys share the same value, the key that comes last in
    iteration order is the one kept for that value.
    """
    # values() and keys() iterate in the same order, so zip pairs them up.
    return dict(zip(d.values(), d.keys()))
def get_textequiv_unicode(text_segment, nsmap) -> str:
    """Return the Unicode text of the preferred TextEquiv of a PAGE element.

    The direct ``page:TextEquiv`` children of ``text_segment`` are collected
    and ``get_first_textequiv`` chooses which one to use. The text of its
    ``page:Unicode`` child is returned.

    An empty string is returned when the element has no TextEquiv child or
    when the selected Unicode element carries no text.

    NOTE(review): if the selected TextEquiv has no ``page:Unicode`` child,
    ``find`` presumably yields ``None`` and the ``.text`` access raises
    AttributeError -- confirm that callers guard against this.
    """
    # Looked up before anything else; it is only used to label log messages.
    segment_id = text_segment.attrib['id']

    candidates = text_segment.findall('./page:TextEquiv', namespaces=nsmap)
    if not candidates:
        return ''

    chosen = get_first_textequiv(candidates, segment_id)
    unicode_text = chosen.find('./page:Unicode', namespaces=nsmap).text
    return unicode_text or ''
def get_first_textequiv(textequivs, segment_id):
    """Select the preferred TextEquiv out of ``textequivs``.

    Selection rules, applied in order:

    1. A single candidate is returned as is.
    2. If at least one candidate has a numeric ``index`` attribute, the one
       with the lowest index wins. A warning is logged when some candidates
       lack an index.
    3. Otherwise, if at least one candidate has a numeric ``conf``
       attribute, the one with the highest confidence wins.
    4. Otherwise the first candidate is used and a warning is logged.

    ``segment_id`` is only used to label the log messages.
    """
    if len(textequivs) == 1:
        return textequivs[0]

    # Preferred ordering: the "index" attribute (NaN marks a missing index).
    index_values = np.array([get_attr(te, 'index') for te in textequivs], dtype=float)
    missing_index = np.isnan(index_values)
    if not np.all(missing_index):
        if np.any(missing_index):
            LOG.warning("TextEquiv without index in %s.", segment_id)
        # nanargmin ignores the NaN entries of candidates without an index.
        return textequivs[np.nanargmin(index_values)]

    # Second choice: the "conf" attribute, highest confidence first.
    conf_values = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float)
    if not np.all(np.isnan(conf_values)):
        LOG.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
                 segment_id)
        return textequivs[np.nanargmax(conf_values)]

    # Neither index nor conf is usable: fall back to document order.
    LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
    return textequivs[0]
def get_attr(te, attr_name) -> float:
    """Extract the attribute with the given name as a float.

    Note: currently only handles numeric values!

    Missing attributes and values that cannot be parsed as a number are
    encoded as ``np.nan``.

    :param te: element exposing a mapping-like ``attrib`` with a ``get`` method
    :param attr_name: name of the attribute to read
    :return: the attribute value as float, or ``np.nan``
    """
    attr_value = te.attrib.get(attr_name)
    try:
        return float(attr_value)
    except (TypeError, ValueError):
        # TypeError: attribute is absent (value is None).
        # ValueError: attribute is present but not numeric (e.g. "abc" or "").
        return np.nan

@ -1,8 +1,10 @@
import logging
import unicodedata import unicodedata
import pytest
from uniseg.graphemecluster import grapheme_clusters
from collections import namedtuple from collections import namedtuple
import pytest
from lxml import etree as ET from lxml import etree as ET
from uniseg.graphemecluster import grapheme_clusters
from .. import seq_align, ExtractedText from .. import seq_align, ExtractedText
@ -45,12 +47,17 @@ def test_align():
test2 = ExtractedText(None, [ test2 = ExtractedText(None, [
ExtractedText('x0', None, None, 'foo'), ExtractedText('x0', None, None, 'foo'),
ExtractedText('x1', None, None, 'bar'), ExtractedText('x1', None, None, 'bar'),
ExtractedText('x2', None, None, '.'), # extra . # extra .
ExtractedText('x3', None, None, 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters ExtractedText('x2', None, None, '.'),
# deletion + different grapheme cluster, m̃ also is two Python characters
ExtractedText('x3', None, None, 'bazim̃ga'),
], ' ', None) ], ' ', None)
left_pos = 0; right_pos = 0; alignment = [] left_pos = 0
for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): right_pos = 0
alignment = []
for left, right in seq_align(grapheme_clusters(test1.text),
grapheme_clusters(test2.text)):
left_id = test1.segment_id_for_pos(left_pos) if left is not None else None left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
right_id = test2.segment_id_for_pos(right_pos) if right is not None else None right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
el = AlignmentElement(left, right, left_id, right_id) el = AlignmentElement(left, right, left_id, right_id)
@ -69,27 +76,43 @@ def test_align():
assert alignment[15] == ('n', '', 's2', 'x3') assert alignment[15] == ('n', '', 's2', 'x3')
def test_textequiv_index(): @pytest.mark.parametrize("attributes,expected_index,expected_log", [
""" ([], None, None),
Test that extracting text from a PAGE TextEquiv honors the "index". (['index="0"'], 0, None),
""" ([''], 0, None),
(['conf="0.5"'], 0, None),
(['index="1"', 'index="0"'], 1, None),
(['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
(['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
"No index attributes, use 'conf' attribute to sort TextEquiv"),
(['index="0"', ''], 0, "TextEquiv without index"),
(['', 'conf="0.4"'], 1,
"No index attributes, use 'conf' attribute to sort TextEquiv"),
(['', ''], 0, "No index attributes, use first TextEquiv"),
])
def test_textequiv(attributes, expected_index, expected_log, caplog):
"""Test that extracting text from a PAGE TextEquiv is working without index attr."""
caplog.set_level(logging.INFO)
xml = "<?xml version=\"1.0\"?>"
ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]
equiv = ["<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
for i, attr in enumerate(attributes)]
textline = "{0}<TextLine id=\"l3\" xmlns=\"{1}\">{2}</TextLine>"
textline = textline.format(xml, ns, ''.join(equiv))
# This example textline has two TextEquivs, the one with the lowest index
# should be used. The XML order of the TextEquivs is deliberately not
# in index order.
textline="""<?xml version="1.0"?>
<TextLine id="l3" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
<TextEquiv index="1">
<Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
</TextEquiv>
<TextEquiv index="0">
<Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
</TextEquiv>
</TextLine>
"""
root = ET.fromstring(textline) root = ET.fromstring(textline)
nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" } result = ExtractedText.from_text_segment(root,
result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text {'page': ns},
expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-" textequiv_level='line').text
if expected_index is None:
assert expected == result assert not result
else:
assert result == text[expected_index]
if expected_log is None:
assert "no_index" not in caplog.text
else:
assert expected_log in caplog.text

Loading…
Cancel
Save