Fix annoying logging exceptions and encoding errors.

pull/47/head
Benjamin Rosemann 4 years ago
parent 0dd5fc0ee5
commit 84d34f5b26

@@ -1,4 +1,5 @@
import enum import enum
import logging
import re import re
import unicodedata import unicodedata
from contextlib import suppress from contextlib import suppress
@@ -8,7 +9,8 @@ from typing import Optional
import attr import attr
import numpy as np import numpy as np
from lxml import etree as ET from lxml import etree as ET
from ocrd_utils import getLogger
LOG = logging.getLogger("processor.OcrdDinglehopperEvaluate")
class Normalization(enum.Enum): class Normalization(enum.Enum):
@@ -239,7 +241,6 @@ def get_textequiv_unicode(text_segment, nsmap) -> str:
def get_first_textequiv(textequivs, segment_id): def get_first_textequiv(textequivs, segment_id):
"""Get the first TextEquiv based on index or conf order if index is not present.""" """Get the first TextEquiv based on index or conf order if index is not present."""
log = getLogger("processor.OcrdDinglehopperEvaluate")
if len(textequivs) == 1: if len(textequivs) == 1:
return textequivs[0] return textequivs[0]
@@ -248,20 +249,20 @@ def get_first_textequiv(textequivs, segment_id):
nan_mask = np.isnan(indices) nan_mask = np.isnan(indices)
if np.any(~nan_mask): if np.any(~nan_mask):
if np.any(nan_mask): if np.any(nan_mask):
log.warning("TextEquiv without index in %s.", segment_id) LOG.warning("TextEquiv without index in %s.", segment_id)
index = np.nanargmin(indices) index = np.nanargmin(indices)
else: else:
# try ordering by conf # try ordering by conf
confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float) confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
if np.any(~np.isnan(confidences)): if np.any(~np.isnan(confidences)):
log.info( LOG.info(
"No index attributes, use 'conf' attribute to sort TextEquiv in %s.", "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
segment_id, segment_id,
) )
index = np.nanargmax(confidences) index = np.nanargmax(confidences)
else: else:
# fallback to first entry in case of neither index or conf present # fallback to first entry in case of neither index or conf present
log.warning("No index attributes, use first TextEquiv in %s.", segment_id) LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
index = 0 index = 0
return textequivs[index] return textequivs[index]

@@ -125,7 +125,7 @@ def page_text(tree, *, textequiv_level="region"):
def plain_extract(filename): def plain_extract(filename):
with open(filename, "r") as f: with open(filename, "r", encoding="utf8") as f:
return ExtractedText( return ExtractedText(
None, None,
[ [

@@ -4,6 +4,7 @@ from collections import namedtuple
import pytest import pytest
from lxml import etree as ET from lxml import etree as ET
from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from .. import seq_align, ExtractedText from .. import seq_align, ExtractedText
@@ -117,6 +118,7 @@ def test_align():
) )
def test_textequiv(attributes, expected_index, expected_log, caplog): def test_textequiv(attributes, expected_index, expected_log, caplog):
"""Test that extracting text from a PAGE TextEquiv is working without index attr.""" """Test that extracting text from a PAGE TextEquiv is working without index attr."""
getLogger("processor.OcrdDinglehopperEvaluate")
caplog.set_level(logging.INFO) caplog.set_level(logging.INFO)
xml = '<?xml version="1.0"?>' xml = '<?xml version="1.0"?>'
ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
@@ -134,6 +136,7 @@ def test_textequiv(attributes, expected_index, expected_log, caplog):
result = ExtractedText.from_text_segment( result = ExtractedText.from_text_segment(
root, {"page": ns}, textequiv_level="line" root, {"page": ns}, textequiv_level="line"
).text ).text
if expected_index is None: if expected_index is None:
assert not result assert not result
else: else:

Loading…
Cancel
Save