Fix annoying logging exceptions and encoding errors.

pull/47/head
Benjamin Rosemann 4 years ago
parent 0dd5fc0ee5
commit 84d34f5b26

@ -1,4 +1,5 @@
import enum
import logging
import re
import unicodedata
from contextlib import suppress
@ -8,7 +9,8 @@ from typing import Optional
import attr
import numpy as np
from lxml import etree as ET
from ocrd_utils import getLogger
LOG = logging.getLogger("processor.OcrdDinglehopperEvaluate")
class Normalization(enum.Enum):
@ -239,7 +241,6 @@ def get_textequiv_unicode(text_segment, nsmap) -> str:
def get_first_textequiv(textequivs, segment_id):
"""Get the first TextEquiv based on index or conf order if index is not present."""
log = getLogger("processor.OcrdDinglehopperEvaluate")
if len(textequivs) == 1:
return textequivs[0]
@ -248,20 +249,20 @@ def get_first_textequiv(textequivs, segment_id):
nan_mask = np.isnan(indices)
if np.any(~nan_mask):
if np.any(nan_mask):
log.warning("TextEquiv without index in %s.", segment_id)
LOG.warning("TextEquiv without index in %s.", segment_id)
index = np.nanargmin(indices)
else:
# try ordering by conf
confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
if np.any(~np.isnan(confidences)):
log.info(
LOG.info(
"No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
segment_id,
)
index = np.nanargmax(confidences)
else:
# fallback to first entry in case of neither index or conf present
log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
index = 0
return textequivs[index]

@ -125,7 +125,7 @@ def page_text(tree, *, textequiv_level="region"):
def plain_extract(filename):
with open(filename, "r") as f:
with open(filename, "r", encoding="utf8") as f:
return ExtractedText(
None,
[

@ -4,6 +4,7 @@ from collections import namedtuple
import pytest
from lxml import etree as ET
from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters
from .. import seq_align, ExtractedText
@ -117,6 +118,7 @@ def test_align():
)
def test_textequiv(attributes, expected_index, expected_log, caplog):
"""Test that extracting text from a PAGE TextEquiv is working without index attr."""
getLogger("processor.OcrdDinglehopperEvaluate")
caplog.set_level(logging.INFO)
xml = '<?xml version="1.0"?>'
ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
@ -134,6 +136,7 @@ def test_textequiv(attributes, expected_index, expected_log, caplog):
result = ExtractedText.from_text_segment(
root, {"page": ns}, textequiv_level="line"
).text
if expected_index is None:
assert not result
else:

Loading…
Cancel
Save