mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-01 06:29:59 +02:00
Fix annoying logging exceptions and encoding errors.
This commit is contained in:
parent
0dd5fc0ee5
commit
84d34f5b26
3 changed files with 10 additions and 6 deletions
|
@@ -1,4 +1,5 @@
|
|||
import enum
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
from contextlib import suppress
|
||||
|
@@ -8,7 +9,8 @@ from typing import Optional
|
|||
import attr
|
||||
import numpy as np
|
||||
from lxml import etree as ET
|
||||
from ocrd_utils import getLogger
|
||||
|
||||
LOG = logging.getLogger("processor.OcrdDinglehopperEvaluate")
|
||||
|
||||
|
||||
class Normalization(enum.Enum):
|
||||
|
@@ -239,7 +241,6 @@ def get_textequiv_unicode(text_segment, nsmap) -> str:
|
|||
|
||||
def get_first_textequiv(textequivs, segment_id):
|
||||
"""Get the first TextEquiv based on index or conf order if index is not present."""
|
||||
log = getLogger("processor.OcrdDinglehopperEvaluate")
|
||||
if len(textequivs) == 1:
|
||||
return textequivs[0]
|
||||
|
||||
|
@@ -248,20 +249,20 @@ def get_first_textequiv(textequivs, segment_id):
|
|||
nan_mask = np.isnan(indices)
|
||||
if np.any(~nan_mask):
|
||||
if np.any(nan_mask):
|
||||
log.warning("TextEquiv without index in %s.", segment_id)
|
||||
LOG.warning("TextEquiv without index in %s.", segment_id)
|
||||
index = np.nanargmin(indices)
|
||||
else:
|
||||
# try ordering by conf
|
||||
confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
|
||||
if np.any(~np.isnan(confidences)):
|
||||
log.info(
|
||||
LOG.info(
|
||||
"No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
|
||||
segment_id,
|
||||
)
|
||||
index = np.nanargmax(confidences)
|
||||
else:
|
||||
# fallback to first entry in case of neither index or conf present
|
||||
log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
|
||||
LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
|
||||
index = 0
|
||||
return textequivs[index]
|
||||
|
||||
|
|
|
@@ -125,7 +125,7 @@ def page_text(tree, *, textequiv_level="region"):
|
|||
|
||||
|
||||
def plain_extract(filename):
|
||||
with open(filename, "r") as f:
|
||||
with open(filename, "r", encoding="utf8") as f:
|
||||
return ExtractedText(
|
||||
None,
|
||||
[
|
||||
|
|
|
@@ -4,6 +4,7 @@ from collections import namedtuple
|
|||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
from ocrd_utils import getLogger
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .. import seq_align, ExtractedText
|
||||
|
@@ -117,6 +118,7 @@ def test_align():
|
|||
)
|
||||
def test_textequiv(attributes, expected_index, expected_log, caplog):
|
||||
"""Test that extracting text from a PAGE TextEquiv is working without index attr."""
|
||||
getLogger("processor.OcrdDinglehopperEvaluate")
|
||||
caplog.set_level(logging.INFO)
|
||||
xml = '<?xml version="1.0"?>'
|
||||
ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
|
||||
|
@@ -134,6 +136,7 @@ def test_textequiv(attributes, expected_index, expected_log, caplog):
|
|||
result = ExtractedText.from_text_segment(
|
||||
root, {"page": ns}, textequiv_level="line"
|
||||
).text
|
||||
|
||||
if expected_index is None:
|
||||
assert not result
|
||||
else:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue