mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-07 19:05:13 +02:00
Fix annoying logging exceptions and encoding errors.
This commit is contained in:
parent
0dd5fc0ee5
commit
84d34f5b26
3 changed files with 10 additions and 6 deletions
|
@ -1,4 +1,5 @@
|
||||||
import enum
|
import enum
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
|
@ -8,7 +9,8 @@ from typing import Optional
|
||||||
import attr
|
import attr
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from ocrd_utils import getLogger
|
|
||||||
|
LOG = logging.getLogger("processor.OcrdDinglehopperEvaluate")
|
||||||
|
|
||||||
|
|
||||||
class Normalization(enum.Enum):
|
class Normalization(enum.Enum):
|
||||||
|
@ -239,7 +241,6 @@ def get_textequiv_unicode(text_segment, nsmap) -> str:
|
||||||
|
|
||||||
def get_first_textequiv(textequivs, segment_id):
|
def get_first_textequiv(textequivs, segment_id):
|
||||||
"""Get the first TextEquiv based on index or conf order if index is not present."""
|
"""Get the first TextEquiv based on index or conf order if index is not present."""
|
||||||
log = getLogger("processor.OcrdDinglehopperEvaluate")
|
|
||||||
if len(textequivs) == 1:
|
if len(textequivs) == 1:
|
||||||
return textequivs[0]
|
return textequivs[0]
|
||||||
|
|
||||||
|
@ -248,20 +249,20 @@ def get_first_textequiv(textequivs, segment_id):
|
||||||
nan_mask = np.isnan(indices)
|
nan_mask = np.isnan(indices)
|
||||||
if np.any(~nan_mask):
|
if np.any(~nan_mask):
|
||||||
if np.any(nan_mask):
|
if np.any(nan_mask):
|
||||||
log.warning("TextEquiv without index in %s.", segment_id)
|
LOG.warning("TextEquiv without index in %s.", segment_id)
|
||||||
index = np.nanargmin(indices)
|
index = np.nanargmin(indices)
|
||||||
else:
|
else:
|
||||||
# try ordering by conf
|
# try ordering by conf
|
||||||
confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
|
confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
|
||||||
if np.any(~np.isnan(confidences)):
|
if np.any(~np.isnan(confidences)):
|
||||||
log.info(
|
LOG.info(
|
||||||
"No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
|
"No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
|
||||||
segment_id,
|
segment_id,
|
||||||
)
|
)
|
||||||
index = np.nanargmax(confidences)
|
index = np.nanargmax(confidences)
|
||||||
else:
|
else:
|
||||||
# fallback to first entry in case of neither index or conf present
|
# fallback to first entry in case of neither index or conf present
|
||||||
log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
|
LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
|
||||||
index = 0
|
index = 0
|
||||||
return textequivs[index]
|
return textequivs[index]
|
||||||
|
|
||||||
|
|
|
@ -125,7 +125,7 @@ def page_text(tree, *, textequiv_level="region"):
|
||||||
|
|
||||||
|
|
||||||
def plain_extract(filename):
|
def plain_extract(filename):
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r", encoding="utf8") as f:
|
||||||
return ExtractedText(
|
return ExtractedText(
|
||||||
None,
|
None,
|
||||||
[
|
[
|
||||||
|
|
|
@ -4,6 +4,7 @@ from collections import namedtuple
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
|
from ocrd_utils import getLogger
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
from .. import seq_align, ExtractedText
|
from .. import seq_align, ExtractedText
|
||||||
|
@ -117,6 +118,7 @@ def test_align():
|
||||||
)
|
)
|
||||||
def test_textequiv(attributes, expected_index, expected_log, caplog):
|
def test_textequiv(attributes, expected_index, expected_log, caplog):
|
||||||
"""Test that extracting text from a PAGE TextEquiv is working without index attr."""
|
"""Test that extracting text from a PAGE TextEquiv is working without index attr."""
|
||||||
|
getLogger("processor.OcrdDinglehopperEvaluate")
|
||||||
caplog.set_level(logging.INFO)
|
caplog.set_level(logging.INFO)
|
||||||
xml = '<?xml version="1.0"?>'
|
xml = '<?xml version="1.0"?>'
|
||||||
ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
|
ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
|
||||||
|
@ -134,6 +136,7 @@ def test_textequiv(attributes, expected_index, expected_log, caplog):
|
||||||
result = ExtractedText.from_text_segment(
|
result = ExtractedText.from_text_segment(
|
||||||
root, {"page": ns}, textequiv_level="line"
|
root, {"page": ns}, textequiv_level="line"
|
||||||
).text
|
).text
|
||||||
|
|
||||||
if expected_index is None:
|
if expected_index is None:
|
||||||
assert not result
|
assert not result
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue