Fix annoying logging exceptions and encoding errors.

2026-07-29 15:02:33 +02:00 · 2020-11-24 17:10:18 +01:00 · 2020-11-24 17:10:18 +01:00 · 84d34f5b26
commit 84d34f5b26
parent 0dd5fc0ee5
3 changed files with 10 additions and 6 deletions
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -1,4 +1,5 @@
 import enum
+import logging
 import re
 import unicodedata
 from contextlib import suppress
@@ -8,7 +9,8 @@ from typing import Optional
 import attr
 import numpy as np
 from lxml import etree as ET
-from ocrd_utils import getLogger
+
+LOG = logging.getLogger("processor.OcrdDinglehopperEvaluate")


 class Normalization(enum.Enum):
@@ -239,7 +241,6 @@ def get_textequiv_unicode(text_segment, nsmap) -> str:

 def get_first_textequiv(textequivs, segment_id):
    """Get the first TextEquiv based on index or conf order if index is not present."""
-    log = getLogger("processor.OcrdDinglehopperEvaluate")
    if len(textequivs) == 1:
        return textequivs[0]

@@ -248,20 +249,20 @@ def get_first_textequiv(textequivs, segment_id):
    nan_mask = np.isnan(indices)
    if np.any(~nan_mask):
        if np.any(nan_mask):
-            log.warning("TextEquiv without index in %s.", segment_id)
+            LOG.warning("TextEquiv without index in %s.", segment_id)
        index = np.nanargmin(indices)
    else:
        # try ordering by conf
        confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
        if np.any(~np.isnan(confidences)):
-            log.info(
+            LOG.info(
                "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
                segment_id,
            )
            index = np.nanargmax(confidences)
        else:
            # fallback to first entry in case of neither index or conf present
-            log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
+            LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
            index = 0
    return textequivs[index]

--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -125,7 +125,7 @@ def page_text(tree, *, textequiv_level="region"):


 def plain_extract(filename):
-    with open(filename, "r") as f:
+    with open(filename, "r", encoding="utf8") as f:
        return ExtractedText(
            None,
            [
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@@ -4,6 +4,7 @@ from collections import namedtuple

 import pytest
 from lxml import etree as ET
+from ocrd_utils import getLogger
 from uniseg.graphemecluster import grapheme_clusters

 from .. import seq_align, ExtractedText
@@ -117,6 +118,7 @@ def test_align():
 )
 def test_textequiv(attributes, expected_index, expected_log, caplog):
    """Test that extracting text from a PAGE TextEquiv is working without index attr."""
+    getLogger("processor.OcrdDinglehopperEvaluate")
    caplog.set_level(logging.INFO)
    xml = '<?xml version="1.0"?>'
    ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
@@ -134,6 +136,7 @@ def test_textequiv(attributes, expected_index, expected_log, caplog):
    result = ExtractedText.from_text_segment(
        root, {"page": ns}, textequiv_level="line"
    ).text
+
    if expected_index is None:
        assert not result
    else: