From 84d34f5b2671938cf2d17db998497bd21bebc9fc Mon Sep 17 00:00:00 2001
From: Benjamin Rosemann <benjamin.rosemann@la-bw.de>
Date: Tue, 24 Nov 2020 17:10:18 +0100
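Subject: [PATCH] Fix logging exceptions and encoding errors

Use a module-level standard-library logger in extracted_text.py instead
of calling ocrd_utils.getLogger inside get_first_textequiv, and open
plain text files explicitly as UTF-8 in plain_extract. The textequiv
test now calls ocrd_utils.getLogger itself before setting the caplog
level.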

---
 qurator/dinglehopper/extracted_text.py            | 11 ++++++-----
 qurator/dinglehopper/ocr_files.py                 |  2 +-
 qurator/dinglehopper/tests/extracted_text_test.py |  3 +++
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index 9703b6b..c779836 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -1,4 +1,5 @@
 import enum
+import logging
 import re
 import unicodedata
 from contextlib import suppress
@@ -8,7 +9,8 @@ from typing import Optional
 import attr
 import numpy as np
 from lxml import etree as ET
-from ocrd_utils import getLogger
+
+LOG = logging.getLogger("processor.OcrdDinglehopperEvaluate")
 
 
 class Normalization(enum.Enum):
@@ -239,7 +241,6 @@ def get_textequiv_unicode(text_segment, nsmap) -> str:
 
 def get_first_textequiv(textequivs, segment_id):
     """Get the first TextEquiv based on index or conf order if index is not present."""
-    log = getLogger("processor.OcrdDinglehopperEvaluate")
     if len(textequivs) == 1:
         return textequivs[0]
 
@@ -248,20 +249,20 @@ def get_first_textequiv(textequivs, segment_id):
     nan_mask = np.isnan(indices)
     if np.any(~nan_mask):
         if np.any(nan_mask):
-            log.warning("TextEquiv without index in %s.", segment_id)
+            LOG.warning("TextEquiv without index in %s.", segment_id)
         index = np.nanargmin(indices)
     else:
         # try ordering by conf
         confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
         if np.any(~np.isnan(confidences)):
-            log.info(
+            LOG.info(
                 "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
                 segment_id,
             )
             index = np.nanargmax(confidences)
         else:
             # fallback to first entry in case of neither index or conf present
-            log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
+            LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
             index = 0
     return textequivs[index]
 
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 57ebd3f..6f2dd40 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -125,7 +125,7 @@ def page_text(tree, *, textequiv_level="region"):
 
 
 def plain_extract(filename):
-    with open(filename, "r") as f:
+    with open(filename, "r", encoding="utf8") as f:
         return ExtractedText(
             None,
             [
diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py
index 8a81587..c39b3a3 100644
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@@ -4,6 +4,7 @@ from collections import namedtuple
 
 import pytest
 from lxml import etree as ET
+from ocrd_utils import getLogger
 from uniseg.graphemecluster import grapheme_clusters
 
 from .. import seq_align, ExtractedText
@@ -117,6 +118,7 @@ def test_align():
 )
 def test_textequiv(attributes, expected_index, expected_log, caplog):
     """Test that extracting text from a PAGE TextEquiv is working without index attr."""
+    getLogger("processor.OcrdDinglehopperEvaluate")
     caplog.set_level(logging.INFO)
     xml = '<?xml version="1.0"?>'
     ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
@@ -134,6 +136,7 @@ def test_textequiv(attributes, expected_index, expected_log, caplog):
     result = ExtractedText.from_text_segment(
         root, {"page": ns}, textequiv_level="line"
     ).text
+
     if expected_index is None:
         assert not result
     else: