From 84d34f5b2671938cf2d17db998497bd21bebc9fc Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Tue, 24 Nov 2020 17:10:18 +0100 Subject: [PATCH] Fix annoying logging exceptions and encoding errors. --- qurator/dinglehopper/extracted_text.py | 11 ++++++----- qurator/dinglehopper/ocr_files.py | 2 +- qurator/dinglehopper/tests/extracted_text_test.py | 3 +++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index 9703b6b..c779836 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -1,4 +1,5 @@ import enum +import logging import re import unicodedata from contextlib import suppress @@ -8,7 +9,8 @@ from typing import Optional import attr import numpy as np from lxml import etree as ET -from ocrd_utils import getLogger + +LOG = logging.getLogger("processor.OcrdDinglehopperEvaluate") class Normalization(enum.Enum): @@ -239,7 +241,6 @@ def get_textequiv_unicode(text_segment, nsmap) -> str: def get_first_textequiv(textequivs, segment_id): """Get the first TextEquiv based on index or conf order if index is not present.""" - log = getLogger("processor.OcrdDinglehopperEvaluate") if len(textequivs) == 1: return textequivs[0] @@ -248,20 +249,20 @@ def get_first_textequiv(textequivs, segment_id): nan_mask = np.isnan(indices) if np.any(~nan_mask): if np.any(nan_mask): - log.warning("TextEquiv without index in %s.", segment_id) + LOG.warning("TextEquiv without index in %s.", segment_id) index = np.nanargmin(indices) else: # try ordering by conf confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float) if np.any(~np.isnan(confidences)): - log.info( + LOG.info( "No index attributes, use 'conf' attribute to sort TextEquiv in %s.", segment_id, ) index = np.nanargmax(confidences) else: # fallback to first entry in case of neither index or conf present - log.warning("No index attributes, use first TextEquiv in %s.", segment_id) + LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id) index = 0 return textequivs[index] diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 57ebd3f..6f2dd40 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -125,7 +125,7 @@ def page_text(tree, *, textequiv_level="region"): def plain_extract(filename): - with open(filename, "r") as f: + with open(filename, "r", encoding="utf8") as f: return ExtractedText( None, [ diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 8a81587..c39b3a3 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -4,6 +4,7 @@ from collections import namedtuple import pytest from lxml import etree as ET +from ocrd_utils import getLogger from uniseg.graphemecluster import grapheme_clusters from .. import seq_align, ExtractedText @@ -117,6 +118,7 @@ def test_align(): ) def test_textequiv(attributes, expected_index, expected_log, caplog): """Test that extracting text from a PAGE TextEquiv is working without index attr.""" + getLogger("processor.OcrdDinglehopperEvaluate") caplog.set_level(logging.INFO) xml = '' ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" @@ -134,6 +136,7 @@ def test_textequiv(attributes, expected_index, expected_log, caplog): result = ExtractedText.from_text_segment( root, {"page": ns}, textequiv_level="line" ).text + if expected_index is None: assert not result else: