From 714b5691952f79180f52e41715470d998ce1b6ef Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Fri, 11 Jun 2021 16:09:19 +0200 Subject: [PATCH] Fixed some flake8 and mypy issues. --- qurator/dinglehopper/__init__.py | 3 --- qurator/dinglehopper/align.py | 2 +- qurator/dinglehopper/cli.py | 8 ++---- qurator/dinglehopper/cli_extract.py | 3 --- qurator/dinglehopper/edit_distance.py | 27 ++++++++++--------- qurator/dinglehopper/metrics/__init__.py | 17 +++++++++--- .../metrics/bag_of_chars_accuracy.py | 9 +++---- .../metrics/bag_of_words_accuracy.py | 9 +++---- .../metrics/character_accuracy.py | 4 +-- qurator/dinglehopper/metrics/utils.py | 13 ++++----- qurator/dinglehopper/normalize.py | 19 ++++--------- qurator/dinglehopper/ocr_files.py | 21 ++++++++------- qurator/dinglehopper/ocrd_cli.py | 1 + .../dinglehopper/tests/extracted_text_test.py | 3 ++- .../test_integ_character_accuracy_ocr.py | 2 +- .../tests/metrics/test_integ_word_accuracy.py | 2 +- qurator/dinglehopper/tests/test_align.py | 3 ++- .../dinglehopper/tests/test_edit_distance.py | 4 +-- qurator/dinglehopper/tests/test_editops.py | 2 +- .../dinglehopper/tests/test_integ_align.py | 5 ++-- .../tests/test_integ_edit_distance_ocr.py | 3 ++- .../tests/test_integ_table_extraction.py | 2 +- qurator/dinglehopper/tests/test_ocr_files.py | 7 +++-- 23 files changed, 81 insertions(+), 88 deletions(-) diff --git a/qurator/dinglehopper/__init__.py b/qurator/dinglehopper/__init__.py index f883dfc..e69de29 100644 --- a/qurator/dinglehopper/__init__.py +++ b/qurator/dinglehopper/__init__.py @@ -1,3 +0,0 @@ -from .ocr_files import * -from .extracted_text import * -from .align import * diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py index cc7230b..cc35c63 100644 --- a/qurator/dinglehopper/align.py +++ b/qurator/dinglehopper/align.py @@ -1,4 +1,4 @@ -from .edit_distance import * +from .edit_distance import seq_editops from .normalize import chars_normalized diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 8452fa8..dbf9b28 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -36,13 +36,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_) if css_classes: - return '{html_t}'.format( - css_classes=css_classes, - html_t=html_t, - html_custom_attrs=html_custom_attrs, - ) + return f"{html_t}" else: - return "{html_t}".format(html_t=html_t) + return f"{html_t}" if isinstance(gt_in, ExtractedText): if not isinstance(ocr_in, ExtractedText): diff --git a/qurator/dinglehopper/cli_extract.py b/qurator/dinglehopper/cli_extract.py index 0d4f713..9c51d34 100644 --- a/qurator/dinglehopper/cli_extract.py +++ b/qurator/dinglehopper/cli_extract.py @@ -1,9 +1,6 @@ -import os - import click from ocrd_utils import initLogging -from .extracted_text import ExtractedText from .ocr_files import extract diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 798419c..91966a3 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -12,15 +12,16 @@ from .normalize import chars_normalized def levenshtein_matrix(seq1: Sequence, seq2: Sequence): """Compute the matrix commonly computed to produce the Levenshtein distance. - This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired - edit distance. + This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom + right contains the desired edit distance. - This algorithm is implemented here because we need an implementation that can work with sequences other than - strings, e.g. lists of grapheme clusters or lists of word strings. + This algorithm is implemented here because we need an implementation that can work + with sequences other than strings, e.g. lists of grapheme clusters or lists of word + strings. """ - # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input - # sequences to tuples to make them hashable. + # Internally, we use a cached version. As the cache only works on hashable + # parameters, we convert the input sequences to tuples to make them hashable. return _levenshtein_matrix(tuple(seq1), tuple(seq2)) @@ -28,7 +29,8 @@ def levenshtein_matrix(seq1: Sequence, seq2: Sequence): def _levenshtein_matrix(seq1: Tuple, seq2: Tuple): """Compute the matrix commonly computed to produce the Levenshtein distance. - This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead. + This is a LRU cached function not meant to be used directly. + Use levenshtein_matrix() instead. """ m = len(seq1) n = len(seq2) @@ -36,7 +38,7 @@ def _levenshtein_matrix(seq1: Tuple, seq2: Tuple): def from_to(start, stop): return range(start, stop + 1, 1) - D = np.zeros((m + 1, n + 1), np.int) + D = np.zeros((m + 1, n + 1), int) D[0, 0] = 0 for i in from_to(1, m): D[i, 0] = i @@ -75,8 +77,9 @@ def levenshtein_matrix_cache_clear(): def distance(s1: str, s2: str): """Compute the Levenshtein edit distance between two Unicode strings - Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme - clusters. This should be the correct way to compare two Unicode strings. + Note that this is different from levenshtein() as this function knows about Unicode + normalization and grapheme clusters. This should be the correct way to compare two + Unicode strings. """ seq1 = chars_normalized(s1) seq2 = chars_normalized(s2) @@ -87,8 +90,8 @@ def seq_editops(seq1, seq2): """ Return sequence of edit operations transforming one sequence to another. - This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary - sequences. + This aims to return the same/similar results as python-Levenshtein's editops(), + just generalized to arbitrary sequences. """ seq1 = list(seq1) seq2 = list(seq2) diff --git a/qurator/dinglehopper/metrics/__init__.py b/qurator/dinglehopper/metrics/__init__.py index 07bb077..347d042 100644 --- a/qurator/dinglehopper/metrics/__init__.py +++ b/qurator/dinglehopper/metrics/__init__.py @@ -1,5 +1,14 @@ -from .bag_of_chars_accuracy import * -from .bag_of_words_accuracy import * -from .character_accuracy import * +from .bag_of_chars_accuracy import bag_of_chars_accuracy +from .bag_of_words_accuracy import bag_of_words_accuracy +from .character_accuracy import character_accuracy from .utils import MetricResult, Weights -from .word_accuracy import * +from .word_accuracy import word_accuracy + +__all__ = [ + "bag_of_chars_accuracy", + "bag_of_words_accuracy", + "character_accuracy", + "word_accuracy", + "MetricResult", + "Weights", +] diff --git a/qurator/dinglehopper/metrics/bag_of_chars_accuracy.py b/qurator/dinglehopper/metrics/bag_of_chars_accuracy.py index 79ed34e..a6c8813 100644 --- a/qurator/dinglehopper/metrics/bag_of_chars_accuracy.py +++ b/qurator/dinglehopper/metrics/bag_of_chars_accuracy.py @@ -9,9 +9,8 @@ from .utils import bag_accuracy, MetricResult, Weights def bag_of_chars_accuracy( reference: str, compared: str, weights: Weights = Weights(1, 0, 1) ) -> MetricResult: - reference_chars = Counter(grapheme_clusters(normalize("NFC", reference))) - compared_chars = Counter(grapheme_clusters(normalize("NFC", compared))) - result = bag_accuracy(reference_chars, compared_chars, weights) - return MetricResult( - **{**result._asdict(), "metric": bag_of_chars_accuracy.__name__} + reference_chars: Counter = Counter(grapheme_clusters(normalize("NFC", reference))) + compared_chars: Counter = Counter(grapheme_clusters(normalize("NFC", compared))) + return bag_accuracy( + reference_chars, compared_chars, weights, bag_of_chars_accuracy.__name__ ) diff --git a/qurator/dinglehopper/metrics/bag_of_words_accuracy.py b/qurator/dinglehopper/metrics/bag_of_words_accuracy.py index f2e0a88..2426496 100644 --- a/qurator/dinglehopper/metrics/bag_of_words_accuracy.py +++ b/qurator/dinglehopper/metrics/bag_of_words_accuracy.py @@ -7,9 +7,8 @@ from ..normalize import words_normalized def bag_of_words_accuracy( reference: str, compared: str, weights: Weights = Weights(1, 0, 1) ) -> MetricResult: - reference_words = Counter(words_normalized(reference)) - compared_words = Counter(words_normalized(compared)) - result = bag_accuracy(reference_words, compared_words, weights) - return MetricResult( - **{**result._asdict(), "metric": bag_of_words_accuracy.__name__} + reference_words: Counter = Counter(words_normalized(reference)) + compared_words: Counter = Counter(words_normalized(compared)) + return bag_accuracy( + reference_words, compared_words, weights, bag_of_words_accuracy.__name__ ) diff --git a/qurator/dinglehopper/metrics/character_accuracy.py b/qurator/dinglehopper/metrics/character_accuracy.py index 1e89a76..abe4e4f 100644 --- a/qurator/dinglehopper/metrics/character_accuracy.py +++ b/qurator/dinglehopper/metrics/character_accuracy.py @@ -1,7 +1,5 @@ -from __future__ import division - from .utils import MetricResult, Weights -from .. import distance +from ..edit_distance import distance from ..normalize import chars_normalized diff --git a/qurator/dinglehopper/metrics/utils.py b/qurator/dinglehopper/metrics/utils.py index 179f462..33469be 100644 --- a/qurator/dinglehopper/metrics/utils.py +++ b/qurator/dinglehopper/metrics/utils.py @@ -37,10 +37,7 @@ class MetricResult(NamedTuple): We deviate from the builtin _asdict() function by including our properties. """ return { - **{ - key: value - for key, value in self._asdict().items() - }, + **{key: value for key, value in self._asdict().items()}, "accuracy": self.accuracy, "error_rate": self.error_rate, "weights": self.weights._asdict(), @@ -48,7 +45,10 @@ class MetricResult(NamedTuple): def bag_accuracy( - reference: Counter, compared: Counter, weights: Weights + reference: Counter, + compared: Counter, + weights: Weights, + metric: str = "bag_accuracy", ) -> MetricResult: """Calculates the the weighted errors for two bags (Counter). @@ -61,6 +61,7 @@ def bag_accuracy( :param reference: Bag used as reference (ground truth). :param compared: Bag used to compare (ocr). :param weights: Weights/costs for editing operations. + :param metric: Name of the (original) metric. :return: NamedTuple representing the results of this metric. """ n_ref = sum(reference.values()) @@ -77,7 +78,7 @@ def bag_accuracy( + weights.replacements * replacements ) return MetricResult( - metric=bag_accuracy.__name__, + metric=metric, weights=weights, weighted_errors=weighted_errors, reference_elements=n_ref, diff --git a/qurator/dinglehopper/normalize.py b/qurator/dinglehopper/normalize.py index 4ae6617..14eefbc 100644 --- a/qurator/dinglehopper/normalize.py +++ b/qurator/dinglehopper/normalize.py @@ -1,26 +1,19 @@ import unicodedata -from typing import Union import uniseg.wordbreak from uniseg.graphemecluster import grapheme_clusters -from .extracted_text import ExtractedText - -def chars_normalized(s: Union[str, ExtractedText]): +def chars_normalized(s: str): """Normalize characters in string.""" - if isinstance(s, ExtractedText): - s = s.text return list(grapheme_clusters(unicodedata.normalize("NFC", s))) -def words(s: Union[str, ExtractedText]): +def words(s: str): """Extract words from a string""" - if isinstance(s, ExtractedText): - s = s.text - - # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also + # Patch uniseg.wordbreak.word_break to deal with our private use characters. + # See also # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt old_word_break = uniseg.wordbreak.word_break @@ -54,8 +47,6 @@ def words(s: Union[str, ExtractedText]): yield word -def words_normalized(s: Union[str, ExtractedText]): +def words_normalized(s: str): """Extract words from string and normalize them.""" - if isinstance(s, ExtractedText): - s = s.text return words(unicodedata.normalize("NFC", s)) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 1f319f5..0eab0c5 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,7 +1,4 @@ -from __future__ import division, print_function - from typing import Iterator -from warnings import warn import sys from lxml import etree as ET @@ -13,7 +10,8 @@ from .extracted_text import ExtractedText, normalize_sbb def alto_namespace(tree: ET.ElementTree) -> str: """Return the ALTO namespace used in the given ElementTree. - This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not + This relies on the assumption that, in any given ALTO file, + the root element has the local name "alto". We do not check if the files uses any valid ALTO namespace. """ root_name = ET.QName(tree.getroot().tag) @@ -47,8 +45,9 @@ def alto_text(tree): def page_namespace(tree): """Return the PAGE content namespace used in the given ElementTree. - This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We - do not check if the files uses any valid PAGE namespace. + This relies on the assumption that, in any given PAGE content file, + the root element has the local name "PcGts". + We do not check if the files uses any valid PAGE namespace. """ root_name = ET.QName(tree.getroot().tag) if root_name.localname == "PcGts": @@ -97,14 +96,18 @@ def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level): ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children) ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"])) - elif ET.QName(group.tag).localname in ["UnorderedGroup","UnorderedGroupIndexed"]: + elif ET.QName(group.tag).localname in ["UnorderedGroup", "UnorderedGroupIndexed"]: ro_children = list(group) else: raise NotImplementedError - for ro_child in ro_children: - if ET.QName(ro_child.tag).localname in ["OrderedGroup", "OrderedGroupIndexed", "UnorderedGroup", "UnorderedGroupIndexed"]: + if ET.QName(ro_child.tag).localname in [ + "OrderedGroup", + "OrderedGroupIndexed", + "UnorderedGroup", + "UnorderedGroupIndexed", + ]: regions.extend( extract_texts_from_reading_order_group( ro_child, tree, nsmap, textequiv_level diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py index adfbbab..c7aea19 100644 --- a/qurator/dinglehopper/ocrd_cli.py +++ b/qurator/dinglehopper/ocrd_cli.py @@ -77,5 +77,6 @@ class OcrdDinglehopperEvaluate(Processor): # Clear cache between files levenshtein_matrix_cache_clear() + if __name__ == "__main__": ocrd_dinglehopper() diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 8a81587..c5301ec 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -6,7 +6,8 @@ import pytest from lxml import etree as ET from uniseg.graphemecluster import grapheme_clusters -from .. import seq_align, ExtractedText +from ..align import seq_align +from ..extracted_text import ExtractedText def test_text(): diff --git a/qurator/dinglehopper/tests/metrics/test_integ_character_accuracy_ocr.py b/qurator/dinglehopper/tests/metrics/test_integ_character_accuracy_ocr.py index b12c08c..9dcd0db 100644 --- a/qurator/dinglehopper/tests/metrics/test_integ_character_accuracy_ocr.py +++ b/qurator/dinglehopper/tests/metrics/test_integ_character_accuracy_ocr.py @@ -6,8 +6,8 @@ import pytest from lxml import etree as ET from uniseg.graphemecluster import grapheme_clusters -from ... import page_text, alto_text from ...metrics import character_accuracy +from ...ocr_files import alto_text, page_text data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../", "data") diff --git a/qurator/dinglehopper/tests/metrics/test_integ_word_accuracy.py b/qurator/dinglehopper/tests/metrics/test_integ_word_accuracy.py index 849dc99..99169de 100644 --- a/qurator/dinglehopper/tests/metrics/test_integ_word_accuracy.py +++ b/qurator/dinglehopper/tests/metrics/test_integ_word_accuracy.py @@ -5,9 +5,9 @@ import os import pytest from lxml import etree as ET -from ... import alto_text, page_text from ...metrics import word_accuracy from ...normalize import words +from ...ocr_files import alto_text, page_text data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../", "data") diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py index 9f9d926..4ecbda2 100644 --- a/qurator/dinglehopper/tests/test_align.py +++ b/qurator/dinglehopper/tests/test_align.py @@ -1,5 +1,6 @@ from .util import unzip -from .. import align, seq_align, distance +from ..align import align, seq_align +from ..edit_distance import distance def test_left_empty(): diff --git a/qurator/dinglehopper/tests/test_edit_distance.py b/qurator/dinglehopper/tests/test_edit_distance.py index dc1f202..b1ad47f 100644 --- a/qurator/dinglehopper/tests/test_edit_distance.py +++ b/qurator/dinglehopper/tests/test_edit_distance.py @@ -1,8 +1,6 @@ -from __future__ import division, print_function - import unicodedata -from .. import levenshtein, distance +from ..edit_distance import levenshtein, distance def test_levenshtein(): diff --git a/qurator/dinglehopper/tests/test_editops.py b/qurator/dinglehopper/tests/test_editops.py index 06afbfc..b27fcd8 100644 --- a/qurator/dinglehopper/tests/test_editops.py +++ b/qurator/dinglehopper/tests/test_editops.py @@ -1,6 +1,6 @@ import unicodedata -from .. import seq_editops, editops +from ..edit_distance import seq_editops, editops def test_trivial(): diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py index 74b8c7e..4ab2eac 100644 --- a/qurator/dinglehopper/tests/test_integ_align.py +++ b/qurator/dinglehopper/tests/test_integ_align.py @@ -1,11 +1,10 @@ -from __future__ import division, print_function - import os import pytest from lxml import etree as ET -from .. import align, page_text +from ..align import align +from ..ocr_files import page_text data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py index 0e1e7da..8bfbad8 100644 --- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -5,7 +5,8 @@ import os import pytest from lxml import etree as ET -from .. import distance, page_text, alto_text +from ..edit_distance import distance +from ..ocr_files import alto_text, page_text data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") diff --git a/qurator/dinglehopper/tests/test_integ_table_extraction.py b/qurator/dinglehopper/tests/test_integ_table_extraction.py index 868308c..b7703bb 100644 --- a/qurator/dinglehopper/tests/test_integ_table_extraction.py +++ b/qurator/dinglehopper/tests/test_integ_table_extraction.py @@ -3,7 +3,7 @@ import os import pytest from lxml import etree as ET -from .. import page_text +from ..ocr_files import page_text @pytest.mark.parametrize( diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py index fb38c4a..f48c59c 100644 --- a/qurator/dinglehopper/tests/test_ocr_files.py +++ b/qurator/dinglehopper/tests/test_ocr_files.py @@ -1,13 +1,12 @@ import os import re - -import lxml.etree as ET import textwrap -import pytest +import lxml.etree as ET from .util import working_directory -from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text +from ..ocr_files import alto_namespace, alto_text, page_namespace, page_text, \ + plain_text, text data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")