From 14421c8e53291d788514796158b9e83e263fc42d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 10 Nov 2020 12:29:55 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=A8=20dinglehopper:=20Reformat=20using?= =?UTF-8?q?=20black?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/__init__.py | 3 +- qurator/dinglehopper/align.py | 10 +- qurator/dinglehopper/character_error_rate.py | 10 +- qurator/dinglehopper/cli.py | 91 ++++++----- qurator/dinglehopper/cli_extract.py | 11 +- qurator/dinglehopper/edit_distance.py | 25 +-- qurator/dinglehopper/extracted_text.py | 130 ++++++++-------- qurator/dinglehopper/ocr_files.py | 82 ++++++---- qurator/dinglehopper/ocrd_cli.py | 66 ++++---- .../dinglehopper/tests/extracted_text_test.py | 145 +++++++++++------- qurator/dinglehopper/tests/test_align.py | 135 +++++++++++----- .../tests/test_character_error_rate.py | 44 +++--- .../dinglehopper/tests/test_edit_distance.py | 44 +++--- qurator/dinglehopper/tests/test_editops.py | 53 ++++--- .../dinglehopper/tests/test_integ_align.py | 6 +- .../test_integ_character_error_rate_ocr.py | 38 +++-- .../tests/test_integ_cli_valid_json.py | 32 ++-- .../tests/test_integ_edit_distance_ocr.py | 32 +++- .../dinglehopper/tests/test_integ_ocrd_cli.py | 23 +-- .../tests/test_integ_word_error_rate_ocr.py | 50 ++++-- qurator/dinglehopper/tests/test_ocr_files.py | 93 ++++++----- .../tests/test_word_error_rate.py | 83 ++++++++-- qurator/dinglehopper/tests/util.py | 1 + qurator/dinglehopper/word_error_rate.py | 15 +- setup.py | 36 ++--- 25 files changed, 783 insertions(+), 475 deletions(-) diff --git a/qurator/__init__.py b/qurator/__init__.py index 8d17c21..5284146 100644 --- a/qurator/__init__.py +++ b/qurator/__init__.py @@ -1,2 +1 @@ -__import__('pkg_resources').declare_namespace(__name__) - +__import__("pkg_resources").declare_namespace(__name__) diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py index 87febb7..c7e7733 100644 --- a/qurator/dinglehopper/align.py +++ b/qurator/dinglehopper/align.py @@ -3,8 +3,8 @@ from .edit_distance import * def align(t1, t2): """Align text.""" - s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1))) - s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2))) + s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1))) + s2 = list(grapheme_clusters(unicodedata.normalize("NFC", t2))) return seq_align(s1, s2) @@ -27,13 +27,13 @@ def seq_align(s1, s2): pass if o: - if o[0] == 'insert': + if o[0] == "insert": yield None, s2[j] j += 1 - elif o[0] == 'delete': + elif o[0] == "delete": yield s1[i], None i += 1 - elif o[0] == 'replace': + elif o[0] == "replace": yield s1[i], s2[j] i += 1 j += 1 diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 2b13f55..0c3ef7d 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -19,19 +19,21 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]: """ d = distance(reference, compared) - n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) + n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference)))) if d == 0: return 0, n if n == 0: - return float('inf'), n - return d/n, n + return float("inf"), n + return d / n, n # XXX Should we really count newlines here? 
@multimethod -def character_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]: +def character_error_rate_n( + reference: ExtractedText, compared: ExtractedText +) -> Tuple[float, int]: return character_error_rate_n(reference.text, compared.text) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 2aef644..09c26f0 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -12,16 +12,17 @@ from .extracted_text import ExtractedText from .ocr_files import extract from .config import Config + def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): - gtx = '' - ocrx = '' + gtx = "" + ocrx = "" def format_thing(t, css_classes=None, id_=None): if t is None: html_t = none - css_classes += ' ellipsis' - elif t == '\n': - html_t = '
<br>'
+            css_classes += " ellipsis"
+        elif t == "\n":
+            html_t = "<br>
" else: html_t = escape(t) @@ -32,9 +33,13 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_) if css_classes: - return '{html_t}'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs) + return '{html_t}'.format( + css_classes=css_classes, + html_t=html_t, + html_custom_attrs=html_custom_attrs, + ) else: - return '{html_t}'.format(html_t=html_t) + return "{html_t}".format(html_t=html_t) if isinstance(gt_in, ExtractedText): if not isinstance(ocr_in, ExtractedText): @@ -46,8 +51,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): gt_things = gt_in ocr_things = ocr_in - - g_pos = 0 o_pos = 0 for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)): @@ -55,7 +58,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): gt_id = None ocr_id = None if g != o: - css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k) + css_classes = "{css_prefix}diff{k} diff".format(css_prefix=css_prefix, k=k) if isinstance(gt_in, ExtractedText): gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None @@ -70,17 +73,17 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): if o is not None: o_pos += len(o) - - return \ - ''' + return """
         <div class="row">
            <div class="col-md-6 gt">{}</div>
            <div class="col-md-6 ocr">{}</div>
         </div>
- '''.format(gtx, ocrx) + """.format( + gtx, ocrx + ) -def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'): +def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): """Check OCR result against GT. The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use @@ -93,36 +96,47 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'): cer, n_characters = character_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text) - char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·') + char_diff_report = gen_diff_report( + gt_text, ocr_text, css_prefix="c", joiner="", none="·" + ) gt_words = words_normalized(gt_text) ocr_words = words_normalized(ocr_text) - word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯') + word_diff_report = gen_diff_report( + gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯" + ) def json_float(value): """Convert a float value to an JSON float. This is here so that float('inf') yields "Infinity", not "inf". """ - if value == float('inf'): - return 'Infinity' - elif value == float('-inf'): - return '-Infinity' + if value == float("inf"): + return "Infinity" + elif value == float("-inf"): + return "-Infinity" else: return str(value) - env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates'))) - env.filters['json_float'] = json_float + env = Environment( + loader=FileSystemLoader( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates") + ) + ) + env.filters["json_float"] = json_float - for report_suffix in ('.html', '.json'): - template_fn = 'report' + report_suffix + '.j2' + for report_suffix in (".html", ".json"): + template_fn = "report" + report_suffix + ".j2" out_fn = report_prefix + report_suffix template = env.get_template(template_fn) template.stream( - gt=gt, ocr=ocr, - cer=cer, n_characters=n_characters, - wer=wer, n_words=n_words, + gt=gt, + ocr=ocr, + cer=cer, + n_characters=n_characters, + wer=wer, + n_words=n_words, char_diff_report=char_diff_report, word_diff_report=word_diff_report, metrics=metrics, @@ -130,12 +144,19 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'): @click.command() -@click.argument('gt', type=click.Path(exists=True)) -@click.argument('ocr', type=click.Path(exists=True)) -@click.argument('report_prefix', type=click.Path(), default='report') -@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red') -@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL') -@click.option('--progress', default=False, is_flag=True, help='Show progress bar') +@click.argument("gt", type=click.Path(exists=True)) +@click.argument("ocr", type=click.Path(exists=True)) +@click.argument("report_prefix", type=click.Path(), default="report") +@click.option( + "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" +) +@click.option( + "--textequiv-level", + default="region", + help="PAGE TextEquiv level to extract text from", + metavar="LEVEL", +) +@click.option("--progress", default=False, is_flag=True, help="Show progress bar") def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): """ Compare the PAGE/ALTO/text document GT against the document OCR. 
@@ -159,5 +180,5 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/qurator/dinglehopper/cli_extract.py b/qurator/dinglehopper/cli_extract.py index a5d36d8..ce49db4 100644 --- a/qurator/dinglehopper/cli_extract.py +++ b/qurator/dinglehopper/cli_extract.py @@ -7,8 +7,13 @@ from .ocr_files import extract @click.command() -@click.argument('input_file', type=click.Path(exists=True)) -@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL') +@click.argument("input_file", type=click.Path(exists=True)) +@click.option( + "--textequiv-level", + default="region", + help="PAGE TextEquiv level to extract text from", + metavar="LEVEL", +) def main(input_file, textequiv_level): """ Extract the text of the given INPUT_FILE. @@ -23,5 +28,5 @@ def main(input_file, textequiv_level): print(input_text) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 721296d..0b9c8f4 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -48,9 +48,10 @@ def _levenshtein_matrix(seq1: Tuple, seq2: Tuple): for i in tqdm(from_to(1, m), disable=not Config.progress): for j in from_to(1, n): D[i, j] = min( - D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution + D[i - 1, j - 1] + + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution D[i, j - 1] + 1, # Insertion - D[i - 1, j] + 1 # Deletion + D[i - 1, j] + 1, # Deletion ) return D @@ -81,8 +82,8 @@ def distance(s1: str, s2: str): Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme clusters. This should be the correct way to compare two Unicode strings. """ - seq1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) - seq2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) + seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1))) + seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2))) return levenshtein(seq1, seq2) @@ -106,11 +107,17 @@ def seq_editops(seq1, seq2): def _tail_backtrace(i, j, accumulator): if i > 0 and D[i - 1, j] + 1 == D[i, j]: - return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator) + return partial( + _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator + ) if j > 0 and D[i, j - 1] + 1 == D[i, j]: - return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator) + return partial( + _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator + ) if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]: - return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator) + return partial( + _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator + ) if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]: return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP return accumulator @@ -132,6 +139,6 @@ def editops(word1, word2): Note that this returns indices to the _grapheme clusters_, not characters! 
""" - word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1))) - word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2))) + word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1))) + word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2))) return seq_editops(word1, word2) diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index d785754..9703b6b 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -10,6 +10,7 @@ import numpy as np from lxml import etree as ET from ocrd_utils import getLogger + class Normalization(enum.Enum): NFC = 1 NFC_MUFI = 2 # TODO @@ -18,7 +19,7 @@ class Normalization(enum.Enum): def normalize(text, normalization): if normalization == Normalization.NFC: - return unicodedata.normalize('NFC', text) + return unicodedata.normalize("NFC", text) if normalization == Normalization.NFC_MUFI: raise NotImplementedError() if normalization == Normalization.NFC_SBB: @@ -36,31 +37,31 @@ def unjoin_ligatures(s): """Unjoin ligatures, i.e. ff becomes ff.""" equivalences = { - '': 'ſſ', - "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I - '': 'ch', - '': 'ck', - '': 'll', - '': 'ſi', - '': 'ſt', - 'fi': 'fi', - 'ff': 'ff', - 'fl': 'fl', - 'ffi': 'ffi', - '': 'ct', - '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ - '\uf532': 'as', # eMOP: Latin small ligature as - '\uf533': 'is', # eMOP: Latin small ligature is - '\uf534': 'us', # eMOP: Latin small ligature us - '\uf535': 'Qu', # eMOP: Latin ligature capital Q small u - 'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ - '\uE8BF': 'q&', + "": "ſſ", + "\ueba7": "ſſi", # MUFI: LATIN SMALL LIGATURE LONG S LONG S I + "": "ch", + "": "ck", + "": "ll", + "": "ſi", + "": "ſt", + "fi": "fi", + "ff": "ff", + "fl": "fl", + "ffi": "ffi", + "": "ct", + "": "tz", # MUFI: LATIN SMALL LIGATURE TZ + "\uf532": "as", # eMOP: Latin small ligature as + "\uf533": "is", # eMOP: Latin small ligature is + "\uf534": "us", # eMOP: Latin small ligature us + "\uf535": "Qu", # eMOP: Latin ligature capital Q small u + "ij": "ij", # U+0133 LATIN SMALL LIGATURE IJ + "\uE8BF": "q&", # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET # XXX How to replace this correctly? 
- '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P - 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST + "\uEBA5": "ſp", # MUFI: LATIN SMALL LIGATURE LONG S P + "st": "st", # U+FB06 LATIN SMALL LIGATURE ST } - s = unicodedata.normalize('NFC', s) + s = unicodedata.normalize("NFC", s) for fr, to in equivalences.items(): s = s.replace(fr, to) return s @@ -70,20 +71,20 @@ def substitute_equivalences(s): # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR # It might make sense to use different rules for GT and for the different OCR equivalences = { - '': 'ü', - '': 'ä', - '==': '–', # → en-dash - '—': '–', # em-dash → en-dash - '': 'ö', - '’': '\'', - '⸗': '-', - 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E - 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E - 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E - '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT + "": "ü", + "": "ä", + "==": "–", # → en-dash + "—": "–", # em-dash → en-dash + "": "ö", + "’": "'", + "⸗": "-", + "aͤ": "ä", # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E + "oͤ": "ö", # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E + "uͤ": "ü", # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E + "\uF50E": "q́", # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT } - s = unicodedata.normalize('NFC', s) + s = unicodedata.normalize("NFC", s) s = unjoin_ligatures(s) for fr, to in equivalences.items(): s = s.replace(fr, to) @@ -115,13 +116,14 @@ class ExtractedText: Objects of this class are guaranteed to be a. always in their normalization and b. in NFC. """ + segment_id = attr.ib(type=Optional[str]) @segment_id.validator def check(self, _, value): if value is None: return - if not re.match(r'[\w\d_-]+', value): + if not re.match(r"[\w\d_-]+", value): raise ValueError('Malformed segment id "{}"'.format(value)) # An object contains either @@ -141,7 +143,7 @@ class ExtractedText: def check(self, _, value): if value is not None and self.segments is not None: raise ValueError("Can't have both segments and text") - if value is not None and unicodedata.normalize('NFC', value) != value: + if value is not None and unicodedata.normalize("NFC", value) != value: raise ValueError('String "{}" is not in NFC.'.format(value)) if value is not None and normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) @@ -169,31 +171,24 @@ class ExtractedText: seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))] segment_id_for_pos.extend(seg_ids) segment_id_for_pos.extend(repeat(None, len(self.joiner))) - segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)] + segment_id_for_pos = segment_id_for_pos[: -len(self.joiner)] # This is frozen, so we have to jump through the hoop: - object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) + object.__setattr__(self, "_segment_id_for_pos", segment_id_for_pos) assert self._segment_id_for_pos return self._segment_id_for_pos[pos] @classmethod - def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'): + def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"): """Build an ExtractedText from a PAGE content text element""" - localname_for_textequiv_level = { - 'region': 'TextRegion', - 'line': 'TextLine' - } + localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"} textequiv_level_for_localname = invert_dict(localname_for_textequiv_level) - children_for_localname = { - 'TextRegion': 'TextLine' - 
} - joiner_for_textequiv_level = { - 'line': '\n' - } - - segment_id = text_segment.attrib['id'] + children_for_localname = {"TextRegion": "TextLine"} + joiner_for_textequiv_level = {"line": "\n"} + + segment_id = text_segment.attrib["id"] localname = ET.QName(text_segment).localname if localname == localname_for_textequiv_level[textequiv_level]: segment_text = None @@ -201,19 +196,20 @@ class ExtractedText: segment_text = get_textequiv_unicode(text_segment, nsmap) # FIXME hardcoded SBB normalization segment_text = normalize_sbb(segment_text) - segment_text = segment_text or '' + segment_text = segment_text or "" return cls(segment_id, None, None, segment_text) else: # Recurse sub_localname = children_for_localname[localname] sub_textequiv_level = textequiv_level_for_localname[sub_localname] segments = [] - for sub_segment in text_segment.iterfind('./page:%s' % sub_localname, - namespaces=nsmap): + for sub_segment in text_segment.iterfind( + "./page:%s" % sub_localname, namespaces=nsmap + ): segments.append( ExtractedText.from_text_segment( - sub_segment, nsmap, - textequiv_level=sub_textequiv_level) + sub_segment, nsmap, textequiv_level=sub_textequiv_level + ) ) joiner = joiner_for_textequiv_level[sub_textequiv_level] return cls(segment_id, segments, joiner, None) @@ -231,24 +227,24 @@ def invert_dict(d): def get_textequiv_unicode(text_segment, nsmap) -> str: """Get the TextEquiv/Unicode text of the given PAGE text element.""" - segment_id = text_segment.attrib['id'] - textequivs = text_segment.findall('./page:TextEquiv', namespaces=nsmap) + segment_id = text_segment.attrib["id"] + textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap) if not textequivs: - return '' + return "" textequiv = get_first_textequiv(textequivs, segment_id) - return textequiv.find('./page:Unicode', namespaces=nsmap).text or '' + return textequiv.find("./page:Unicode", namespaces=nsmap).text or "" def get_first_textequiv(textequivs, segment_id): """Get the first TextEquiv based on index or conf order if index is not present.""" - log = getLogger('processor.OcrdDinglehopperEvaluate') + log = getLogger("processor.OcrdDinglehopperEvaluate") if len(textequivs) == 1: return textequivs[0] # try ordering by index - indices = np.array([get_attr(te, 'index') for te in textequivs], dtype=float) + indices = np.array([get_attr(te, "index") for te in textequivs], dtype=float) nan_mask = np.isnan(indices) if np.any(~nan_mask): if np.any(nan_mask): @@ -256,10 +252,12 @@ def get_first_textequiv(textequivs, segment_id): index = np.nanargmin(indices) else: # try ordering by conf - confidences = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float) + confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float) if np.any(~np.isnan(confidences)): - log.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.", - segment_id) + log.info( + "No index attributes, use 'conf' attribute to sort TextEquiv in %s.", + segment_id, + ) index = np.nanargmax(confidences) else: # fallback to first entry in case of neither index or conf present diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 9cb2475..755061c 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -17,24 +17,27 @@ def alto_namespace(tree: ET.ElementTree) -> str: check if the files uses any valid ALTO namespace. 
""" root_name = ET.QName(tree.getroot().tag) - if root_name.localname == 'alto': + if root_name.localname == "alto": return root_name.namespace else: - raise ValueError('Not an ALTO tree') + raise ValueError("Not an ALTO tree") def alto_extract_lines(tree: ET.ElementTree) -> Generator[ExtractedText, None, None]: - nsmap = {'alto': alto_namespace(tree)} - for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap): - line_id = line.attrib.get('ID') - line_text = ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) + nsmap = {"alto": alto_namespace(tree)} + for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap): + line_id = line.attrib.get("ID") + line_text = " ".join( + string.attrib.get("CONTENT") + for string in line.iterfind("alto:String", namespaces=nsmap) + ) yield ExtractedText(line_id, None, None, normalize_sbb(line_text)) # FIXME hardcoded SBB normalization def alto_extract(tree: ET.ElementTree()) -> ExtractedText: """Extract text from the given ALTO ElementTree.""" - return ExtractedText(None, list(alto_extract_lines(tree)), '\n', None) + return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None) def alto_text(tree): @@ -48,56 +51,73 @@ def page_namespace(tree): do not check if the files uses any valid PAGE namespace. """ root_name = ET.QName(tree.getroot().tag) - if root_name.localname == 'PcGts': + if root_name.localname == "PcGts": return root_name.namespace else: - raise ValueError('Not a PAGE tree') + raise ValueError("Not a PAGE tree") -def page_extract(tree, *, textequiv_level='region'): +def page_extract(tree, *, textequiv_level="region"): """Extract text from the given PAGE content ElementTree.""" # Internally, this is just parsing the Reading Order (if it exists) and # and leaves reading the TextRegions to ExtractedText.from_text_segment(). 
- nsmap = {'page': page_namespace(tree)} + nsmap = {"page": page_namespace(tree)} regions = [] - reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap) + reading_order = tree.find(".//page:ReadingOrder", namespaces=nsmap) if reading_order is not None: - for group in reading_order.iterfind('./*', namespaces=nsmap): - if ET.QName(group.tag).localname == 'OrderedGroup': - region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap) - for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: int(r.attrib['index'])): - region_id = region_ref_indexed.attrib['regionRef'] - region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) + for group in reading_order.iterfind("./*", namespaces=nsmap): + if ET.QName(group.tag).localname == "OrderedGroup": + region_ref_indexeds = group.findall( + "./page:RegionRefIndexed", namespaces=nsmap + ) + for region_ref_indexed in sorted( + region_ref_indexeds, key=lambda r: int(r.attrib["index"]) + ): + region_id = region_ref_indexed.attrib["regionRef"] + region = tree.find( + './/page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap + ) if region is not None: - regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level)) + regions.append( + ExtractedText.from_text_segment( + region, nsmap, textequiv_level=textequiv_level + ) + ) else: pass # Not a TextRegion else: raise NotImplementedError else: - for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): - regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level)) + for region in tree.iterfind(".//page:TextRegion", namespaces=nsmap): + regions.append( + ExtractedText.from_text_segment( + region, nsmap, textequiv_level=textequiv_level + ) + ) # Filter empty region texts - regions = [r for r in regions if r.text != ''] + regions = [r for r in regions if r.text != ""] - return ExtractedText(None, regions, '\n', None) + return ExtractedText(None, regions, "\n", None) -def page_text(tree, *, textequiv_level='region'): +def page_text(tree, *, textequiv_level="region"): return page_extract(tree, textequiv_level=textequiv_level).text def plain_extract(filename): - with open(filename, 'r') as f: + with open(filename, "r") as f: return ExtractedText( - None, - [ExtractedText('line %d' % no, None, None, line) for no, line in enumerate(f.readlines())], - '\n', - None + None, + [ + ExtractedText("line %d" % no, None, None, line) + for no, line in enumerate(f.readlines()) + ], + "\n", + None, ) @@ -105,7 +125,7 @@ def plain_text(filename): return plain_extract(filename).text -def extract(filename, *, textequiv_level='region'): +def extract(filename, *, textequiv_level="region"): """Extract the text from the given file. Supports PAGE, ALTO and falls back to plain text. 
@@ -124,5 +144,5 @@ def text(filename): return extract(filename).text -if __name__ == '__main__': +if __name__ == "__main__": print(text(sys.argv[1])) diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py index 1850eb1..008b70c 100644 --- a/qurator/dinglehopper/ocrd_cli.py +++ b/qurator/dinglehopper/ocrd_cli.py @@ -10,7 +10,7 @@ from pkg_resources import resource_string from .cli import process as cli_process from .edit_distance import levenshtein_matrix_cache_clear -OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) +OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8")) @click.command() @@ -20,20 +20,19 @@ def ocrd_dinglehopper(*args, **kwargs): class OcrdDinglehopperEvaluate(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper'] + kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"] super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) def process(self): - assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR') + assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") assert_file_grp_cardinality(self.output_file_grp, 1) - log = getLogger('processor.OcrdDinglehopperEvaluate') + log = getLogger("processor.OcrdDinglehopperEvaluate") - metrics = self.parameter['metrics'] - textequiv_level = self.parameter['textequiv_level'] - gt_grp, ocr_grp = self.input_file_grp.split(',') + metrics = self.parameter["metrics"] + textequiv_level = self.parameter["textequiv_level"] + gt_grp, ocr_grp = self.input_file_grp.split(",") input_file_tuples = self._zip_input_files([gt_grp, ocr_grp]) for n, (gt_file, ocr_file) in enumerate(input_file_tuples): @@ -55,40 +54,47 @@ class OcrdDinglehopperEvaluate(Processor): except FileExistsError: pass cli_process( - gt_file.local_filename, - ocr_file.local_filename, - report_prefix, - metrics=metrics, - textequiv_level=textequiv_level + gt_file.local_filename, + ocr_file.local_filename, + report_prefix, + metrics=metrics, + textequiv_level=textequiv_level, ) # Add reports to the workspace - for report_suffix, mimetype in \ - [ - ['.html', 'text/html'], - ['.json', 'application/json'] - ]: + for report_suffix, mimetype in [ + [".html", "text/html"], + [".json", "application/json"], + ]: self.workspace.add_file( - ID=file_id + report_suffix, - file_grp=self.output_file_grp, - pageId=page_id, - mimetype=mimetype, - local_filename=report_prefix + report_suffix) + ID=file_id + report_suffix, + file_grp=self.output_file_grp, + pageId=page_id, + mimetype=mimetype, + local_filename=report_prefix + report_suffix, + ) # Clear cache between files levenshtein_matrix_cache_clear() def _zip_input_files(self, input_file_grps): - log = getLogger('processor.OcrdDinglehopperEvaluate') + log = getLogger("processor.OcrdDinglehopperEvaluate") input_file_tuples = list() - for page_id in ([self.page_id] if self.page_id else - self.workspace.mets.physical_pages): + for page_id in ( + [self.page_id] if self.page_id else self.workspace.mets.physical_pages + ): ifiles = list() for input_file_grp in input_file_grps: - log.debug("Adding input file group %s to page %s", input_file_grp, page_id) - files = self.workspace.mets.find_all_files(pageId=page_id, fileGrp=input_file_grp) + log.debug( + "Adding input file group %s to page %s", input_file_grp, page_id + ) + files = self.workspace.mets.find_all_files( + pageId=page_id, fileGrp=input_file_grp + ) if not files: - log.error('Found no page "%s" in file group %s', 
page_id, input_file_grp) + log.error( + 'Found no page "%s" in file group %s', page_id, input_file_grp + ) ifiles.append(None) else: ifiles.append(files[0]) @@ -97,5 +103,5 @@ class OcrdDinglehopperEvaluate(Processor): return input_file_tuples -if __name__ == '__main__': +if __name__ == "__main__": ocrd_dinglehopper() diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 2ce81cd..8a81587 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -10,25 +10,30 @@ from .. import seq_align, ExtractedText def test_text(): - test1 = ExtractedText(None, [ - ExtractedText('s0', None, None, 'foo'), - ExtractedText('s1', None, None, 'bar'), - ExtractedText('s2', None, None, 'bazinga') - ], ' ', None) - - assert test1.text == 'foo bar bazinga' - assert test1.segment_id_for_pos(0) == 's0' + test1 = ExtractedText( + None, + [ + ExtractedText("s0", None, None, "foo"), + ExtractedText("s1", None, None, "bar"), + ExtractedText("s2", None, None, "bazinga"), + ], + " ", + None, + ) + + assert test1.text == "foo bar bazinga" + assert test1.segment_id_for_pos(0) == "s0" assert test1.segment_id_for_pos(3) is None - assert test1.segment_id_for_pos(10) == 's2' + assert test1.segment_id_for_pos(10) == "s2" def test_normalization_check(): - with pytest.raises(ValueError, match=r'.*is not in NFC.*'): - ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ')) - assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ')) + with pytest.raises(ValueError, match=r".*is not in NFC.*"): + ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ")) + assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ")) -AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id') +AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id") def test_align(): @@ -39,25 +44,36 @@ def test_align(): not Python characters. """ - test1 = ExtractedText(None, [ - ExtractedText('s0', None, None, 'foo'), - ExtractedText('s1', None, None, 'bar'), - ExtractedText('s2', None, None, 'batzinga') - ], ' ', None) - test2 = ExtractedText(None, [ - ExtractedText('x0', None, None, 'foo'), - ExtractedText('x1', None, None, 'bar'), - # extra . - ExtractedText('x2', None, None, '.'), - # deletion + different grapheme cluster, m̃ also is two Python characters - ExtractedText('x3', None, None, 'bazim̃ga'), - ], ' ', None) + test1 = ExtractedText( + None, + [ + ExtractedText("s0", None, None, "foo"), + ExtractedText("s1", None, None, "bar"), + ExtractedText("s2", None, None, "batzinga"), + ], + " ", + None, + ) + test2 = ExtractedText( + None, + [ + ExtractedText("x0", None, None, "foo"), + ExtractedText("x1", None, None, "bar"), + # extra . 
+            ExtractedText("x2", None, None, "."),
+            # deletion + different grapheme cluster, m̃ also is two Python characters
+            ExtractedText("x3", None, None, "bazim̃ga"),
+        ],
+        " ",
+        None,
+    )
 
     left_pos = 0
     right_pos = 0
     alignment = []
-    for left, right in seq_align(grapheme_clusters(test1.text),
-                                 grapheme_clusters(test2.text)):
+    for left, right in seq_align(
+        grapheme_clusters(test1.text), grapheme_clusters(test2.text)
+    ):
         left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
         right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
         el = AlignmentElement(left, right, left_id, right_id)
@@ -67,46 +83,57 @@
         if right is not None:
             right_pos += len(right)
 
-    print('test1: {}'.format(test1.text))
-    print('test2: {}'.format(test2.text))
-
-    assert alignment[0] == ('f', 'f', 's0', 'x0')
-    assert alignment[8] == (None, '.', None, 'x2')
-    assert alignment[12] == ('t', None, 's2', None)
-    assert alignment[15] == ('n', 'm̃', 's2', 'x3')
-
-
-@pytest.mark.parametrize("attributes,expected_index,expected_log", [
-    ([], None, None),
-    (['index="0"'], 0, None),
-    ([''], 0, None),
-    (['conf="0.5"'], 0, None),
-    (['index="1"', 'index="0"'], 1, None),
-    (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
-    (['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
-     "No index attributes, use 'conf' attribute to sort TextEquiv"),
-    (['index="0"', ''], 0, "TextEquiv without index"),
-    (['', 'conf="0.4"'], 1,
-     "No index attributes, use 'conf' attribute to sort TextEquiv"),
-    (['', ''], 0, "No index attributes, use first TextEquiv"),
-])
+    print("test1: {}".format(test1.text))
+    print("test2: {}".format(test2.text))
+
+    assert alignment[0] == ("f", "f", "s0", "x0")
+    assert alignment[8] == (None, ".", None, "x2")
+    assert alignment[12] == ("t", None, "s2", None)
+    assert alignment[15] == ("n", "m̃", "s2", "x3")
+
+
+@pytest.mark.parametrize(
+    "attributes,expected_index,expected_log",
+    [
+        ([], None, None),
+        (['index="0"'], 0, None),
+        ([""], 0, None),
+        (['conf="0.5"'], 0, None),
+        (['index="1"', 'index="0"'], 1, None),
+        (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
+        (
+            ['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
+            2,
+            "No index attributes, use 'conf' attribute to sort TextEquiv",
+        ),
+        (['index="0"', ""], 0, "TextEquiv without index"),
+        (
+            ["", 'conf="0.4"'],
+            1,
+            "No index attributes, use 'conf' attribute to sort TextEquiv",
+        ),
+        (["", ""], 0, "No index attributes, use first TextEquiv"),
+    ],
+)
 def test_textequiv(attributes, expected_index, expected_log, caplog):
     """Test that extracting text from a PAGE TextEquiv is working without index attr."""
     caplog.set_level(logging.INFO)
-    xml = "<?xml version=\"1.0\"?>"
+    xml = '<?xml version="1.0"?>'
     ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
     text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]
 
-    equiv = ["<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
-             for i, attr in enumerate(attributes)]
+    equiv = [
+        "<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
+        for i, attr in enumerate(attributes)
+    ]
 
-    textline = "{0}<TextLine id=\"l3\" xmlns=\"{1}\">{2}</TextLine>"
-    textline = textline.format(xml, ns, ''.join(equiv))
+    textline = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'
+    textline = textline.format(xml, ns, "".join(equiv))
 
     root = ET.fromstring(textline)
-    result = ExtractedText.from_text_segment(root,
-                                             {'page': ns},
-                                             textequiv_level='line').text
+    result = ExtractedText.from_text_segment(
+        root, {"page": ns}, textequiv_level="line"
+    ).text
     if expected_index is None:
         assert not result
     else:
diff --git a/qurator/dinglehopper/tests/test_align.py 
b/qurator/dinglehopper/tests/test_align.py index 23483f8..9f9d926 100644 --- a/qurator/dinglehopper/tests/test_align.py +++ b/qurator/dinglehopper/tests/test_align.py @@ -3,64 +3,85 @@ from .. import align, seq_align, distance def test_left_empty(): - result = list(align('', 'foo')) - expected = [(None, 'f'), (None, 'o'), (None, 'o')] + result = list(align("", "foo")) + expected = [(None, "f"), (None, "o"), (None, "o")] assert result == expected def test_right_empty(): - result = list(align('foo', '')) - expected = [('f', None), ('o', None), ('o', None)] + result = list(align("foo", "")) + expected = [("f", None), ("o", None), ("o", None)] assert result == expected def test_left_longer(): - result = list(align('food', 'foo')) - expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)] + result = list(align("food", "foo")) + expected = [("f", "f"), ("o", "o"), ("o", "o"), ("d", None)] assert result == expected def test_right_longer(): - result = list(align('foo', 'food')) - expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')] + result = list(align("foo", "food")) + expected = [("f", "f"), ("o", "o"), ("o", "o"), (None, "d")] assert result == expected def test_some_diff(): - result = list(align('abcde', 'aaadef')) + result = list(align("abcde", "aaadef")) left, right = unzip(result) - assert list(left) == ['a', 'b', 'c', 'd', 'e', None] - assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f'] + assert list(left) == ["a", "b", "c", "d", "e", None] + assert list(right) == ["a", "a", "a", "d", "e", "f"] def test_longer(): - s1 = 'Dies ist eine Tst!' - s2 = 'Dies ist ein Test.' + s1 = "Dies ist eine Tst!" + s2 = "Dies ist ein Test." result = list(align(s1, s2)) # ; diffprint(*unzip(result)) - expected = [('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '), - ('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '), - ('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '), - ('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.')] + expected = [ + ("D", "D"), + ("i", "i"), + ("e", "e"), + ("s", "s"), + (" ", " "), + ("i", "i"), + ("s", "s"), + ("t", "t"), + (" ", " "), + ("e", "e"), + ("i", "i"), + ("n", "n"), + ("e", None), + (" ", " "), + ("T", "T"), + (None, "e"), + ("s", "s"), + ("t", "t"), + ("!", "."), + ] assert result == expected def test_completely_different(): - assert len(list(align('abcde', 'fghij'))) == 5 + assert len(list(align("abcde", "fghij"))) == 5 def test_with_some_fake_ocr_errors(): - result = list(align('Über die vielen Sorgen wegen desselben vergaß', - 'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab')) + result = list( + align( + "Über die vielen Sorgen wegen desselben vergaß", + "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab", + ) + ) left, right = unzip(result) # Beginning - assert list(left[:18]) == [None]*18 - assert list(right[:18]) == list('SomeJunk MoreJunk ') + assert list(left[:18]) == [None] * 18 + assert list(right[:18]) == list("SomeJunk MoreJunk ") # End - assert list(left[-1:]) == ['ß'] - assert list(right[-1:]) == ['b'] + assert list(left[-1:]) == ["ß"] + assert list(right[-1:]) == ["b"] def test_lines(): @@ -68,13 +89,30 @@ def test_lines(): This mainly serves as documentation for comparing lists of lines. 
""" - result = list(seq_align( - ['This is a line.', 'This is another', 'And the last line'], - ['This is a line.', 'This is another', 'J u n k', 'And the last line'] - )) + result = list( + seq_align( + ["This is a line.", "This is another", "And the last line"], + [ + "This is a line.", + "This is another", + "J u n k", + "And the last line", + ], + ) + ) left, right = unzip(result) - assert list(left) == ['This is a line.', 'This is another', None, 'And the last line'] - assert list(right) == ['This is a line.', 'This is another', 'J u n k', 'And the last line'] + assert list(left) == [ + "This is a line.", + "This is another", + None, + "And the last line", + ] + assert list(right) == [ + "This is a line.", + "This is another", + "J u n k", + "And the last line", + ] def test_lines_similar(): @@ -92,7 +130,7 @@ def test_lines_similar(): # Just an example! min_len = min(len(self._string), len(other._string)) if min_len > 0: - normalized_distance = distance(self._string, other._string)/min_len + normalized_distance = distance(self._string, other._string) / min_len similar = normalized_distance < 0.1 else: similar = False @@ -102,18 +140,39 @@ def test_lines_similar(): return not self.__eq__(other) def __repr__(self): - return 'SimilarString(\'%s\')' % self._string + return "SimilarString('%s')" % self._string def __hash__(self): return hash(self._string) - result = list(seq_align( - [SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')], - [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')] - )) + result = list( + seq_align( + [ + SimilarString("This is a line."), + SimilarString("This is another"), + SimilarString("And the last line"), + ], + [ + SimilarString("This is a ljne."), + SimilarString("This is another"), + SimilarString("J u n k"), + SimilarString("And the last line"), + ], + ) + ) left, right = unzip(result) - assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')] - assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')] + assert list(left) == [ + SimilarString("This is a line."), + SimilarString("This is another"), + None, + SimilarString("And the last line"), + ] + assert list(right) == [ + SimilarString("This is a ljne."), + SimilarString("This is another"), + SimilarString("J u n k"), + SimilarString("And the last line"), + ] # Test __eq__ (i.e. is it a substitution or a similar string?) assert list(left)[0] == list(right)[0] diff --git a/qurator/dinglehopper/tests/test_character_error_rate.py b/qurator/dinglehopper/tests/test_character_error_rate.py index b16d37c..39301b4 100644 --- a/qurator/dinglehopper/tests/test_character_error_rate.py +++ b/qurator/dinglehopper/tests/test_character_error_rate.py @@ -7,31 +7,35 @@ from .. 
import character_error_rate def test_character_error_rate(): - assert character_error_rate('a', 'a') == 0 - assert character_error_rate('a', 'b') == 1/1 - assert character_error_rate('Foo', 'Bar') == 3/3 + assert character_error_rate("a", "a") == 0 + assert character_error_rate("a", "b") == 1 / 1 + assert character_error_rate("Foo", "Bar") == 3 / 3 - assert character_error_rate('Foo', '') == 3/3 + assert character_error_rate("Foo", "") == 3 / 3 - assert character_error_rate('', '') == 0 - assert math.isinf(character_error_rate('', 'Foo')) + assert character_error_rate("", "") == 0 + assert math.isinf(character_error_rate("", "Foo")) - assert character_error_rate('Foo', 'Food') == 1/3 - assert character_error_rate('Fnord', 'Food') == 2/5 - assert character_error_rate('Müll', 'Mull') == 1/4 - assert character_error_rate('Abstand', 'Sand') == 4/7 + assert character_error_rate("Foo", "Food") == 1 / 3 + assert character_error_rate("Fnord", "Food") == 2 / 5 + assert character_error_rate("Müll", "Mull") == 1 / 4 + assert character_error_rate("Abstand", "Sand") == 4 / 7 def test_character_error_rate_hard(): - s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.') - s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!') # Different, decomposed! - assert character_error_rate(s1, s2) == 1/19 - - s1 = 'Schlyñ' - assert len(s1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points - s2 = 'Schlym̃' - assert len(s2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points + s1 = unicodedata.normalize("NFC", "Schlyñ lorem ipsum.") + s2 = unicodedata.normalize("NFD", "Schlyñ lorem ipsum!") # Different, decomposed! + assert character_error_rate(s1, s2) == 1 / 19 + + s1 = "Schlyñ" + assert ( + len(s1) == 6 + ) # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points + s2 = "Schlym̃" + assert ( + len(s2) == 7 + ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical. - assert character_error_rate(s2, s1) == 1/6 - assert character_error_rate(s1, s2) == 1/6 + assert character_error_rate(s2, s1) == 1 / 6 + assert character_error_rate(s1, s2) == 1 / 6 diff --git a/qurator/dinglehopper/tests/test_edit_distance.py b/qurator/dinglehopper/tests/test_edit_distance.py index fa901a8..dc1f202 100644 --- a/qurator/dinglehopper/tests/test_edit_distance.py +++ b/qurator/dinglehopper/tests/test_edit_distance.py @@ -6,35 +6,39 @@ from .. 
import levenshtein, distance def test_levenshtein(): - assert levenshtein('a', 'a') == 0 - assert levenshtein('a', 'b') == 1 - assert levenshtein('Foo', 'Bar') == 3 + assert levenshtein("a", "a") == 0 + assert levenshtein("a", "b") == 1 + assert levenshtein("Foo", "Bar") == 3 - assert levenshtein('', '') == 0 - assert levenshtein('Foo', '') == 3 - assert levenshtein('', 'Foo') == 3 + assert levenshtein("", "") == 0 + assert levenshtein("Foo", "") == 3 + assert levenshtein("", "Foo") == 3 - assert levenshtein('Foo', 'Food') == 1 - assert levenshtein('Fnord', 'Food') == 2 - assert levenshtein('Müll', 'Mull') == 1 - assert levenshtein('Abstand', 'Sand') == 4 + assert levenshtein("Foo", "Food") == 1 + assert levenshtein("Fnord", "Food") == 2 + assert levenshtein("Müll", "Mull") == 1 + assert levenshtein("Abstand", "Sand") == 4 def test_levenshtein_other_sequences(): - assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1 - assert levenshtein(['a', 'ab'], ['a', 'c']) == 1 + assert levenshtein(["a", "ab"], ["a", "ab", "c"]) == 1 + assert levenshtein(["a", "ab"], ["a", "c"]) == 1 def test_distance(): - assert distance('Fnord', 'Food') == 2 - assert distance('Müll', 'Mull') == 1 + assert distance("Fnord", "Food") == 2 + assert distance("Müll", "Mull") == 1 - word1 = unicodedata.normalize('NFC', 'Schlyñ') - word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed! + word1 = unicodedata.normalize("NFC", "Schlyñ") + word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed! assert distance(word1, word2) == 0 - word1 = 'Schlyñ' - assert len(word1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points - word2 = 'Schlym̃' - assert len(word2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points + word1 = "Schlyñ" + assert ( + len(word1) == 6 + ) # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points + word2 = "Schlym̃" + assert ( + len(word2) == 7 + ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points assert distance(word1, word2) == 1 diff --git a/qurator/dinglehopper/tests/test_editops.py b/qurator/dinglehopper/tests/test_editops.py index 8fafe5d..06afbfc 100644 --- a/qurator/dinglehopper/tests/test_editops.py +++ b/qurator/dinglehopper/tests/test_editops.py @@ -4,45 +4,60 @@ from .. 
import seq_editops, editops def test_trivial(): - assert seq_editops('abc', 'abc') == [] - assert seq_editops('', '') == [] + assert seq_editops("abc", "abc") == [] + assert seq_editops("", "") == [] def test_insert(): - assert seq_editops('bc', 'abc') == [('insert', 0, 0)] - assert seq_editops('ac', 'abc') == [('insert', 1, 1)] - assert seq_editops('ab', 'abc') == [('insert', 2, 2)] - assert seq_editops('', 'a') == [('insert', 0, 0)] + assert seq_editops("bc", "abc") == [("insert", 0, 0)] + assert seq_editops("ac", "abc") == [("insert", 1, 1)] + assert seq_editops("ab", "abc") == [("insert", 2, 2)] + assert seq_editops("", "a") == [("insert", 0, 0)] def test_multiple(): - assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)] + assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)] def test_delete(): - assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)] - assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)] - assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)] - assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)] - assert seq_editops('Foo', '') == [('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0)] - assert seq_editops('Foolish', 'Foo') == [('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3)] + assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)] + assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)] + assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)] + assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)] + assert seq_editops("Foo", "") == [ + ("delete", 0, 0), + ("delete", 1, 0), + ("delete", 2, 0), + ] + assert seq_editops("Foolish", "Foo") == [ + ("delete", 3, 3), + ("delete", 4, 3), + ("delete", 5, 3), + ("delete", 6, 3), + ] def test_ambiguous(): - assert seq_editops('bcd', 'abcef') == [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)] + assert seq_editops("bcd", "abcef") == [ + ("insert", 0, 0), + ("replace", 2, 3), + ("insert", 3, 4), + ] def test_editops(): """Test editops() in cases where dealing with grapheme clusters matters""" # In these cases, one of the words has a composed form, the other one does not. - assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)] - assert editops('oͤde', 'öde') == [('replace', 0, 0)] + assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)] + assert editops("oͤde", "öde") == [("replace", 0, 0)] def test_editops_canonically_equivalent(): - left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE') - right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE') + left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup( + "COMBINING TILDE" + ) + right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE") assert left != right - assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right) + assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right) assert editops(left, right) == [] diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py index b35974b..74b8c7e 100644 --- a/qurator/dinglehopper/tests/test_integ_align.py +++ b/qurator/dinglehopper/tests/test_integ_align.py @@ -7,7 +7,7 @@ from lxml import etree as ET from .. 
import align, page_text -data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") @pytest.mark.integration @@ -17,8 +17,8 @@ def test_align_page_files(): # (currently) not counted due to normalization. # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. - gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) - ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) + ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) result = list(align(gt, ocr)) for left, right in result: diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py index 1c3bf52..e307a84 100644 --- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py @@ -8,26 +8,34 @@ from uniseg.graphemecluster import grapheme_clusters from .. import character_error_rate, page_text, alto_text -data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") @pytest.mark.integration def test_character_error_rate_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. # The fi ligature does not count. - gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) - ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) + ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) gt_len = len(list(grapheme_clusters(gt))) - expected_cer = 2/gt_len + expected_cer = 2 / gt_len assert character_error_rate(gt, ocr) == expected_cer @pytest.mark.integration def test_character_error_rate_between_page_alto(): - gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) - ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) + gt = page_text( + ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml")) + ) + ocr = alto_text( + ET.parse( + os.path.join( + data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml" + ) + ) + ) assert gt == ocr assert character_error_rate(gt, ocr) == 0 @@ -35,7 +43,17 @@ def test_character_error_rate_between_page_alto(): @pytest.mark.integration def test_character_error_rate_between_page_alto_2(): - gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) - ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) - - assert character_error_rate(gt, ocr) == 8/591 # Manually verified + gt = page_text( + ET.parse( + os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml") + ) + ) + ocr = alto_text( + ET.parse( + os.path.join( + data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml" + ) + ) + ) + + assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py index d71bc14..d251f9d 100644 --- 
a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -10,31 +10,31 @@ def test_cli_json(tmp_path): """Test that the cli/process() yields a loadable JSON report""" with working_directory(str(tmp_path)): - with open('gt.txt', 'w') as gtf: - gtf.write('AAAAA') - with open('ocr.txt', 'w') as ocrf: - ocrf.write('AAAAB') + with open("gt.txt", "w") as gtf: + gtf.write("AAAAA") + with open("ocr.txt", "w") as ocrf: + ocrf.write("AAAAB") - with open('gt.txt', 'r') as gtf: + with open("gt.txt", "r") as gtf: print(gtf.read()) - process('gt.txt', 'ocr.txt', 'report') - with open('report.json', 'r') as jsonf: + process("gt.txt", "ocr.txt", "report") + with open("report.json", "r") as jsonf: print(jsonf.read()) - with open('report.json', 'r') as jsonf: + with open("report.json", "r") as jsonf: j = json.load(jsonf) - assert j['cer'] == pytest.approx(0.2) + assert j["cer"] == pytest.approx(0.2) def test_cli_json_cer_is_infinity(tmp_path): """Test that the cli/process() yields a loadable JSON report when CER == inf""" with working_directory(str(tmp_path)): - with open('gt.txt', 'w') as gtf: - gtf.write('') # Empty to yield CER == inf - with open('ocr.txt', 'w') as ocrf: - ocrf.write('Not important') + with open("gt.txt", "w") as gtf: + gtf.write("") # Empty to yield CER == inf + with open("ocr.txt", "w") as ocrf: + ocrf.write("Not important") - process('gt.txt', 'ocr.txt', 'report') - with open('report.json', 'r') as jsonf: + process("gt.txt", "ocr.txt", "report") + with open("report.json", "r") as jsonf: j = json.load(jsonf) - assert j['cer'] == pytest.approx(float('inf')) + assert j["cer"] == pytest.approx(float("inf")) diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py index cbe12f8..0e1e7da 100644 --- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -7,7 +7,7 @@ from lxml import etree as ET from .. import distance, page_text, alto_text -data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") @pytest.mark.integration @@ -15,15 +15,23 @@ def test_distance_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. # Due to normalization, we don't count the ligature. 
# → 2 differences - gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) - ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) + ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) assert distance(gt, ocr) == 2 @pytest.mark.integration def test_distance_between_page_alto(): - gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) - ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) + gt = page_text( + ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml")) + ) + ocr = alto_text( + ET.parse( + os.path.join( + data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml" + ) + ) + ) assert gt == ocr assert distance(gt, ocr) == 0 @@ -31,7 +39,17 @@ def test_distance_between_page_alto(): @pytest.mark.integration def test_distance_between_page_alto_2(): - gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) - ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) + gt = page_text( + ET.parse( + os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml") + ) + ) + ocr = alto_text( + ET.parse( + os.path.join( + data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml" + ) + ) + ) assert distance(gt, ocr) == 8 # Manually verified diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py index 5e535b5..5cf6a41 100644 --- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py +++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py @@ -10,27 +10,32 @@ from .util import working_directory from ..ocrd_cli import ocrd_dinglehopper -data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") def test_ocrd_cli(tmp_path): """Test OCR-D interface""" # Copy test workspace - test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162' - test_workspace_dir = tmp_path / 'test_ocrd_cli' + test_workspace_dir_source = Path(data_dir) / "actevedef_718448162" + test_workspace_dir = tmp_path / "test_ocrd_cli" shutil.copytree(str(test_workspace_dir_source), str(test_workspace_dir)) # Run through the OCR-D interface with working_directory(str(test_workspace_dir)): runner = CliRunner() args = [ - '-m', 'mets.xml', - '-I', 'OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI', - '-O', 'OCR-D-OCR-CALAMARI-EVAL' + "-m", + "mets.xml", + "-I", + "OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI", + "-O", + "OCR-D-OCR-CALAMARI-EVAL", ] - sys.argv[1:] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments + sys.argv[ + 1: + ] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments result = runner.invoke(ocrd_dinglehopper, args) assert result.exit_code == 0 - result_json = list((test_workspace_dir / 'OCR-D-OCR-CALAMARI-EVAL').glob('*.json')) - assert json.load(open(str(result_json[0])))['cer'] < 0.03 + result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json")) + assert json.load(open(str(result_json[0])))["cer"] < 0.03 diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py index f5c922b..ba865b4 100644 --- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py 
+++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -7,26 +7,36 @@ from lxml import etree as ET from .. import word_error_rate, words, page_text, alto_text -data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") @pytest.mark.integration def test_word_error_rate_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words, # the ligature does not count → 2 errors - gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) + gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) - gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line + gt_word_count = ( + 7 + 6 + 5 + 8 + 7 + 6 + 7 + 8 + 6 + 7 + 7 + 5 + 6 + 8 + 8 + 7 + 7 + 6 + 5 + 4 + ) # Manually verified word count per line assert len(list(words(gt))) == gt_word_count - ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert word_error_rate(gt, ocr) == 2/gt_word_count + ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) + assert word_error_rate(gt, ocr) == 2 / gt_word_count @pytest.mark.integration def test_word_error_rate_between_page_alto(): - gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) - ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) + gt = page_text( + ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml")) + ) + ocr = alto_text( + ET.parse( + os.path.join( + data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml" + ) + ) + ) assert gt == ocr assert word_error_rate(gt, ocr) == 0 @@ -34,11 +44,25 @@ def test_word_error_rate_between_page_alto(): @pytest.mark.integration def test_word_error_rate_between_page_alto_2(): - gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) - - gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line + gt = page_text( + ET.parse( + os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml") + ) + ) + + gt_word_count = ( + 14 + 18 + 17 + 14 + 17 + 17 + 3 + ) # Manually verified word count per line assert len(list(words(gt))) == gt_word_count - ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) - - assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors) + ocr = alto_text( + ET.parse( + os.path.join( + data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml" + ) + ) + ) + + assert ( + word_error_rate(gt, ocr) == 7 / gt_word_count + ) # Manually verified, 6 words are wrong, 1 got split (=2 errors) diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py index 6848fa1..fb38c4a 100644 --- a/qurator/dinglehopper/tests/test_ocr_files.py +++ b/qurator/dinglehopper/tests/test_ocr_files.py @@ -9,46 +9,54 @@ import pytest from .util import working_directory from .. 
import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
 
-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
 
 
 def test_alto_namespace():
-    tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
-    assert alto_namespace(tree) == 'http://www.loc.gov/standards/alto/ns-v3#'
+    tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
+    assert alto_namespace(tree) == "http://www.loc.gov/standards/alto/ns-v3#"
 
 
 def test_alto_text():
-    tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
+    tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
     result = alto_text(tree)
-    expected = textwrap.dedent("""\
+    expected = textwrap.dedent(
+        """\
         über die vielen Sorgen wegen deſſelben vergaß
         Hartkopf, der Frau Amtmännin das ver-
-        ſprochene zu überliefern.""")
+        ſprochene zu überliefern."""
+    )
     assert result == expected
 
 
 def test_alto_text_ALTO1():
-    tree = ET.parse(os.path.join(data_dir, 'test.alto1.xml'))
+    tree = ET.parse(os.path.join(data_dir, "test.alto1.xml"))
     assert "being erected at the Broadway stock" in alto_text(tree)
 
 
 def test_alto_text_ALTO2():
-    tree = ET.parse(os.path.join(data_dir, 'test.alto2.xml'))
-    assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in alto_text(tree)
+    tree = ET.parse(os.path.join(data_dir, "test.alto2.xml"))
+    assert (
+        "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden"
+        in alto_text(tree)
+    )
 
 
 def test_alto_text_ALTO3():
-    tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
+    tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
     assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree)
 
 
 def test_page_namespace():
-    tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
-    assert page_namespace(tree) == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15'
+    tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
+    assert (
+        page_namespace(tree)
+        == "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
+    )
 
 
 def test_page_test():
-    tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
+    tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
     result = page_text(tree)
 
     # We are currently normalizing on extraction, so the text is normalized.
@@ -74,7 +82,8 @@ def test_page_test():
     # Jndeß mangelten do einige Generalia, die
     # alſo wegfielen. — Hartkopf gieng ſelb
     # mit und berbrate es. —""")
-    expected = textwrap.dedent("""\
+    expected = textwrap.dedent(
+        """\
         über die vielen Sorgen wegen deſſelben vergaß
         Hartkopf, der Frau Amtmännin das ver-
         ſprochene zu überliefern. – Ein Erpreſſer
@@ -94,7 +103,8 @@ def test_page_test():
     ſie das, was da wäre, herbeyſchaffen möchte.
     Jndeß mangelten doch einige Generalia, die
     alſo wegfielen. – Hartkopf gieng ſelbſt
-    mit und überbrachte es. –""")
+    mit und überbrachte es. –"""
+    )
 
     assert result == expected
 
 
@@ -107,56 +117,69 @@ def test_page_with_empty_region():
     #             <Unicode></Unicode>
     #         </TextEquiv>
     #     </TextRegion>
-    tree = ET.parse(os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml'))
+    tree = ET.parse(
+        os.path.join(data_dir, "brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml")
+    )
     result = page_text(tree)
     assert result
 
 
 def test_page_order():
     # This file contains TextRegions where file order is not the same as reading order.
- tree = ET.parse(os.path.join(data_dir, 'order.page.xml')) + tree = ET.parse(os.path.join(data_dir, "order.page.xml")) result = page_text(tree) print(result) - assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL) + assert re.search( + r"Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die", + result, + re.DOTALL, + ) def test_page_mixed_regions(): # This file contains ImageRegions and TextRegions in the ReadingOrder - tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml')) + tree = ET.parse(os.path.join(data_dir, "mixed-regions.page.xml")) result = page_text(tree) - assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result + assert "non exaudiam uos. Chriſtiani uero quia orant iuxta" in result def test_page_level(): # This file contains inconsistent TextRegion and TextLine texts # TextRegion - tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml')) + tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml")) result = page_text(tree) - assert result == 'Inconsistent dummy region text' - tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml')) - result = page_text(tree, textequiv_level='region') - assert result == 'Inconsistent dummy region text' + assert result == "Inconsistent dummy region text" + tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml")) + result = page_text(tree, textequiv_level="region") + assert result == "Inconsistent dummy region text" # TextLine - tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml')) - result = page_text(tree, textequiv_level='line') - assert result == 'Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-' + tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml")) + result = page_text(tree, textequiv_level="line") + assert ( + result + == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-" + ) def test_text(): - assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) - assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) - assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt')) + assert "being erected at the Broadway stock" in text( + os.path.join(data_dir, "test.alto1.xml") + ) + assert "wieder ein. – Er langte den Zettel aus dem" in text( + os.path.join(data_dir, "test.page2018.xml") + ) + assert "Lorem ipsum" in text(os.path.join(data_dir, "test.txt")) def test_plain(tmp_path): with working_directory(str(tmp_path)): - with open('ocr.txt', 'w') as ocrf: - ocrf.write('AAAAB') + with open("ocr.txt", "w") as ocrf: + ocrf.write("AAAAB") - result = plain_text('ocr.txt') - expected = 'AAAAB' + result = plain_text("ocr.txt") + expected = "AAAAB" assert result == expected diff --git a/qurator/dinglehopper/tests/test_word_error_rate.py b/qurator/dinglehopper/tests/test_word_error_rate.py index ad19172..bc7b91e 100644 --- a/qurator/dinglehopper/tests/test_word_error_rate.py +++ b/qurator/dinglehopper/tests/test_word_error_rate.py @@ -6,32 +6,81 @@ from .. 
import word_error_rate, words def test_words(): - result = list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?')) - expected = ['Der', 'schnelle', 'braune', 'Fuchs', 'kann', 'keine', '3,14', 'Meter', 'springen', 'oder'] + result = list( + words("Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?") + ) + expected = [ + "Der", + "schnelle", + "braune", + "Fuchs", + "kann", + "keine", + "3,14", + "Meter", + "springen", + "oder", + ] assert result == expected def test_words_private_use_area(): - result = list(words( - 'ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n' - 'ſproene zu berliefern.')) + result = list( + words( + "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n" + "ſproene zu berliefern." + ) + ) expected = [ - 'ber', 'die', 'vielen', 'Sorgen', 'wegen', 'deelben', 'vergaß', 'Hartkopf', - 'der', 'Frau', 'Amtmnnin', 'das', 'ver', - 'ſproene', 'zu', 'berliefern'] + "ber", + "die", + "vielen", + "Sorgen", + "wegen", + "deelben", + "vergaß", + "Hartkopf", + "der", + "Frau", + "Amtmnnin", + "das", + "ver", + "ſproene", + "zu", + "berliefern", + ] assert result == expected def test_word_error_rate(): - assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0 - assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0 - assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz.') == 0 + assert ( + word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsatz!") == 0 + ) + assert ( + word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz!") + == 0 + ) + assert ( + word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz.") + == 0 + ) - assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsarz:') == 1/4 - assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ein ist Beispielsatz!') == 2/4 + assert ( + word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsarz:") + == 1 / 4 + ) + assert ( + word_error_rate("Dies ist ein Beispielsatz!", "Dies ein ist Beispielsatz!") + == 2 / 4 + ) - assert word_error_rate('Dies ist ein Beispielsatz!', '') == 4/4 - assert math.isinf(word_error_rate('', 'Dies ist ein Beispielsatz!')) - assert word_error_rate('', '') == 0 + assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4 + assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!")) + assert word_error_rate("", "") == 0 - assert word_error_rate('Schlyñ lorem ipsum dolor sit amet,', 'Schlym̃ lorem ipsum dolor sit amet.') == 1/6 + assert ( + word_error_rate( + "Schlyñ lorem ipsum dolor sit amet,", "Schlym̃ lorem ipsum dolor sit amet." 
+ ) + == 1 / 6 + ) diff --git a/qurator/dinglehopper/tests/util.py b/qurator/dinglehopper/tests/util.py index 1f224e5..8a735aa 100644 --- a/qurator/dinglehopper/tests/util.py +++ b/qurator/dinglehopper/tests/util.py @@ -27,6 +27,7 @@ def unzip(an_iterable_of_tuples): class working_directory: """Context manager to temporarily change the working directory""" + def __init__(self, wd): self.wd = wd diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index 2f5a1f6..dde57b9 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -20,9 +20,10 @@ def words(s: str): def new_word_break(c, index=0): if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area - return 'ALetter' + return "ALetter" else: return old_word_break(c, index) + uniseg.wordbreak.word_break = new_word_break # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar @@ -30,8 +31,8 @@ def words(s: str): # See https://www.fileformat.info/info/unicode/category/index.htm # and https://unicodebook.readthedocs.io/unicode.html#categories - unwanted_categories = 'O', 'M', 'P', 'Z', 'S' - unwanted_subcategories = 'Cc', 'Cf' + unwanted_categories = "O", "M", "P", "Z", "S" + unwanted_subcategories = "Cc", "Cf" subcat = unicodedata.category(c) cat = subcat[0] @@ -53,7 +54,7 @@ def words(s: ExtractedText): @multimethod def words_normalized(s: str): - return words(unicodedata.normalize('NFC', s)) + return words(unicodedata.normalize("NFC", s)) @multimethod @@ -69,7 +70,9 @@ def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]: @multimethod -def word_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]: +def word_error_rate_n( + reference: ExtractedText, compared: ExtractedText +) -> Tuple[float, int]: return word_error_rate_n(reference.text, compared.text) @@ -84,7 +87,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, i if d == 0: return 0, n if n == 0: - return float('inf'), n + return float("inf"), n return d / n, n diff --git a/setup.py b/setup.py index 7b8107a..56ae184 100644 --- a/setup.py +++ b/setup.py @@ -1,29 +1,29 @@ from io import open from setuptools import find_packages, setup -with open('requirements.txt') as fp: +with open("requirements.txt") as fp: install_requires = fp.read() setup( - name='dinglehopper', - author='Mike Gerber, The QURATOR SPK Team', - author_email='mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de', - description='The OCR evaluation tool', - long_description=open('README.md', 'r', encoding='utf-8').read(), - long_description_content_type='text/markdown', - keywords='qurator ocr', - license='Apache', - namespace_packages=['qurator'], - packages=find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests']), + name="dinglehopper", + author="Mike Gerber, The QURATOR SPK Team", + author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de", + description="The OCR evaluation tool", + long_description=open("README.md", "r", encoding="utf-8").read(), + long_description_content_type="text/markdown", + keywords="qurator ocr", + license="Apache", + namespace_packages=["qurator"], + packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), install_requires=install_requires, package_data={ - '': ['*.json', 'templates/*'], + "": ["*.json", "templates/*"], }, entry_points={ - 'console_scripts': [ - 'dinglehopper=qurator.dinglehopper.cli:main', - 
'dinglehopper-extract=qurator.dinglehopper.cli_extract:main', - 'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper', - ] - } + "console_scripts": [ + "dinglehopper=qurator.dinglehopper.cli:main", + "dinglehopper-extract=qurator.dinglehopper.cli_extract:main", + "ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper", + ] + }, )