diff --git a/.vimrc b/.vimrc
new file mode 100644
index 0000000..3b935a0
--- /dev/null
+++ b/.vimrc
@@ -0,0 +1,2 @@
+" project-specific .vimrc (needs set exrc + set secure)
+set textwidth=90
diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index ab44760..87febb7 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -28,16 +28,16 @@ def seq_align(s1, s2):
 
         if o:
             if o[0] == 'insert':
-                yield (None, s2[j])
+                yield None, s2[j]
                 j += 1
             elif o[0] == 'delete':
-                yield (s1[i], None)
+                yield s1[i], None
                 i += 1
             elif o[0] == 'replace':
-                yield (s1[i], s2[j])
+                yield s1[i], s2[j]
                 i += 1
                 j += 1
         else:
-            yield (s1[i], s2[j])
+            yield s1[i], s2[j]
             i += 1
             j += 1
diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 05cc931..e99f391 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -15,6 +15,10 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
     :return: character error rate and length of the reference
     """
     d = distance(reference, compared)
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(reference, ExtractedText):
+        reference = reference.text
     n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
 
     if d == 0:
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 63bfd92..9c963c1 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -8,11 +8,11 @@ from markupsafe import escape
 from qurator.dinglehopper import *
 
 
-def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
     gtx = ''
     ocrx = ''
 
-    def format_thing(t, css_classes=None):
+    def format_thing(t, css_classes=None, id_=None):
         if t is None:
             html_t = none
             css_classes += ' ellipsis'
@@ -21,19 +21,51 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
         else:
             html_t = escape(t)
 
+        html_custom_attrs = ""
+
+        # Set Bootstrap tooltip to the segment id
+        if id_:
+            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
+
         if css_classes:
-            return '<span class="{css_classes}">{html_t}</span>'.format(css_classes=css_classes, html_t=html_t)
+            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
         else:
             return '{html_t}'.format(html_t=html_t)
 
-    for k, (g, o) in enumerate(align(gt_things, ocr_things)):
-        if g == o:
-            css_classes = None
-        else:
+    if isinstance(gt_in, ExtractedText):
+        if not isinstance(ocr_in, ExtractedText):
+            raise TypeError()
+        # XXX splitting should be done in ExtractedText
+        gt_things = list(grapheme_clusters(gt_in.text))
+        ocr_things = list(grapheme_clusters(ocr_in.text))
+    else:
+        gt_things = gt_in
+        ocr_things = ocr_in
+
+
+
+    g_pos = 0
+    o_pos = 0
+    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
+        css_classes = None
+        gt_id = None
+        ocr_id = None
+        if g != o:
             css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+            if isinstance(gt_in, ExtractedText):
+                gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
+                ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
+                # Deletions and inserts only produce one id + None, UI must
+                # support this, i.e. display for the one id produced
+
+        gtx += joiner + format_thing(g, css_classes, gt_id)
+        ocrx += joiner + format_thing(o, css_classes, ocr_id)
+
+        if g is not None:
+            g_pos += len(g)
+        if o is not None:
+            o_pos += len(o)
 
-        gtx += joiner + format_thing(g, css_classes)
-        ocrx += joiner + format_thing(o, css_classes)
 
     return \
         '''
@@ -51,20 +83,17 @@ def process(gt, ocr, report_prefix, *, metrics=True):
     Click on a wrapper.
    """
 
-    gt_text = text(gt)
-    ocr_text = text(ocr)
-
-    gt_text = substitute_equivalences(gt_text)
-    ocr_text = substitute_equivalences(ocr_text)
+    gt_text = extract(gt)
+    ocr_text = extract(ocr)
 
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
     wer, n_words = word_error_rate_n(gt_text, ocr_text)
 
-    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
+    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
 
     gt_words = words_normalized(gt_text)
     ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
+    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
 
     def json_float(value):
         """Convert a float value to an JSON float.
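For illustration, this is the kind of markup the reworked format_thing() emits for a differing grapheme cluster once a segment id is passed in. The snippet below is a standalone sketch, not part of the diff; the class string 'cdiff0 diff' and the id 'region0001' are made-up example values.

    # Sketch: reproduce gen_diff_report's span markup for one differing grapheme.
    css_classes = 'cdiff0 diff'      # example value
    id_ = 'region0001'               # hypothetical segment id
    html_custom_attrs = 'data-toggle="tooltip" title="{}"'.format(id_)
    html = '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
        css_classes=css_classes, html_custom_attrs=html_custom_attrs, html_t='a')
    print(html)
    # <span class="cdiff0 diff" data-toggle="tooltip" title="region0001">a</span>
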
+ """ word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1))) word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2))) return seq_editops(word1, word2) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index b57a047..a048b1e 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,11 +1,97 @@ from __future__ import division, print_function +from typing import Optional from warnings import warn from lxml import etree as ET +from lxml.etree import XMLSyntaxError +from contextlib import suppress +from itertools import repeat +from .substitute_equivalences import substitute_equivalences import sys +import attr +import enum +import unicodedata +import re + + +@attr.s(frozen=True) +class ExtractedText: + segments = attr.ib(converter=list) + joiner = attr.ib(type=str) + # TODO Types are not validated (attr does not do this yet) + + @property + def text(self): + return self.joiner.join(s.text for s in self.segments) + + _segment_id_for_pos = None + + def segment_id_for_pos(self, pos): + # Calculate segment ids once, on the first call + if not self._segment_id_for_pos: + segment_id_for_pos = [] + for s in self.segments: + segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) + segment_id_for_pos.extend(repeat(None, len(self.joiner))) + # This is frozen, so we have to jump through the hoop: + object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) + assert self._segment_id_for_pos + + return self._segment_id_for_pos[pos] + + +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 # TODO + NFC_SBB = 3 + + +def normalize(text, normalization): + if normalization == Normalization.NFC: + return unicodedata.normalize('NFC', text) + if normalization == Normalization.NFC_MUFI: + raise NotImplementedError() + if normalization == Normalization.NFC_SBB: + return substitute_equivalences(text) + else: + raise ValueError() -from lxml.etree import XMLSyntaxError + +# XXX hack +def normalize_sbb(t): + return normalize(t, Normalization.NFC_SBB) + + +@attr.s(frozen=True) +class ExtractedTextSegment: + segment_id = attr.ib(type=Optional[str]) + + @segment_id.validator + def check(self, _, value): + if value is None: + return + if not re.match(r'[\w\d_-]+', value): + raise ValueError('Malformed segment id "{}"'.format(value)) + text = attr.ib(type=str) + + @text.validator + def check(self, _, value): + if value is not None and normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) + + @classmethod + def from_text_segment(cls, text_segment, nsmap): + """Build an ExtractedTextSegment from a PAGE content text element""" + + segment_id = text_segment.attrib['id'] + segment_text = None + with suppress(AttributeError): + segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = segment_text or '' + segment_text = normalize_sbb(segment_text) + return cls(segment_id, segment_text) def alto_namespace(tree): @@ -21,7 +107,7 @@ def alto_namespace(tree): raise ValueError('Not an ALTO tree') -def alto_text(tree): +def alto_extract(tree): """Extract text from the given ALTO ElementTree.""" nsmap = {'alto': alto_namespace(tree)} @@ -29,9 +115,18 @@ def alto_text(tree): lines = ( ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) for line in tree.iterfind('.//alto:TextLine', 
namespaces=nsmap)) - text_ = '\n'.join(lines) - return text_ + return ExtractedText( + (ExtractedTextSegment(None, normalize_sbb(line_text)) for line_text in lines), + '\n' + ) + # TODO This currently does not extract any segment id, because we are + # clueless about the ALTO format. + # FIXME needs to handle normalization + + +def alto_text(tree): + return alto_extract(tree).text def page_namespace(tree): @@ -47,18 +142,12 @@ def page_namespace(tree): raise ValueError('Not a PAGE tree') -def page_text(tree): +def page_extract(tree): """Extract text from the given PAGE content ElementTree.""" nsmap = {'page': page_namespace(tree)} - def region_text(region): - try: - return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text - except AttributeError: - return None - - region_texts = [] + regions = [] reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap) if reading_order is not None: for group in reading_order.iterfind('./*', namespaces=nsmap): @@ -68,39 +157,55 @@ def page_text(tree): region_id = region_ref_indexed.attrib['regionRef'] region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) if region is not None: - region_texts.append(region_text(region)) + regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) else: warn('Not a TextRegion: "%s"' % region_id) else: raise NotImplementedError else: for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): - region_texts.append(region_text(region)) + regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) - # XXX Does a file have to have regions etc.? region vs lines etc. # Filter empty region texts - region_texts = (t for t in region_texts if t) + regions = (r for r in regions if r.text is not None) - text_ = '\n'.join(region_texts) + return ExtractedText(regions, '\n') + # FIXME needs to handle normalization - return text_ +def page_text(tree): + return page_extract(tree).text -def text(filename): - """Read the text from the given file. + +def plain_extract(filename): + with open(filename, 'r') as f: + return ExtractedText( + (ExtractedTextSegment('line %d' % no, line) for no, line in enumerate(f.readlines())), + '\n' + ) + + +def plain_text(filename): + return plain_extract(filename).text + + +def extract(filename): + """Extract the text from the given file. Supports PAGE, ALTO and falls back to plain text. """ - try: tree = ET.parse(filename) except XMLSyntaxError: - with open(filename, 'r') as f: - return f.read() + return plain_extract(filename) try: - return page_text(tree) + return page_extract(tree) except ValueError: - return alto_text(tree) + return alto_extract(tree) + + +def text(filename): + return extract(filename).text if __name__ == '__main__': diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py index 1b7e0cf..39be276 100644 --- a/qurator/dinglehopper/substitute_equivalences.py +++ b/qurator/dinglehopper/substitute_equivalences.py @@ -1,21 +1,15 @@ import unicodedata -def substitute_equivalences(s): +def unjoin_ligatures(s): + """Unjoin ligatures, i.e. 
ff becomes ff.""" - # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR - # It might make sense to use different rules for GT and for the different OCR equivalences = { - '': 'ü', '': 'ſſ', "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I - '': 'ä', '': 'ch', - '==': '–', # → en-dash - '—': '–', # em-dash → en-dash '': 'ck', '': 'll', - '': 'ö', '': 'ſi', '': 'ſt', 'fi': 'fi', @@ -23,12 +17,7 @@ def substitute_equivalences(s): 'fl': 'fl', 'ffi': 'ffi', '': 'ct', - '’': '\'', - '⸗': '-', '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ - 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E - 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E - 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E '\uf532': 'as', # eMOP: Latin small ligature as '\uf533': 'is', # eMOP: Latin small ligature is '\uf534': 'us', # eMOP: Latin small ligature us @@ -37,10 +26,32 @@ def substitute_equivalences(s): '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST + } + s = unicodedata.normalize('NFC', s) + for fr, to in equivalences.items(): + s = s.replace(fr, to) + return s + + +def substitute_equivalences(s): + # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR + # It might make sense to use different rules for GT and for the different OCR + equivalences = { + '': 'ü', + '': 'ä', + '==': '–', # → en-dash + '—': '–', # em-dash → en-dash + '': 'ö', + '’': '\'', + '⸗': '-', + 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E + 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E + 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT } s = unicodedata.normalize('NFC', s) + s = unjoin_ligatures(s) for fr, to in equivalences.items(): s = s.replace(fr, to) return s diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js index ac43676..4c2ba28 100644 --- a/qurator/dinglehopper/templates/report.html.js +++ b/qurator/dinglehopper/templates/report.html.js @@ -1,14 +1,15 @@ function find_diff_class(classes) { - return classes.split(/\s+/).find(x => x.match(/.diff\d.*/)); + return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/))); } $(document).ready(function() { + /* Enable Bootstrap tooltips */ + $('[data-toggle="tooltip"]').tooltip(); + $('.diff').mouseover(function() { - let c = find_diff_class($(this).attr('class')) - $('.' + c).addClass('diff-highlight') + find_diff_class($(this).attr('class')).addClass('diff-highlight'); }); $('.diff').mouseout(function() { - let c = find_diff_class($(this).attr('class')) - $('.' 
+ c).removeClass('diff-highlight') + find_diff_class($(this).attr('class')).removeClass('diff-highlight'); }); }); diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py new file mode 100644 index 0000000..2e6a9e6 --- /dev/null +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -0,0 +1,68 @@ +import unicodedata +import pytest +from qurator.dinglehopper import ExtractedText, ExtractedTextSegment +from uniseg.graphemecluster import grapheme_clusters +from qurator.dinglehopper import seq_align +from collections import namedtuple + + +def test_text(): + test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') + ], ' ') + + assert test1.text == 'foo bar bazinga' + assert test1.segment_id_for_pos(0) == 's0' + assert test1.segment_id_for_pos(3) is None + assert test1.segment_id_for_pos(10) == 's2' + + +def test_normalization_check(): + with pytest.raises(ValueError, match=r'.*is not normalized.*'): + ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) + assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) + + +AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id') + + +def test_align(): + """ + Test aligning by character while retaining segment id info + + The difficulty here is that aligning should work on grapheme clusters, + not Python characters. + """ + + test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'batzinga') + ], ' ') + test2 = ExtractedText([ + ExtractedTextSegment('x0', 'foo'), + ExtractedTextSegment('x1', 'bar'), + ExtractedTextSegment('x2', '.'), # extra . + ExtractedTextSegment('x3', 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters + ], ' ') + + left_pos = 0; right_pos = 0; alignment = [] + for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): + left_id = test1.segment_id_for_pos(left_pos) if left is not None else None + right_id = test2.segment_id_for_pos(right_pos) if right is not None else None + el = AlignmentElement(left, right, left_id, right_id) + alignment.append(el) + if left is not None: + left_pos += len(left) + if right is not None: + right_pos += len(right) + + print('test1: {}'.format(test1.text)) + print('test2: {}'.format(test2.text)) + + assert alignment[0] == ('f', 'f', 's0', 'x0') + assert alignment[8] == (None, '.', None, 'x2') + assert alignment[12] == ('t', None, 's2', None) + assert alignment[15] == ('n', 'm̃', 's2', 'x3') diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py index cc5cb43..23483f8 100644 --- a/qurator/dinglehopper/tests/test_align.py +++ b/qurator/dinglehopper/tests/test_align.py @@ -78,7 +78,8 @@ def test_lines(): def test_lines_similar(): - """Test comparing list of lines while using a "weaker equivalence". + """ + Test comparing list of lines while using a "weaker equivalence". This mainly serves as documentation. """ @@ -88,7 +89,14 @@ def test_lines_similar(): self._string = string def __eq__(self, other): - return distance(self._string, other._string) < 2 # XXX NOT the final version + # Just an example! 
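An aside on the grapheme-cluster point this new test makes: uniseg keeps a combining sequence such as m̃ (m + U+0303) together as one cluster, while Python string indexing sees two code points. A quick, self-contained illustration (not part of the diff):

    from uniseg.graphemecluster import grapheme_clusters

    s = 'bazim̃ga'                           # 'm̃' is 'm' + COMBINING TILDE
    print(len(s))                            # 8 code points
    print(len(list(grapheme_clusters(s))))   # 7 grapheme clusters
    print(list(grapheme_clusters(s))[4])     # 'm̃' stays in one piece
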
diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py
index cc5cb43..23483f8 100644
--- a/qurator/dinglehopper/tests/test_align.py
+++ b/qurator/dinglehopper/tests/test_align.py
@@ -78,7 +78,8 @@ def test_lines():
 
 
 def test_lines_similar():
-    """Test comparing list of lines while using a "weaker equivalence".
+    """
+    Test comparing list of lines while using a "weaker equivalence".
 
     This mainly serves as documentation.
     """
@@ -88,7 +89,14 @@ def test_lines_similar():
             self._string = string
 
         def __eq__(self, other):
-            return distance(self._string, other._string) < 2  # XXX NOT the final version
+            # Just an example!
+            min_len = min(len(self._string), len(other._string))
+            if min_len > 0:
+                normalized_distance = distance(self._string, other._string)/min_len
+                similar = normalized_distance < 0.1
+            else:
+                similar = False
+            return similar
 
         def __ne__(self, other):
             return not self.__eq__(other)
@@ -106,3 +114,6 @@ def test_lines_similar():
     left, right = unzip(result)
     assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')]
     assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
+
+    # Test __eq__ (i.e. is it a substitution or a similar string?)
+    assert list(left)[0] == list(right)[0]
diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py
index df1e230..b35974b 100644
--- a/qurator/dinglehopper/tests/test_integ_align.py
+++ b/qurator/dinglehopper/tests/test_integ_align.py
@@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_align_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
-    # → 4 elements in the alignment should be different.
+    # → 2 elements in the alignment should be different, the ligature is
+    # (currently) not counted due to normalization.
     # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
 
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
 
     result = list(align(gt, ocr))
-    assert sum(left != right for left, right in result) == 4
+    for left, right in result:
+        if left != right:
+            print(left, right)
+    assert sum(left != right for left, right in result) == 2
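Why the expected count drops from 4 to 2: text is now normalized on extraction (normalize_sbb() runs substitute_equivalences(), which in turn runs the new unjoin_ligatures()), so a ligature in the ground truth compares equal to its two-letter expansion in the fake OCR. A minimal check of that assumption (import path assumed; not part of the diff):

    from qurator.dinglehopper.substitute_equivalences import substitute_equivalences

    # U+FB06 LATIN SMALL LIGATURE ST is unjoined before any comparison
    assert substitute_equivalences('\ufb06') == 'st'
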
diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
index c27cd31..1c3bf52 100644
--- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
@@ -4,6 +4,7 @@ import os
 
 import pytest
 from lxml import etree as ET
+from uniseg.graphemecluster import grapheme_clusters
 
 from .. import character_error_rate, page_text, alto_text
 
@@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_character_error_rate_between_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
+    # The ﬁ ligature does not count.
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311)  # 2 TextRegions, 1 \n
+
+    gt_len = len(list(grapheme_clusters(gt)))
+    expected_cer = 2/gt_len
+
+    assert character_error_rate(gt, ocr) == expected_cer
 
 
 @pytest.mark.integration
diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
index 5699700..d71bc14 100644
--- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
@@ -1,4 +1,3 @@
-import os
 import json
 
 import pytest
@@ -10,14 +9,17 @@ from ..cli import process
 
 def test_cli_json(tmp_path):
     """Test that the cli/process() yields a loadable JSON report"""
-    # XXX Path.__str__() is necessary for Python 3.5
     with working_directory(str(tmp_path)):
         with open('gt.txt', 'w') as gtf:
             gtf.write('AAAAA')
         with open('ocr.txt', 'w') as ocrf:
             ocrf.write('AAAAB')
 
+        with open('gt.txt', 'r') as gtf:
+            print(gtf.read())
         process('gt.txt', 'ocr.txt', 'report')
+        with open('report.json', 'r') as jsonf:
+            print(jsonf.read())
 
         with open('report.json', 'r') as jsonf:
             j = json.load(jsonf)
             assert j['cer'] == pytest.approx(0.2)
@@ -26,7 +28,6 @@ def test_cli_json(tmp_path):
 
 def test_cli_json_cer_is_infinity(tmp_path):
     """Test that the cli/process() yields a loadable JSON report when CER == inf"""
-    # XXX Path.__str__() is necessary for Python 3.5
     with working_directory(str(tmp_path)):
         with open('gt.txt', 'w') as gtf:
             gtf.write('')  # Empty to yield CER == inf
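As a cross-check on the 0.2 asserted above: 'AAAAA' vs. 'AAAAB' is one substitution over five grapheme clusters, so CER = 1/5 (sketch, not part of the diff):

    from qurator.dinglehopper import character_error_rate

    assert character_error_rate('AAAAA', 'AAAAB') == 1 / 5  # d = 1, n = 5
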
diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
index 2857d56..cbe12f8 100644
--- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
@@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_distance_between_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
+    # Due to normalization, we don't count the ligature.
+    # → 2 differences
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert distance(gt, ocr) == 4
+    assert distance(gt, ocr) == 2
 
 
 @pytest.mark.integration
diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
index 41da748..75bb816 100644
--- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
+++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
@@ -1,11 +1,9 @@
 import os
-import re
 import shutil
 import json
 from pathlib import Path
 
 from click.testing import CliRunner
-import pytest
 
 from .util import working_directory
 
@@ -17,8 +15,6 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 def test_ocrd_cli(tmp_path):
     """Test OCR-D interface"""
 
-    # XXX Path.str() is necessary for Python 3.5
-
     # Copy test workspace
     test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162'
     test_workspace_dir = tmp_path / 'test_ocrd_cli'
diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
index 1d2dead..f5c922b 100644
--- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
@@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 
 @pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. → 3 changed words
+    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. So we have 3 changed words,
+    # the ligature does not count → 2 errors
 
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
 
     gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4  # Manually verified word count per line
     assert len(list(words(gt))) == gt_word_count
 
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert word_error_rate(gt, ocr) == 3/gt_word_count
+    assert word_error_rate(gt, ocr) == 2/gt_word_count
 
 
 @pytest.mark.integration
diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py
index dd9377a..3291152 100644
--- a/qurator/dinglehopper/tests/test_ocr_files.py
+++ b/qurator/dinglehopper/tests/test_ocr_files.py
@@ -6,7 +6,8 @@
 import textwrap
 
 import pytest
 
-from .. import alto_namespace, alto_text, page_namespace, page_text, text
+from .util import working_directory
+from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
 
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 
@@ -49,27 +50,51 @@ def test_page_namespace():
 def test_page_test():
     tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
     result = page_text(tree)
+
+    # We are currently normalizing on extraction, so the text is normalized.
+    #
+    # expected = textwrap.dedent("""\
+    #     ber die vielen Sorgen wegen deelben vergaß
+    #     Hartkopf, der Frau Amtmnnin das ver⸗
+    #     ſproene zu berliefern. — Ein Erpreer
+    #     wurde an ihn abgeſit, um ihn ums Him⸗
+    #     melswien zu ſagen, daß er das Verſproene
+    #     glei den Augenbli berbringen mte, die
+    #     Frau Amtmnnin htte  auf ihn verlaen,
+    #     und nun wßte e nit, was e anfangen
+    #     ſote. Den Augenbli ſote er kommen,
+    #     ſon vergieng e in ihrer Ang. — Die
+    #     Ge wren ſon angekommen, und es fehlte
+    #     ihr do no an aem. —
+    #     Hartkopf mußte  er bennen, und
+    #     endli na langem Nadenken fiel es ihm er
+    #     wieder ein. — Er langte den Zettel aus dem
+    #     Accisbue heraus, und ſagte ſeiner Frau, daß
+    #     e das, was da wre, herbeyſaffen mte.
+    #     Jndeß mangelten do einige Generalia, die
+    #     alſo wegfielen. — Hartkopf gieng ſelb
+    #     mit und berbrate es. —""")
     expected = textwrap.dedent("""\
-        ber die vielen Sorgen wegen deelben vergaß
-        Hartkopf, der Frau Amtmnnin das ver⸗
-        ſproene zu berliefern. — Ein Erpreer
-        wurde an ihn abgeſit, um ihn ums Him⸗
-        melswien zu ſagen, daß er das Verſproene
-        glei den Augenbli berbringen mte, die
-        Frau Amtmnnin htte  auf ihn verlaen,
-        und nun wßte e nit, was e anfangen
-        ſote. Den Augenbli ſote er kommen,
-        ſon vergieng e in ihrer Ang. — Die
-        Ge wren ſon angekommen, und es fehlte
-        ihr do no an aem. —
-        Hartkopf mußte  er bennen, und
-        endli na langem Nadenken fiel es ihm er
-        wieder ein. — Er langte den Zettel aus dem
-        Accisbue heraus, und ſagte ſeiner Frau, daß
-        e das, was da wre, herbeyſaffen mte.
-        Jndeß mangelten do einige Generalia, die
-        alſo wegfielen. — Hartkopf gieng ſelb
-        mit und berbrate es. —""")
+        über die vielen Sorgen wegen deſſelben vergaß
+        Hartkopf, der Frau Amtmännin das ver-
+        ſprochene zu überliefern. – Ein Erpreſſer
+        wurde an ihn abgeſchickt, um ihn ums Him-
+        melswillen zu ſagen, daß er das Verſprochene
+        gleich den Augenblick überbringen möchte, die
+        Frau Amtmännin hätte ſich auf ihn verlaſſen,
+        und nun wüßte ſie nicht, was ſie anfangen
+        ſollte. Den Augenblick ſollte er kommen,
+        ſonſt vergieng ſie in ihrer Angſt. – Die
+        Gäſte wären ſchon angekommen, und es fehlte
+        ihr doch noch an allem. –
+        Hartkopf mußte ſich erſt beſinnen, und
+        endlich nach langem Nachdenken fiel es ihm erſt
+        wieder ein. – Er langte den Zettel aus dem
+        Accisbuche heraus, und ſagte ſeiner Frau, daß
+        ſie das, was da wäre, herbeyſchaffen möchte.
+        Jndeß mangelten doch einige Generalia, die
+        alſo wegfielen. – Hartkopf gieng ſelbſt
+        mit und überbrachte es. –""")
 
     assert result == expected
 
@@ -92,7 +117,8 @@ def test_page_order():
     tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
     result = page_text(tree)
 
-    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
+    print(result)
+    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)
 
 
 def test_page_mixed_regions():
@@ -106,5 +132,15 @@ def test_page_mixed_regions():
 
 def test_text():
     assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
-    assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
+    assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
     assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
+
+
+def test_plain(tmp_path):
+    with working_directory(str(tmp_path)):
+        with open('ocr.txt', 'w') as ocrf:
+            ocrf.write('AAAAB')
+
+        result = plain_text('ocr.txt')
+        expected = 'AAAAB'
+        assert result == expected
diff --git a/qurator/dinglehopper/tests/util.py b/qurator/dinglehopper/tests/util.py
index 52b7506..1f224e5 100644
--- a/qurator/dinglehopper/tests/util.py
+++ b/qurator/dinglehopper/tests/util.py
@@ -21,8 +21,8 @@ def diffprint(x, y):
         _diffprint(x, y)
 
 
-def unzip(l):
-    return zip(*l)
+def unzip(an_iterable_of_tuples):
+    return zip(*an_iterable_of_tuples)
 
 
 class working_directory:
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index 7ed56e4..64eba0a 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -32,6 +32,11 @@ def words(s):
         cat = subcat[0]
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(s, ExtractedText):
+        s = s.text
+
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
     # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
     for word in uniseg.wordbreak.words(s):
@@ -42,10 +47,20 @@ def words(s):
 
 
 def words_normalized(s):
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(s, ExtractedText):
+        s = s.text
     return words(unicodedata.normalize('NFC', s))
 
 
 def word_error_rate_n(reference, compared) -> Tuple[float, int]:
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(reference, ExtractedText):
+        reference = reference.text
+    if isinstance(compared, ExtractedText):
+        compared = compared.text
     if isinstance(reference, str):
         reference_seq = list(words_normalized(reference))
         compared_seq = list(words_normalized(compared))
diff --git a/requirements.txt b/requirements.txt
index de6547b..846990b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,6 @@ lxml
 uniseg
 numpy
 colorama
+MarkupSafe
 ocrd >= 1.0.0b15
 attrs
diff --git a/setup.cfg b/setup.cfg
index 6deafc2..43d7a3a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,2 @@
 [flake8]
-max-line-length = 120
+max-line-length = 90
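Putting the new pieces together, a sketch of the intended API (not part of the diff; the file name is hypothetical):

    from qurator.dinglehopper.ocr_files import extract

    gt = extract('gt.page2018.xml')    # ExtractedText, one segment per TextRegion
    print(gt.text)                     # normalized text, segments joined by '\n'
    print(gt.segment_id_for_pos(0))    # id of the segment covering position 0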