From 079be203bd66e39a1b0b69ac6609418d6e9fbcb2 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:04:24 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20dinglehopper:=20Fix=20tests=20to?= =?UTF-8?q?=20deal=20with=20new=20normalization=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 5 +- .../dinglehopper/tests/test_integ_align.py | 8 +- .../test_integ_character_error_rate_ocr.py | 8 +- .../tests/test_integ_cli_valid_json.py | 5 +- .../tests/test_integ_edit_distance_ocr.py | 4 +- .../tests/test_integ_word_error_rate_ocr.py | 5 +- qurator/dinglehopper/tests/test_ocr_files.py | 82 +++++++++++++------ 7 files changed, 85 insertions(+), 32 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 180ecd3..e1267f7 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -15,7 +15,7 @@ import unicodedata @attr.s(frozen=True) class ExtractedText: - segments = attr.ib() + segments = attr.ib(converter=list) joiner = attr.ib(type=str) # TODO Types are not validated (attr does not do this yet) @@ -80,6 +80,7 @@ class ExtractedTextSegment: segment_text = None with suppress(AttributeError): segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = segment_text or '' segment_text = normalize_sbb(segment_text) return cls(segment_id, segment_text) @@ -157,7 +158,7 @@ def page_extract(tree): regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) # Filter empty region texts - regions = [r for r in regions if r.text is not None] + regions = (r for r in regions if r.text is not None) return ExtractedText(regions, '\n') # FIXME needs to handle normalization diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py index df1e230..b35974b 100644 --- a/qurator/dinglehopper/tests/test_integ_align.py +++ b/qurator/dinglehopper/tests/test_integ_align.py @@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_align_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. - # → 4 elements in the alignment should be different. + # → 2 elements in the alignment should be different, the ligature is + # (currently) not counted due to normalization. # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) result = list(align(gt, ocr)) - assert sum(left != right for left, right in result) == 4 + for left, right in result: + if left != right: + print(left, right) + assert sum(left != right for left, right in result) == 2 diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py index c27cd31..1c3bf52 100644 --- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py @@ -4,6 +4,7 @@ import os import pytest from lxml import etree as ET +from uniseg.graphemecluster import grapheme_clusters from .. 
import character_error_rate, page_text, alto_text @@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_character_error_rate_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + # The fi ligature does not count. gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n + + gt_len = len(list(grapheme_clusters(gt))) + expected_cer = 2/gt_len + + assert character_error_rate(gt, ocr) == expected_cer @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py index 35421bb..d71bc14 100644 --- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -1,4 +1,3 @@ -import os import json import pytest @@ -16,7 +15,11 @@ def test_cli_json(tmp_path): with open('ocr.txt', 'w') as ocrf: ocrf.write('AAAAB') + with open('gt.txt', 'r') as gtf: + print(gtf.read()) process('gt.txt', 'ocr.txt', 'report') + with open('report.json', 'r') as jsonf: + print(jsonf.read()) with open('report.json', 'r') as jsonf: j = json.load(jsonf) assert j['cer'] == pytest.approx(0.2) diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py index 2857d56..cbe12f8 100644 --- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_distance_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + # Due to normalization, we don't count the ligature. + # → 2 differences gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert distance(gt, ocr) == 4 + assert distance(gt, ocr) == 2 @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py index 1d2dead..f5c922b 100644 --- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_word_error_rate_between_page_files(): - # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words + # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. 
So we have 3 changed words, + # the ligature does not count → 2 errors gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line assert len(list(words(gt))) == gt_word_count ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert word_error_rate(gt, ocr) == 3/gt_word_count + assert word_error_rate(gt, ocr) == 2/gt_word_count @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py index dd9377a..3291152 100644 --- a/qurator/dinglehopper/tests/test_ocr_files.py +++ b/qurator/dinglehopper/tests/test_ocr_files.py @@ -6,7 +6,8 @@ import textwrap import pytest -from .. import alto_namespace, alto_text, page_namespace, page_text, text +from .util import working_directory +from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @@ -49,27 +50,51 @@ def test_page_namespace(): def test_page_test(): tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) result = page_text(tree) + + # We are currently normalizing on extraction, so the text is normalized. + # + # expected = textwrap.dedent("""\ + # ber die vielen Sorgen wegen deelben vergaß + # Hartkopf, der Frau Amtmnnin das ver⸗ + # ſproene zu berliefern. — Ein Erpreer + # wurde an ihn abgeſit, um ihn ums Him⸗ + # melswien zu ſagen, daß er das Verſproene + # glei den Augenbli berbringen mte, die + # Frau Amtmnnin htte  auf ihn verlaen, + # und nun wßte e nit, was e anfangen + # ſote. Den Augenbli ſote er kommen, + # ſon vergieng e in ihrer Ang. — Die + # Ge wren ſon angekommen, und es fehlte + # ihr do no an aem. — + # Hartkopf mußte  er bennen, und + # endli na langem Nadenken fiel es ihm er + # wieder ein. — Er langte den Zettel aus dem + # Accisbue heraus, und ſagte ſeiner Frau, daß + # e das, was da wre, herbeyſaffen mte. + # Jndeß mangelten do einige Generalia, die + # alſo wegfielen. — Hartkopf gieng ſelb + # mit und berbrate es. —""") expected = textwrap.dedent("""\ - ber die vielen Sorgen wegen deelben vergaß - Hartkopf, der Frau Amtmnnin das ver⸗ - ſproene zu berliefern. — Ein Erpreer - wurde an ihn abgeſit, um ihn ums Him⸗ - melswien zu ſagen, daß er das Verſproene - glei den Augenbli berbringen mte, die - Frau Amtmnnin htte  auf ihn verlaen, - und nun wßte e nit, was e anfangen - ſote. Den Augenbli ſote er kommen, - ſon vergieng e in ihrer Ang. — Die - Ge wren ſon angekommen, und es fehlte - ihr do no an aem. — - Hartkopf mußte  er bennen, und - endli na langem Nadenken fiel es ihm er - wieder ein. — Er langte den Zettel aus dem - Accisbue heraus, und ſagte ſeiner Frau, daß - e das, was da wre, herbeyſaffen mte. - Jndeß mangelten do einige Generalia, die - alſo wegfielen. — Hartkopf gieng ſelb - mit und berbrate es. —""") + über die vielen Sorgen wegen deſſelben vergaß + Hartkopf, der Frau Amtmännin das ver- + ſprochene zu überliefern. – Ein Erpreſſer + wurde an ihn abgeſchickt, um ihn ums Him- + melswillen zu ſagen, daß er das Verſprochene + gleich den Augenblick überbringen möchte, die + Frau Amtmännin hätte ſich auf ihn verlaſſen, + und nun wüßte ſie nicht, was ſie anfangen + ſollte. Den Augenblick ſollte er kommen, + ſonſt vergieng ſie in ihrer Angſt. 
– Die + Gäſte wären ſchon angekommen, und es fehlte + ihr doch noch an allem. – + Hartkopf mußte ſich erſt beſinnen, und + endlich nach langem Nachdenken fiel es ihm erſt + wieder ein. – Er langte den Zettel aus dem + Accisbuche heraus, und ſagte ſeiner Frau, daß + ſie das, was da wäre, herbeyſchaffen möchte. + Jndeß mangelten doch einige Generalia, die + alſo wegfielen. – Hartkopf gieng ſelbſt + mit und überbrachte es. –""") assert result == expected @@ -92,7 +117,8 @@ def test_page_order(): tree = ET.parse(os.path.join(data_dir, 'order.page.xml')) result = page_text(tree) - assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL) + print(result) + assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL) def test_page_mixed_regions(): @@ -106,5 +132,15 @@ def test_page_mixed_regions(): def test_text(): assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) - assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) + assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt')) + + +def test_plain(tmp_path): + with working_directory(str(tmp_path)): + with open('ocr.txt', 'w') as ocrf: + ocrf.write('AAAAB') + + result = plain_text('ocr.txt') + expected = 'AAAAB' + assert result == expected
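
Why the expected counts drop (4 → 2 differing alignment elements, distance 4 → 2, 3 → 2 word errors): text is now normalized on extraction, so the fi ligature in the fake OCR no longer differs from the ground truth and only the two genuine character edits remain. Below is a minimal sketch of that effect, not the actual normalize_sbb implementation: it assumes the ligature resolves the way Unicode NFKC resolves it, the example strings are made-up stand-ins for the PAGE test data, and normalize_sbb's real equivalence rules may differ (it keeps the long s, for instance, as the new expected text above shows).

    import unicodedata
    from uniseg.graphemecluster import grapheme_clusters

    # Hypothetical stand-ins for one GT/OCR line from the PAGE test data.
    gt = 'fiel es ihm wieder ein'
    ocr = '\ufb01el es ihm wieder ein'  # identical, except for a fi ligature (U+FB01)

    def norm(s):
        # Assumption: the ligature is resolved as under NFKC; normalize_sbb
        # applies its own substitution table and is not plain NFKC.
        return unicodedata.normalize('NFKC', s)

    # After normalization the ligature difference disappears, so only the
    # two real character edits in the fake OCR file are counted.
    assert norm(gt) == norm(ocr)

    # The CER denominator in the updated test is the number of grapheme
    # clusters of the ground truth, i.e. expected_cer = 2 / gt_len.
    gt_len = len(list(grapheme_clusters(norm(gt))))
    print(gt_len)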