From 079be203bd66e39a1b0b69ac6609418d6e9fbcb2 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:04:24 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20dinglehopper:=20Fix=20tests=20to?= =?UTF-8?q?=20deal=20with=20new=20normalization=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 5 +- .../dinglehopper/tests/test_integ_align.py | 8 +- .../test_integ_character_error_rate_ocr.py | 8 +- .../tests/test_integ_cli_valid_json.py | 5 +- .../tests/test_integ_edit_distance_ocr.py | 4 +- .../tests/test_integ_word_error_rate_ocr.py | 5 +- qurator/dinglehopper/tests/test_ocr_files.py | 82 +++++++++++++------ 7 files changed, 85 insertions(+), 32 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 180ecd3..e1267f7 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -15,7 +15,7 @@ import unicodedata @attr.s(frozen=True) class ExtractedText: - segments = attr.ib() + segments = attr.ib(converter=list) joiner = attr.ib(type=str) # TODO Types are not validated (attr does not do this yet) @@ -80,6 +80,7 @@ class ExtractedTextSegment: segment_text = None with suppress(AttributeError): segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = segment_text or '' segment_text = normalize_sbb(segment_text) return cls(segment_id, segment_text) @@ -157,7 +158,7 @@ def page_extract(tree): regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) # Filter empty region texts - regions = [r for r in regions if r.text is not None] + regions = (r for r in regions if r.text is not None) return ExtractedText(regions, '\n') # FIXME needs to handle normalization diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py index df1e230..b35974b 100644 --- a/qurator/dinglehopper/tests/test_integ_align.py +++ b/qurator/dinglehopper/tests/test_integ_align.py @@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_align_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. - # → 4 elements in the alignment should be different. + # → 2 elements in the alignment should be different, the ligature is + # (currently) not counted due to normalization. # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) result = list(align(gt, ocr)) - assert sum(left != right for left, right in result) == 4 + for left, right in result: + if left != right: + print(left, right) + assert sum(left != right for left, right in result) == 2 diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py index c27cd31..1c3bf52 100644 --- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py @@ -4,6 +4,7 @@ import os import pytest from lxml import etree as ET +from uniseg.graphemecluster import grapheme_clusters from .. 
import character_error_rate, page_text, alto_text @@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_character_error_rate_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + # The fi ligature does not count. gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n + + gt_len = len(list(grapheme_clusters(gt))) + expected_cer = 2/gt_len + + assert character_error_rate(gt, ocr) == expected_cer @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py index 35421bb..d71bc14 100644 --- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -1,4 +1,3 @@ -import os import json import pytest @@ -16,7 +15,11 @@ def test_cli_json(tmp_path): with open('ocr.txt', 'w') as ocrf: ocrf.write('AAAAB') + with open('gt.txt', 'r') as gtf: + print(gtf.read()) process('gt.txt', 'ocr.txt', 'report') + with open('report.json', 'r') as jsonf: + print(jsonf.read()) with open('report.json', 'r') as jsonf: j = json.load(jsonf) assert j['cer'] == pytest.approx(0.2) diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py index 2857d56..cbe12f8 100644 --- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_distance_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + # Due to normalization, we don't count the ligature. + # → 2 differences gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert distance(gt, ocr) == 4 + assert distance(gt, ocr) == 2 @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py index 1d2dead..f5c922b 100644 --- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_word_error_rate_between_page_files(): - # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words + # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. 
So we have 3 changed words, + # the ligature does not count → 2 errors gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line assert len(list(words(gt))) == gt_word_count ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert word_error_rate(gt, ocr) == 3/gt_word_count + assert word_error_rate(gt, ocr) == 2/gt_word_count @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py index dd9377a..3291152 100644 --- a/qurator/dinglehopper/tests/test_ocr_files.py +++ b/qurator/dinglehopper/tests/test_ocr_files.py @@ -6,7 +6,8 @@ import textwrap import pytest -from .. import alto_namespace, alto_text, page_namespace, page_text, text +from .util import working_directory +from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @@ -49,27 +50,51 @@ def test_page_namespace(): def test_page_test(): tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) result = page_text(tree) + + # We are currently normalizing on extraction, so the text is normalized. + # + # expected = textwrap.dedent("""\ + # ber die vielen Sorgen wegen deelben vergaß + # Hartkopf, der Frau Amtmnnin das ver⸗ + # ſproene zu berliefern. — Ein Erpreer + # wurde an ihn abgeſit, um ihn ums Him⸗ + # melswien zu ſagen, daß er das Verſproene + # glei den Augenbli berbringen mte, die + # Frau Amtmnnin htte  auf ihn verlaen, + # und nun wßte e nit, was e anfangen + # ſote. Den Augenbli ſote er kommen, + # ſon vergieng e in ihrer Ang. — Die + # Ge wren ſon angekommen, und es fehlte + # ihr do no an aem. — + # Hartkopf mußte  er bennen, und + # endli na langem Nadenken fiel es ihm er + # wieder ein. — Er langte den Zettel aus dem + # Accisbue heraus, und ſagte ſeiner Frau, daß + # e das, was da wre, herbeyſaffen mte. + # Jndeß mangelten do einige Generalia, die + # alſo wegfielen. — Hartkopf gieng ſelb + # mit und berbrate es. —""") expected = textwrap.dedent("""\ - ber die vielen Sorgen wegen deelben vergaß - Hartkopf, der Frau Amtmnnin das ver⸗ - ſproene zu berliefern. — Ein Erpreer - wurde an ihn abgeſit, um ihn ums Him⸗ - melswien zu ſagen, daß er das Verſproene - glei den Augenbli berbringen mte, die - Frau Amtmnnin htte  auf ihn verlaen, - und nun wßte e nit, was e anfangen - ſote. Den Augenbli ſote er kommen, - ſon vergieng e in ihrer Ang. — Die - Ge wren ſon angekommen, und es fehlte - ihr do no an aem. — - Hartkopf mußte  er bennen, und - endli na langem Nadenken fiel es ihm er - wieder ein. — Er langte den Zettel aus dem - Accisbue heraus, und ſagte ſeiner Frau, daß - e das, was da wre, herbeyſaffen mte. - Jndeß mangelten do einige Generalia, die - alſo wegfielen. — Hartkopf gieng ſelb - mit und berbrate es. —""") + über die vielen Sorgen wegen deſſelben vergaß + Hartkopf, der Frau Amtmännin das ver- + ſprochene zu überliefern. – Ein Erpreſſer + wurde an ihn abgeſchickt, um ihn ums Him- + melswillen zu ſagen, daß er das Verſprochene + gleich den Augenblick überbringen möchte, die + Frau Amtmännin hätte ſich auf ihn verlaſſen, + und nun wüßte ſie nicht, was ſie anfangen + ſollte. Den Augenblick ſollte er kommen, + ſonſt vergieng ſie in ihrer Angſt. 
– Die + Gäſte wären ſchon angekommen, und es fehlte + ihr doch noch an allem. – + Hartkopf mußte ſich erſt beſinnen, und + endlich nach langem Nachdenken fiel es ihm erſt + wieder ein. – Er langte den Zettel aus dem + Accisbuche heraus, und ſagte ſeiner Frau, daß + ſie das, was da wäre, herbeyſchaffen möchte. + Jndeß mangelten doch einige Generalia, die + alſo wegfielen. – Hartkopf gieng ſelbſt + mit und überbrachte es. –""") assert result == expected @@ -92,7 +117,8 @@ def test_page_order(): tree = ET.parse(os.path.join(data_dir, 'order.page.xml')) result = page_text(tree) - assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL) + print(result) + assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL) def test_page_mixed_regions(): @@ -106,5 +132,15 @@ def test_page_mixed_regions(): def test_text(): assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) - assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) + assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt')) + + +def test_plain(tmp_path): + with working_directory(str(tmp_path)): + with open('ocr.txt', 'w') as ocrf: + ocrf.write('AAAAB') + + result = plain_text('ocr.txt') + expected = 'AAAAB' + assert result == expected
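
Why the expected counts drop (4 → 2 differing alignment elements, distance 4 → 2, 3 → 2 word errors): text is now normalized on extraction, so the fi ligature in the fake OCR no longer differs from the ground truth and only the two genuine character edits remain. Below is a minimal sketch of that effect, not the actual normalize_sbb implementation: it assumes the ligature resolves the way Unicode NFKC resolves it, the example strings are made-up stand-ins for the PAGE test data, and normalize_sbb's real equivalence rules may differ (it keeps the long s, for instance, as the new expected text above shows).

    import unicodedata
    from uniseg.graphemecluster import grapheme_clusters

    # Hypothetical stand-ins for one GT/OCR line from the PAGE test data.
    gt = 'fiel es ihm wieder ein'
    ocr = '\ufb01el es ihm wieder ein'  # identical, except for a fi ligature (U+FB01)

    def norm(s):
        # Assumption: the ligature is resolved as under NFKC; normalize_sbb
        # applies its own substitution table and is not plain NFKC.
        return unicodedata.normalize('NFKC', s)

    # After normalization the ligature difference disappears, so only the
    # two real character edits in the fake OCR file are counted.
    assert norm(gt) == norm(ocr)

    # The CER denominator in the updated test is the number of grapheme
    # clusters of the ground truth, i.e. expected_cer = 2 / gt_len.
    gt_len = len(list(grapheme_clusters(norm(gt))))
    print(gt_len)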