✅ Test word segmentation (Fixes #30)

2026-07-21 18:19:10 +02:00 · 2020-02-04 18:40:06 +01:00 · 2020-02-04 18:40:06 +01:00 · 82fe0333f1
commit 82fe0333f1
parent 9010250911
1 changed files with 38 additions and 4 deletions
--- a/test/test_recognize.py
+++ b/test/test_recognize.py
@ -2,6 +2,7 @@ import os
 import shutil
 import subprocess
 import urllib.request
+from lxml import etree

 import pytest
 from ocrd.resolver import Resolver
@ -11,6 +12,7 @@ from .base import assets

 METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
 WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
+CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')


@pytest.fixture
@ -53,12 +55,44 @@ def test_recognize(workspace):
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={
-            'checkpoint': os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
+            "checkpoint": CHECKPOINT,
        }
    ).process()
    workspace.save_mets()

-    page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
-    with open(page1, 'r', encoding='utf-8') as f:
-        assert 'verſchuldeten' in f.read()
+    with open(page1, "r", encoding="utf-8") as f:
+        assert "verſchuldeten" in f.read()
+
+
+def test_word_segmentation(workspace):
+    """Check that recognized text lines are segmented into words.
+
+    Runs CalamariRecognize on the "OCR-D-GT-SEG-LINE" file group, parses the
+    first output PAGE file and verifies that the TextLine containing
+    "December" has at least two Word children whose texts, joined by single
+    spaces, equal the text of the line itself.
+    """
+    CalamariRecognize(
+        workspace,
+        input_file_grp="OCR-D-GT-SEG-LINE",
+        output_file_grp="OCR-D-OCR-CALAMARI",
+        parameter={
+            "checkpoint": CHECKPOINT,
+        }
+    ).process()
+    workspace.save_mets()
+
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
+    assert os.path.exists(page1)
+    tree = etree.parse(page1)
+
+    NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
+
+    # The result should contain a TextLine that contains the text "December".
+    # Assert on the list of matches before indexing it: indexing an empty result would raise IndexError instead of
+    # failing the assertion, and the truth value of an lxml element reflects whether it has children, not whether
+    # a match was found.
+    lines = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)
+    assert lines, "no TextLine containing 'December' found"
+    line = lines[0]
+
+    # The textline should a. contain multiple words and b. these should concatenate fine to produce the same line text
+    words = line.xpath(".//pc:Word", namespaces=NSMAP)
+    assert len(words) >= 2
+    words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words)
+    line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
+    assert words_text == line_text
+
+
+# vim:tw=120: