✅ Test word segmentation (Fixes #30)

2026-06-30 15:59:11 +02:00 · 2020-02-04 18:40:06 +01:00 · 2020-02-04 18:40:06 +01:00 · 82fe0333f1
commit 82fe0333f1
parent 9010250911
1 changed files with 38 additions and 4 deletions
--- a/test/test_recognize.py
+++ b/test/test_recognize.py
@ -2,6 +2,7 @@ import os
 import shutil
 import subprocess
 import urllib.request
 from lxml import etree
 import pytest
 from ocrd.resolver import Resolver
@ -11,6 +12,7 @@ from .base import assets
 # URL of the sample METS document, resolved through the shared test assets helper.
 METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
 # Hard-coded directory path; presumably where the workspace fixture puts its
 # working copy (the fixture body is not visible here -- TODO confirm).
 WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
 # Glob pattern under the current working directory; passed as the "checkpoint"
 # parameter to CalamariRecognize by the tests below.
 CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
@pytest.fixture
@ -53,12 +55,44 @@ def test_recognize(workspace):
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={
-            'checkpoint': os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
+            "checkpoint": CHECKPOINT,
        }
    ).process()
    workspace.save_mets()
-    page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
-    with open(page1, 'r', encoding='utf-8') as f:
+    with open(page1, "r", encoding="utf-8") as f:
-        assert 'verſchuldeten' in f.read()
+        assert "verſchuldeten" in f.read()
 def test_word_segmentation(workspace):
    """Check that recognition produces word-level results matching the line text.

    Runs CalamariRecognize on the "OCR-D-GT-SEG-LINE" file group, parses the
    first output page and verifies that the text line containing "December"
    holds at least two pc:Word elements whose texts, joined by single spaces,
    reproduce the text of the line itself.
    """
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={
            "checkpoint": CHECKPOINT,
        }
    ).process()
    workspace.save_mets()

    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
    tree = etree.parse(page1)
    NSMAP = {"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"}

    # The result should contain a TextLine that contains the text "December".
    # Check the match list before indexing: an empty result should fail as an
    # assertion with a clear message, not as an IndexError. Truth-testing the
    # element itself is avoided because lxml element truthiness reflects whether
    # the element has children, not whether it exists.
    lines = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)
    assert lines, "no TextLine containing 'December' found in the output page"
    line = lines[0]

    # The text line should (a) contain multiple words and (b) these words should
    # concatenate, separated by single spaces, to exactly the line text.
    words = line.xpath(".//pc:Word", namespaces=NSMAP)
    assert len(words) >= 2
    words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words)
    line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
    assert words_text == line_text
 # vim:tw=120: