diff --git a/test/test_recognize.py b/test/test_recognize.py
index b6b6980..11b34ad 100644
--- a/test/test_recognize.py
+++ b/test/test_recognize.py
@@ -2,6 +2,7 @@ import os
 import shutil
 import subprocess
 import urllib.request
+from lxml import etree
 
 import pytest
 from ocrd.resolver import Resolver
@@ -11,6 +12,7 @@ from .base import assets
 
 METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
 WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
+CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
 
 
 @pytest.fixture
@@ -53,12 +55,56 @@ def test_recognize(workspace):
         input_file_grp="OCR-D-GT-SEG-LINE",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
-            'checkpoint': os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
+            "checkpoint": CHECKPOINT,
         }
     ).process()
     workspace.save_mets()
 
-    page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
     assert os.path.exists(page1)
-    with open(page1, 'r', encoding='utf-8') as f:
-        assert 'verſchuldeten' in f.read()
+    with open(page1, "r", encoding="utf-8") as f:
+        assert "verſchuldeten" in f.read()
+
+
+def test_word_segmentation(workspace):
+    """Check that recognized lines are split into words matching the line text.
+
+    Runs CalamariRecognize on the OCR-D-GT-SEG-LINE file group, parses the
+    resulting PAGE file OCR-D-OCR-CALAMARI_0001.xml and verifies that the line
+    containing "December" holds at least two Word elements whose texts, joined
+    by single spaces, equal the text of the line itself.
+    """
+    CalamariRecognize(
+        workspace,
+        input_file_grp="OCR-D-GT-SEG-LINE",
+        output_file_grp="OCR-D-OCR-CALAMARI",
+        parameter={
+            "checkpoint": CHECKPOINT,
+        }
+    ).process()
+    workspace.save_mets()
+
+    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
+    assert os.path.exists(page1)
+    tree = etree.parse(page1)
+
+    nsmap = {"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"}
+
+    # The result should contain a TextLine that contains the text "December".
+    # Check the match list itself: indexing an empty result would raise an
+    # IndexError, and the truth value of an lxml element depends on whether it
+    # has children, not on whether it was found.
+    lines = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=nsmap)
+    assert lines, "no TextLine containing 'December' found"
+    line = lines[0]
+
+    # The textline should a. contain multiple words and b. these should
+    # concatenate fine to produce the same line text
+    words = line.xpath(".//pc:Word", namespaces=nsmap)
+    assert len(words) >= 2
+    words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=nsmap)[0].text for word in words)
+    line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=nsmap)[0].text
+    assert words_text == line_text
+
+
+# vim:tw=120: