test_run: add tests for ocr

2026-03-05 14:51:57 +01:00 · 2025-09-25 19:53:19 +02:00 · 2025-09-25 19:53:19 +02:00 · 5c7e1f21fb
commit 5c7e1f21fb
parent 2d14d57e4f
1 changed file with 73 additions and 7 deletions
--- a/tests/test_run.py
+++ b/tests/test_run.py
@ -7,6 +7,7 @@ from eynollah.cli import (
    binarization as binarization_cli,
    enhancement as enhancement_cli,
    machine_based_reading_order as mbreorder_cli,
+    ocr as ocr_cli,
 )
 from click.testing import CliRunner
 from ocrd_modelfactory import page_from_file
@ -76,7 +77,7 @@ def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog):
        return logrec.name == 'eynollah'
    runner = CliRunner()
    with caplog.filtering(only_eynollah):
-        result = runner.invoke(layout_cli, args)
+        result = runner.invoke(layout_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2
@ -104,7 +105,7 @@ def test_run_eynollah_binarization_filename(tmp_path, subtests, pytestconfig, ca
        with subtests.test(#msg="test CLI",
                           options=options):
            with caplog.filtering(only_eynollah):
-                result = runner.invoke(binarization_cli, args + options)
+                result = runner.invoke(binarization_cli, args + options, catch_exceptions=False)
            assert result.exit_code == 0, result.stdout
            logmsgs = [logrec.message for logrec in caplog.records]
            assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting'))
@ -130,7 +131,7 @@ def test_run_eynollah_binarization_directory(tmp_path, subtests, pytestconfig, c
        return logrec.name == 'SbbBinarizer'
    runner = CliRunner()
    with caplog.filtering(only_eynollah):
-        result = runner.invoke(binarization_cli, args)
+        result = runner.invoke(binarization_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2
@ -159,7 +160,7 @@ def test_run_eynollah_enhancement_filename(tmp_path, subtests, pytestconfig, cap
        with subtests.test(#msg="test CLI",
                           options=options):
            with caplog.filtering(only_eynollah):
-                result = runner.invoke(enhancement_cli, args + options)
+                result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False)
            assert result.exit_code == 0, result.stdout
            logmsgs = [logrec.message for logrec in caplog.records]
            assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs
@ -185,7 +186,7 @@ def test_run_eynollah_enhancement_directory(tmp_path, subtests, pytestconfig, ca
        return logrec.name == 'enhancement'
    runner = CliRunner()
    with caplog.filtering(only_eynollah):
-        result = runner.invoke(enhancement_cli, args)
+        result = runner.invoke(enhancement_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2
@ -206,7 +207,7 @@ def test_run_eynollah_mbreorder_filename(tmp_path, subtests, pytestconfig, caplo
        return logrec.name == 'mbreorder'
    runner = CliRunner()
    with caplog.filtering(only_eynollah):
-        result = runner.invoke(mbreorder_cli, args)
+        result = runner.invoke(mbreorder_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    # FIXME: mbreorder has no logging!
@ -235,9 +236,74 @@ def test_run_eynollah_mbreorder_directory(tmp_path, subtests, pytestconfig, capl
        return logrec.name == 'mbreorder'
    runner = CliRunner()
    with caplog.filtering(only_eynollah):
-        result = runner.invoke(mbreorder_cli, args)
+        result = runner.invoke(mbreorder_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    # FIXME: mbreorder has no logging!
    #assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2
    assert len(list(outdir.iterdir())) == 2
+
+def test_run_eynollah_ocr_filename(tmp_path, subtests, pytestconfig, caplog):  # invoke the ocr CLI on one image under several option sets and check the TextLine text in the output PAGE file
+    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
+    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')  # result file checked below; lives directly in the directory passed to -o
+    outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.xml')  # only asserted when -doit is among the options
+    outrenderfile.parent.mkdir()  # create the 'render' directory that is later passed to -doit
+    args = [
+        '-m', EYNOLLAH_MODELS,
+        '-i', str(infile),
+        '-dx', str(infile.parent),  # NOTE(review): presumably the directory holding the input PAGE-XML - confirm against the ocr CLI options
+        '-o', str(outfile.parent),
+        # every subtest writes to the same output location, so overwriting must be allowed
+        '--overwrite',
+    ]
+    if pytestconfig.getoption('verbose') > 0:  # raise the CLI log level when pytest itself runs verbosely
+        args.extend(['-l', 'DEBUG'])
+    caplog.set_level(logging.DEBUG)
+    def only_eynollah(logrec):  # caplog filter: keep only records emitted by the logger named 'eynollah'
+        return logrec.name == 'eynollah'
+    runner = CliRunner()
+    for options in [  # each entry is an extra option list appended to the common args, one subtest per entry
+            [], # defaults
+            ["-doit", str(outrenderfile.parent)],
+            ["-trocr"],
+    ]:
+        with subtests.test(#msg="test CLI",
+                           options=options):
+            with caplog.filtering(only_eynollah):
+                result = runner.invoke(ocr_cli, args + options, catch_exceptions=False)  # let exceptions from the CLI propagate into pytest instead of being caught by CliRunner
+            assert result.exit_code == 0, result.stdout  # show the captured stdout if the CLI exits non-zero
+            logmsgs = [logrec.message for logrec in caplog.records]  # currently unused: the log assertion below is commented out
+            # FIXME: ocr has no logging!
+            #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
+            assert outfile.exists()  # NOTE(review): outfile is not removed between subtests, so this looks meaningful only for the first one - confirm
+            if "-doit" in options:
+                assert outrenderfile.exists()
+            #in_tree = page_from_file(str(infile)).etree
+            #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
+            out_tree = page_from_file(str(outfile)).etree
+            out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)  # Unicode text of the last TextEquiv of every TextLine
+            assert len(out_texts) >= 2, ("result is inaccurate", out_texts)  # at least two text strings were produced
+            assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)  # and more than 100 characters in total
+
+def test_run_eynollah_ocr_directory(tmp_path, subtests, pytestconfig, caplog):  # invoke the ocr CLI once on the whole resources directory and count the entries written to the output directory
+    indir = testdir.joinpath('resources')
+    outdir = tmp_path
+    args = [
+        '-m', EYNOLLAH_MODELS,
+        '-di', str(indir),
+        '-dx', str(indir),  # NOTE(review): presumably the directory holding the input PAGE-XML - confirm against the ocr CLI options
+        '-o', str(outdir),
+    ]
+    if pytestconfig.getoption('verbose') > 0:  # raise the CLI log level when pytest itself runs verbosely
+        args.extend(['-l', 'DEBUG'])
+    caplog.set_level(logging.INFO)
+    def only_eynollah(logrec):  # caplog filter: keep only records emitted by the logger named 'eynollah'
+        return logrec.name == 'eynollah'
+    runner = CliRunner()
+    with caplog.filtering(only_eynollah):
+        result = runner.invoke(ocr_cli, args, catch_exceptions=False)  # let exceptions from the CLI propagate into pytest instead of being caught by CliRunner
+    assert result.exit_code == 0, result.stdout  # show the captured stdout if the CLI exits non-zero
+    logmsgs = [logrec.message for logrec in caplog.records]  # currently unused: the log assertion below is commented out
+    # FIXME: ocr has no logging!
+    #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
+    assert len(list(outdir.iterdir())) == 2  # exactly two entries are expected in the output directory after the run