refactor cli tests

2026-03-02 05:11:57 +01:00 · 2025-10-29 16:20:30 +01:00 · 2025-10-29 16:20:30 +01:00 · b6f82c72b9
commit b6f82c72b9
parent ef999c8f0a
15 changed files with 453 additions and 592 deletions
--- a/tests/cli_tests/conftest.py
+++ b/tests/cli_tests/conftest.py
@ -0,0 +1,36 @@
+from typing import List
+from click import Command
+import pytest
+import logging
+
+from click.testing import CliRunner, Result
+
+@pytest.fixture
+def run_eynollah_ok_and_check_logs(
+    pytestconfig,
+    caplog,
+    model_dir,
+    eynollah_log_filter,
+):
+    """
+    Return a helper that runs a Click command and verifies its log output.
+
+    The returned callable creates a Click runner for `cli`, prepends the
+    model path (`-m model_dir`) to `args`, appends `-l DEBUG` when pytest
+    itself runs verbosely, invokes the command and asserts that it exited
+    with status 0. If `expected_logs` is non-empty, every entry must be the
+    prefix of at least one captured log message. The Click `Result` is
+    returned for further inspection.
+    """
+
+    def _run_click_ok_logs(cli: Command, args: List[str], expected_logs: List[str]) -> Result:
+        # concatenation creates a new list, so the caller's `args` stays untouched
+        args = ['-m', model_dir] + args
+        if pytestconfig.getoption('verbose') > 0:
+            args.extend(['-l', 'DEBUG'])
+        caplog.set_level(logging.INFO)
+        runner = CliRunner()
+        # `eynollah_log_filter` is only active on the capture handler while the command runs
+        with caplog.filtering(eynollah_log_filter):
+            result = runner.invoke(cli, args, catch_exceptions=False)
+        assert result.exit_code == 0, result.stdout
+        if expected_logs:
+            logmsgs = [logrec.message for logrec in caplog.records]
+            # every expected fragment has to show up, not merely one of them
+            missing = [
+                needle for needle in expected_logs
+                if not any(logmsg.startswith(needle) for logmsg in logmsgs)
+            ]
+            assert not missing, f'{missing} not in {logmsgs}'
+        return result
+
+    return _run_click_ok_logs
+
--- a/tests/cli_tests/test_binarization.py
+++ b/tests/cli_tests/test_binarization.py
@ -0,0 +1,58 @@
+import pytest
+from PIL import Image
+from eynollah.cli import (
+    binarization as binarization_cli,
+)
+from ocrd_modelfactory import page_from_file
+from ocrd_models.constants import NAMESPACES as NS
+
+@pytest.mark.parametrize(
+    "options",
+    [
+        [],  # defaults
+        ["--no-patches"],
+    ],
+    ids=str,
+)
+def test_run_eynollah_binarization_filename(
+    tmp_path,
+    run_eynollah_ok_and_check_logs,
+    tests_dir,
+    options,
+):
+    """Binarize a single image file and check that the output keeps the input size."""
+    source_image = tests_dir / 'resources/kant_aufklaerung_1784_0020.tif'
+    target_image = tmp_path / 'kant_aufklaerung_1784_0020.png'
+    cli_args = ['-i', str(source_image), '-o', str(target_image)]
+    run_eynollah_ok_and_check_logs(
+        binarization_cli,
+        cli_args + options,
+        ['Predicting'],
+    )
+    assert target_image.exists()
+    # read both sizes in the same order as before: input first, then output
+    sizes = []
+    for image_path in (source_image, target_image):
+        with Image.open(image_path) as img:
+            sizes.append(img.size)
+    assert sizes[0] == sizes[1]
+
+def test_run_eynollah_binarization_directory(
+    tmp_path,
+    run_eynollah_ok_and_check_logs,
+    resources_dir,
+    image_resources,
+):
+    """Binarize a directory of images and expect exactly two entries in the output directory."""
+    cli_args = ['-di', str(resources_dir), '-o', str(tmp_path)]
+    # one expected log fragment for each of the first two image resources
+    expected_logs = [f'Predicting {image_resources[idx].name}' for idx in (0, 1)]
+    run_eynollah_ok_and_check_logs(binarization_cli, cli_args, expected_logs)
+    produced = list(tmp_path.iterdir())
+    assert len(produced) == 2
--- a/tests/cli_tests/test_enhance.py
+++ b/tests/cli_tests/test_enhance.py
@ -0,0 +1,57 @@
+import pytest
+from PIL import Image
+from eynollah.cli import (
+    enhancement as enhancement_cli,
+)
+from ocrd_modelfactory import page_from_file
+from ocrd_models.constants import NAMESPACES as NS
+
+@pytest.mark.parametrize(
+    "options",
+    [
+        [],  # defaults
+        ["-sos"],
+    ],
+    ids=str,
+)
+def test_run_eynollah_enhancement_filename(
+    tmp_path,
+    resources_dir,
+    run_eynollah_ok_and_check_logs,
+    options,
+):
+    """Enhance a single image; the size must stay the same exactly when `-sos` is given."""
+    source_image = resources_dir.joinpath('kant_aufklaerung_1784_0020.tif')
+    target_image = tmp_path / 'kant_aufklaerung_1784_0020.png'
+    # the CLI receives the output *directory*; the file name is derived by the tool
+    cli_args = ['-i', str(source_image), '-o', str(target_image.parent)]
+    run_eynollah_ok_and_check_logs(
+        enhancement_cli,
+        cli_args + options,
+        ['Image was enhanced'],
+    )
+    with Image.open(source_image) as before:
+        size_before = before.size
+    with Image.open(target_image) as after:
+        size_after = after.size
+    size_kept = size_before == size_after
+    assert size_kept == ("-sos" in options)
+
+def test_run_eynollah_enhancement_directory(
+    tmp_path,
+    resources_dir,
+    image_resources,
+    run_eynollah_ok_and_check_logs,
+):
+    """Enhance a directory of images and expect exactly two entries in the output directory."""
+    cli_args = ['-di', str(resources_dir), '-o', str(tmp_path)]
+    # one expected log fragment for each of the first two image resources
+    expected_logs = [f'Image {image_resources[idx]} was enhanced' for idx in (0, 1)]
+    run_eynollah_ok_and_check_logs(enhancement_cli, cli_args, expected_logs)
+    produced = list(tmp_path.iterdir())
+    assert len(produced) == 2
--- a/tests/cli_tests/test_layout.py
+++ b/tests/cli_tests/test_layout.py
@ -0,0 +1,109 @@
+import pytest
+from eynollah.cli import (
+    layout as layout_cli,
+)
+from ocrd_modelfactory import page_from_file
+from ocrd_models.constants import NAMESPACES as NS
+
+@pytest.mark.parametrize(
+    "options",
+    [
+            [], # defaults
+            #["--allow_scaling", "--curved-line"],
+            ["--allow_scaling", "--curved-line", "--full-layout"],
+            ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
+            ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
+             "--textline_light", "--light_version"],
+            # -ep ...
+            # -eoi ...
+            # FIXME: find out whether OCR extra was installed, otherwise skip these
+            ["--do_ocr"],
+            ["--do_ocr", "--light_version", "--textline_light"],
+            ["--do_ocr", "--transformer_ocr"],
+            #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
+            ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
+            # --skip_layout_and_reading_order
+    ], ids=str)
+def test_run_eynollah_layout_filename(
+    tmp_path,
+    run_eynollah_ok_and_check_logs,
+    resources_dir,
+    options,
+):
+    """
+    Run layout analysis on a single image with several option sets and
+    check the regions and text lines found in the resulting XML file.
+    """
+    infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
+    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
+    run_eynollah_ok_and_check_logs(
+        layout_cli,
+        [
+            '-i', str(infile),
+            # the CLI receives the output directory, not the file itself
+            '-o', str(outfile.parent),
+        ] + options,
+        [
+            str(infile)
+        ]
+    )
+    assert outfile.exists()
+    tree = page_from_file(str(outfile)).etree
+    regions = tree.xpath("//page:TextRegion", namespaces=NS)
+    assert len(regions) >= 2, "result is inaccurate"
+    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
+    assert len(regions) >= 2, "result is inaccurate"
+    lines = tree.xpath("//page:TextLine", namespaces=NS)
+    assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line
+
+@pytest.mark.parametrize(
+    "options",
+    [
+            ["--tables"],
+            ["--tables", "--full-layout"],
+            ["--tables", "--full-layout", "--textline_light", "--light_version"],
+    ], ids=str)
+def test_run_eynollah_layout_filename2(
+    tmp_path,
+    resources_dir,
+    run_eynollah_ok_and_check_logs,
+    options,
+):
+    """
+    Run layout analysis with table detection on a single image and check
+    the regions and text lines found in the resulting XML file.
+    """
+    infile = resources_dir / 'euler_rechenkunst01_1738_0025.tif'
+    outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
+    run_eynollah_ok_and_check_logs(
+        layout_cli,
+        [
+            '-i', str(infile),
+            '-o', str(outfile.parent),
+        ] + options,
+        [
+            # expected log fragments are matched with `str.startswith`,
+            # which raises TypeError for a Path, so convert explicitly
+            str(infile)
+        ]
+    )
+    assert outfile.exists()
+    tree = page_from_file(str(outfile)).etree
+    regions = tree.xpath("//page:TextRegion", namespaces=NS)
+    assert len(regions) >= 2, "result is inaccurate"
+    regions = tree.xpath("//page:TableRegion", namespaces=NS)
+    # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
+    assert len(regions) >= 1, "result is inaccurate"
+    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
+    assert len(regions) >= 2, "result is inaccurate"
+    lines = tree.xpath("//page:TextLine", namespaces=NS)
+    assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line
+
+def test_run_eynollah_layout_directory(
+    tmp_path,
+    resources_dir,
+    run_eynollah_ok_and_check_logs,
+):
+    """Run layout analysis on a directory and expect exactly two entries in the output directory."""
+    cli_args = ['-di', str(resources_dir), '-o', str(tmp_path)]
+    expected_logs = ['Job done in', 'All jobs done in']
+    run_eynollah_ok_and_check_logs(layout_cli, cli_args, expected_logs)
+    produced = list(tmp_path.iterdir())
+    assert len(produced) == 2
--- a/tests/cli_tests/test_mbreorder.py
+++ b/tests/cli_tests/test_mbreorder.py
@ -0,0 +1,53 @@
+from ocrd_modelfactory import page_from_file
+from ocrd_models.constants import NAMESPACES as NS
+
+from eynollah.cli import (
+    machine_based_reading_order as mbreorder_cli,
+)
+
+
+def test_run_eynollah_mbreorder_filename(
+    tmp_path,
+    resources_dir,
+    run_eynollah_ok_and_check_logs,
+):
+    """Reorder a single XML file and check the region order written to the output."""
+    page_name = 'kant_aufklaerung_1784_0020.xml'
+    source_xml = resources_dir / page_name
+    target_xml = tmp_path / page_name
+    run_eynollah_ok_and_check_logs(
+        mbreorder_cli,
+        ['-i', str(source_xml), '-o', str(target_xml.parent)],
+        [],  # FIXME: mbreorder has no logging!
+    )
+    assert target_xml.exists()
+    result_tree = page_from_file(str(target_xml)).etree
+    region_refs = result_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
+    # NOTE: the input order is not compared; only the exact output order is pinned
+    assert region_refs == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
+
+def test_run_eynollah_mbreorder_directory(
+    tmp_path,
+    resources_dir,
+    run_eynollah_ok_and_check_logs,
+):
+    """Reorder a whole directory and expect exactly two entries in the output directory."""
+    outdir = tmp_path
+    run_eynollah_ok_and_check_logs(
+        mbreorder_cli,
+        [
+            '-di', str(resources_dir),
+            '-o', str(outdir),
+        ],
+        [
+            # FIXME: mbreorder has no logging!
+        ]
+    )
+    assert len(list(outdir.iterdir())) == 2
+
--- a/tests/cli_tests/test_ocr.py
+++ b/tests/cli_tests/test_ocr.py
@ -0,0 +1,67 @@
+import pytest
+from eynollah.cli import (
+    ocr as ocr_cli,
+)
+from ocrd_modelfactory import page_from_file
+from ocrd_models.constants import NAMESPACES as NS
+
+@pytest.mark.parametrize(
+    "options",
+    [
+        [], # defaults
+        ["-doit"], # the render output directory is filled in by the test itself
+        ["-trocr"],
+    ], ids=str)
+def test_run_eynollah_ocr_filename(
+    tmp_path,
+    run_eynollah_ok_and_check_logs,
+    resources_dir,
+    options,
+):
+    """
+    Run OCR on a single image (reading XML from the resources directory)
+    and check that the output XML contains a plausible amount of text.
+    With `-doit`, a rendered image must be written as well.
+    """
+    infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
+    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
+    outrenderfile = tmp_path / 'render' / 'kant_aufklaerung_1784_0020.png'
+    outrenderfile.parent.mkdir()
+    # Work on a copy: the list object belongs to the parametrize declaration,
+    # and mutating it would leak this run's tmp_path into any later run of
+    # the same test item.
+    options = list(options)
+    if "-doit" in options:
+        options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
+    run_eynollah_ok_and_check_logs(
+        ocr_cli,
+        [
+            '-i', str(infile),
+            '-dx', str(infile.parent),
+            '-o', str(outfile.parent),
+        ] + options,
+        [
+            # FIXME: ocr has no logging!
+        ]
+    )
+    assert outfile.exists()
+    if "-doit" in options:
+        assert outrenderfile.exists()
+    out_tree = page_from_file(str(outfile)).etree
+    # only the last TextEquiv of each line is inspected
+    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
+    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
+    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
+
+def test_run_eynollah_ocr_directory(
+    tmp_path,
+    run_eynollah_ok_and_check_logs,
+    resources_dir,
+):
+    """Run OCR on a whole directory and expect exactly two entries in the output directory."""
+    resources = str(resources_dir)
+    # images and XML input are both read from the resources directory
+    cli_args = ['-di', resources, '-dx', resources, '-o', str(tmp_path)]
+    run_eynollah_ok_and_check_logs(
+        ocr_cli,
+        cli_args,
+        [],  # FIXME: ocr has no logging!
+    )
+    produced = list(tmp_path.iterdir())
+    assert len(produced) == 2
+
--- a/tests/cli_tests/test_run.py
+++ b/tests/cli_tests/test_run.py
@ -0,0 +1,10 @@
+import pytest
+from PIL import Image
+from eynollah.cli import (
+    layout as layout_cli,
+    binarization as binarization_cli,
+    enhancement as enhancement_cli,
+)
+from ocrd_modelfactory import page_from_file
+from ocrd_models.constants import NAMESPACES as NS
+