wip

2026-06-16 09:59:13 +02:00 · 2025-10-23 11:58:23 +02:00 · 2025-10-23 11:58:23 +02:00 · ec1fd93dad
commit ec1fd93dad
parent 874cfc247f
6 changed files with 349 additions and 144 deletions
--- a/.github/workflows/test-eynollah.yml
+++ b/.github/workflows/test-eynollah.yml
@@ -31,7 +31,7 @@ jobs:
        src: "./src"
    - name: Try to restore models_eynollah
-			uses: actions/cache/restore@v4
+      uses: actions/cache/restore@v4
      id: all_model_cache
      with:
        path: models_eynollah
@@ -40,8 +40,8 @@ jobs:
    - name: Download models
      if: steps.all_model_cache.outputs.cache-hit != 'true'
      run: |
-				make models
+        make models
-				ls -la models_eynollah
+        ls -la models_eynollah
    - uses: actions/cache/save@v4
      if: steps.all_model_cache.outputs.cache-hit != 'true'
--- a/Makefile
+++ b/Makefile
@@ -14,17 +14,9 @@ WGET = wget -O
 #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
 #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz
 #SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1
-SEG_MODEL := https://zenodo.org/records/17295988/files/models_layout_v0_6_0.tar.gz?download=1
+# Single archive bundling all Eynollah models. The ZIP file name and the
+# directory target below are both derived from this URL.
+EYNOLLAH_MODELS_URL := https://zenodo.org/records/17295988/files/models_all_v0_7_0.zip
-SEG_MODELFILE = $(notdir $(patsubst %?download=1,%,$(SEG_MODEL)))
+EYNOLLAH_MODELS_ZIP = $(notdir $(EYNOLLAH_MODELS_URL))
-SEG_MODELNAME = $(SEG_MODELFILE:%.tar.gz=%)
+# NOTE(review): assumes the archive unpacks into a directory named after the
+# ZIP; the CI workflow caches `models_eynollah` instead -- confirm which name
+# the archive really contains.
+EYNOLLAH_MODELS_DIR = $(EYNOLLAH_MODELS_ZIP:%.zip=%)
 BIN_MODEL := https://zenodo.org/records/17295988/files/models_binarization_v0_6_0.tar.gz?download=1
 BIN_MODELFILE = $(notdir $(BIN_MODEL))
 BIN_MODELNAME := default-2021-03-09
 OCR_MODEL := https://zenodo.org/records/17295988/files/models_ocr_v0_6_0.tar.gz?download=1
 OCR_MODELFILE = $(notdir $(patsubst %?download=1,%,$(OCR_MODEL)))
 OCR_MODELNAME = $(OCR_MODELFILE:%.tar.gz=%)
 PYTEST_ARGS ?= -vv --isolate
@@ -49,33 +41,23 @@ help:
 	@echo "    EXTRAS       comma-separated list of features (like 'OCR,plotting') for 'install' [$(EXTRAS)]"
 	@echo "    DOCKER_TAG   Docker image tag for 'docker' [$(DOCKER_TAG)]"
 	@echo "    PYTEST_ARGS  pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. [$(PYTEST_ARGS)]"
-	@echo "    SEG_MODEL    URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]"
+	@echo "    EYNOLLAH_MODELS_URL  URL of archive of all models [$(EYNOLLAH_MODELS_URL)]"
 	@echo "    BIN_MODEL    URL of 'models' archive to download for binarization 'test' [$(BIN_MODEL)]"
 	@echo "    OCR_MODEL    URL of 'models' archive to download for binarization 'test' [$(OCR_MODEL)]"
 	@echo ""
 # END-EVAL
 # Download and extract models to $(PWD)/models_layout_v0_6_0
-models: $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)
+models: $(EYNOLLAH_MODELS_DIR)
 # do not download these files if we already have the directories
-.INTERMEDIATE: $(BIN_MODELFILE) $(SEG_MODELFILE) $(OCR_MODELFILE)
+.INTERMEDIATE: $(EYNOLLAH_MODELS_ZIP)
-$(BIN_MODELFILE):
+$(EYNOLLAH_MODELS_ZIP):
-	$(WGET) $@ $(BIN_MODEL)
+	$(WGET) $@ $(EYNOLLAH_MODELS_URL)
 $(SEG_MODELFILE):
 	$(WGET) $@ $(SEG_MODEL)
 $(OCR_MODELFILE):
 	$(WGET) $@ $(OCR_MODEL)
-$(BIN_MODELNAME): $(BIN_MODELFILE)
+$(EYNOLLAH_MODELS_DIR): $(EYNOLLAH_MODELS_ZIP)
-	tar zxf $<
+	unzip $<
 $(SEG_MODELNAME): $(SEG_MODELFILE)
 	tar zxf $<
 $(OCR_MODELNAME): $(OCR_MODELFILE)
 	tar zxf $<
 build:
 	$(PIP) install build
@@ -89,13 +71,8 @@ install:
 install-dev:
 	$(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)])
-ifeq (OCR,$(findstring OCR, $(EXTRAS)))
-deps-test: $(OCR_MODELNAME)
-endif
-deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME)
+# Tests need the unpacked models, not just the ZIP: the ZIP is declared
+# .INTERMEDIATE, so depending on it alone would download and then discard it.
+deps-test: $(EYNOLLAH_MODELS_DIR)
 	$(PIP) install -r requirements-test.txt
-ifeq (OCR,$(findstring OCR, $(EXTRAS)))
-	ln -rs $(OCR_MODELNAME)/* $(SEG_MODELNAME)/
-endif
 smoke-test: TMPDIR != mktemp -d
--- a/src/eynollah/cli_models.py
+++ b/src/eynollah/cli_models.py
@@ -85,9 +85,9 @@ def package(
            copies.add((src, dist_dir))
            mkdirs.add(dist_dir)
    for dir in mkdirs:
-        print(f"mkdir -p {dir}")
+        print(f"mkdir -vp {dir}")
    for (src, dst) in copies:
-        print(f"cp -r {src} {dst}")
+        print(f"cp -vr {src} {dst}")
    for dir in mkdirs:
        zip_path = Path(f'../{dir.parent.name}.zip')
-        print(f"(cd {dir}/..; zip -r {zip_path} models_eynollah)")
+        print(f"(cd {dir}/..; zip -vr {zip_path} models_eynollah)")
--- a/src/eynollah/model_zoo/default_specs.py
+++ b/src/eynollah/model_zoo/default_specs.py
@@ -14,7 +14,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
        category="enhancement",
        variant='',
        filename="models_eynollah/eynollah-enhancement_20210425",
-        dists=['enhancement', 'layout'],
+        dists=['enhancement', 'layout', 'ci'],
        dist_url=dist_url("enhancement"),
        type=KerasModel,
    ),
@@ -23,7 +23,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
        category="binarization",
        variant='',
        filename="models_eynollah/eynollah-binarization-hybrid_20230504",
-        dists=['layout', 'binarization'],
+        dists=['layout', 'binarization', ],
        dist_url=dist_url("binarization"),
        type=KerasModel,
    ),
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -16,114 +16,12 @@ from ocrd_models.constants import NAMESPACES as NS
 testdir = Path(__file__).parent.resolve()
 MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_6_0').resolve()))
 MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_6_0').resolve()))
 MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve()))
 def only_eynollah(logrec):
 		return logrec.name.startswith('eynollah')
@pytest.mark.parametrize(
    "options",
    [
            [], # defaults
            #["--allow_scaling", "--curved-line"],
            ["--allow_scaling", "--curved-line", "--full-layout"],
            ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
            ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
             "--textline_light", "--light_version"],
            # -ep ...
            # -eoi ...
            # FIXME: find out whether OCR extra was installed, otherwise skip these
            ["--do_ocr"],
            ["--do_ocr", "--light_version", "--textline_light"],
            ["--do_ocr", "--transformer_ocr"],
            #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
            ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
            # --skip_layout_and_reading_order
    ], ids=str)
 def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options):
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    args = [
        '-m', MODELS_LAYOUT,
        '-i', str(infile),
        '-o', str(outfile.parent),
    ]
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    with caplog.filtering(only_eynollah):
        result = runner.invoke(layout_cli, args + options, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    assert str(infile) in logmsgs
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    lines = tree.xpath("//page:TextLine", namespaces=NS)
    assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line
@pytest.mark.parametrize(
    "options",
    [
            ["--tables"],
            ["--tables", "--full-layout"],
            ["--tables", "--full-layout", "--textline_light", "--light_version"],
    ], ids=str)
 def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options):
    infile = testdir.joinpath('resources/euler_rechenkunst01_1738_0025.tif')
    outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
    args = [
        '-m', MODELS_LAYOUT,
        '-i', str(infile),
        '-o', str(outfile.parent),
    ]
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    with caplog.filtering(only_eynollah):
        result = runner.invoke(layout_cli, args + options, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    assert str(infile) in logmsgs
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    regions = tree.xpath("//page:TableRegion", namespaces=NS)
    # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
    assert len(regions) >= 1, "result is inaccurate"
    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    lines = tree.xpath("//page:TextLine", namespaces=NS)
    assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line
 def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog):
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    args = [
        '-m', MODELS_LAYOUT,
        '-di', str(indir),
        '-o', str(outdir),
    ]
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    with caplog.filtering(only_eynollah):
        result = runner.invoke(layout_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2
    assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in'))
    assert len(list(outdir.iterdir())) == 2
@pytest.mark.parametrize(
    "options",
    [
--- a/tests/test_run_layout.py
+++ b/tests/test_run_layout.py
@@ -0,0 +1,330 @@
 from os import environ
 from pathlib import Path
 import pytest
 import logging
 from PIL import Image
 from eynollah.cli import (
    layout as layout_cli,
    binarization as binarization_cli,
    enhancement as enhancement_cli,
    machine_based_reading_order as mbreorder_cli,
    ocr as ocr_cli,
 )
 from click.testing import CliRunner
 from ocrd_modelfactory import page_from_file
 from ocrd_models.constants import NAMESPACES as NS
 # Directory containing this test module; model paths default to siblings of it.
 testdir = Path(__file__).parent.resolve()
 # Model directories, each overridable through an environment variable of the
 # same name. MODELS_OCR and MODELS_BIN are used by the OCR and binarization
 # tests below; their defaults mirror tests/test_run.py.
 # NOTE(review): with the consolidated model archive these defaults may need to
 # point at a single directory -- confirm.
 MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_6_0').resolve()))
 MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_6_0').resolve()))
 MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve()))
 # Log-record filter: keep only records whose logger name starts with 'eynollah'.
 def only_eynollah(logrec):
    return logrec.name.startswith('eynollah')
# Run the layout CLI on a single page image, once per option set, and check
# the regions and text lines found in the resulting XML.
@pytest.mark.parametrize(
    "options",
    [
            [], # defaults
            #["--allow_scaling", "--curved-line"],
            ["--allow_scaling", "--curved-line", "--full-layout"],
            ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
            ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
             "--textline_light", "--light_version"],
            # -ep ...
            # -eoi ...
            # FIXME: find out whether OCR extra was installed, otherwise skip these
            ["--do_ocr"],
            ["--do_ocr", "--light_version", "--textline_light"],
            ["--do_ocr", "--transformer_ocr"],
            #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
            ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
            # --skip_layout_and_reading_order
    ], ids=str)
 def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options):
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    args = [
        '-m', MODELS_LAYOUT,
        '-i', str(infile),
        '-o', str(outfile.parent),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(layout_cli, args + options, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    # the input path is expected to appear verbatim as a log message
    assert str(infile) in logmsgs
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    lines = tree.xpath("//page:TextLine", namespaces=NS)
    assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line
# Run the layout CLI with table detection on a second page image, once per
# option set, and check for text, table and separator regions in the output.
@pytest.mark.parametrize(
    "options",
    [
            ["--tables"],
            ["--tables", "--full-layout"],
            ["--tables", "--full-layout", "--textline_light", "--light_version"],
    ], ids=str)
 def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options):
    infile = testdir.joinpath('resources/euler_rechenkunst01_1738_0025.tif')
    outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
    args = [
        '-m', MODELS_LAYOUT,
        '-i', str(infile),
        '-o', str(outfile.parent),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(layout_cli, args + options, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    # the input path is expected to appear verbatim as a log message
    assert str(infile) in logmsgs
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    regions = tree.xpath("//page:TableRegion", namespaces=NS)
    # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
    assert len(regions) >= 1, "result is inaccurate"
    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    lines = tree.xpath("//page:TextLine", namespaces=NS)
    assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line
 # Run the layout CLI in directory mode (-di) over the test resources and
 # check the per-job log messages and the number of files written.
 def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog):
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    args = [
        '-m', MODELS_LAYOUT,
        '-di', str(indir),
        '-o', str(outdir),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(layout_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    # expects exactly two processed inputs -- presumably the two images in
    # tests/resources; verify if resources are added or removed
    assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2
    assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in'))
    assert len(list(outdir.iterdir())) == 2
# Run the binarization CLI on a single image, with default options and with
# --no-patches, and check that the output image keeps the input's size.
@pytest.mark.parametrize(
    "options",
    [
            [], # defaults
            ["--no-patches"],
    ], ids=str)
 def test_run_eynollah_binarization_filename(tmp_path, pytestconfig, caplog, options):
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
    # NOTE(review): MODELS_BIN is not defined among the module-level constants
    # visible in this file -- confirm it is provided before this test runs
    args = [
        '-m', MODELS_BIN,
        '-i', str(infile),
        '-o', str(outfile),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(binarization_cli, args + options, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting'))
    assert outfile.exists()
    # binarization must not change the pixel dimensions
    with Image.open(infile) as original_img:
        original_size = original_img.size
    with Image.open(outfile) as binarized_img:
        binarized_size = binarized_img.size
    assert original_size == binarized_size
 # Run the binarization CLI in directory mode (-di) over the test resources
 # and check the number of 'Predicting' log messages and of files written.
 def test_run_eynollah_binarization_directory(tmp_path, pytestconfig, caplog):
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    # NOTE(review): MODELS_BIN is not defined among the module-level constants
    # visible in this file -- confirm it is provided before this test runs
    args = [
        '-m', MODELS_BIN,
        '-di', str(indir),
        '-o', str(outdir),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(binarization_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    # expects exactly two processed inputs and two output files
    assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2
    assert len(list(outdir.iterdir())) == 2
# Run the enhancement CLI on a single image, with default options and with
# -sos, and check whether the output size matches the input size.
@pytest.mark.parametrize(
    "options",
    [
            [], # defaults
            ["-sos"],
    ], ids=str)
 def test_run_eynollah_enhancement_filename(tmp_path, pytestconfig, caplog, options):
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
    args = [
        '-m', MODELS_LAYOUT,
        '-i', str(infile),
        '-o', str(outfile.parent),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs
    assert outfile.exists()
    with Image.open(infile) as original_img:
        original_size = original_img.size
    with Image.open(outfile) as enhanced_img:
        enhanced_size = enhanced_img.size
    # the size must be unchanged exactly when -sos was passed, and differ otherwise
    assert (original_size == enhanced_size) == ("-sos" in options)
 # Run the enhancement CLI in directory mode (-di) over the test resources and
 # check the number of 'Image was enhanced' log messages and of files written.
 def test_run_eynollah_enhancement_directory(tmp_path, pytestconfig, caplog):
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    args = [
        '-m', MODELS_LAYOUT,
        '-di', str(indir),
        '-o', str(outdir),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(enhancement_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    logmsgs = [logrec.message for logrec in caplog.records]
    # expects exactly two processed inputs and two output files
    assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2
    assert len(list(outdir.iterdir())) == 2
 # Run the machine-based reading-order CLI on a single XML input and check the
 # region order recorded in the output file.
 def test_run_eynollah_mbreorder_filename(tmp_path, pytestconfig, caplog):
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml')
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    args = [
        '-m', MODELS_LAYOUT,
        '-i', str(infile),
        '-o', str(outfile.parent),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(mbreorder_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # collected for the log assertion below, which is disabled for now
    logmsgs = [logrec.message for logrec in caplog.records]
    # FIXME: mbreorder has no logging!
    #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
    assert outfile.exists()
    #in_tree = page_from_file(str(infile)).etree
    #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    out_tree = page_from_file(str(outfile)).etree
    out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    #assert len(out_order) >= 2, "result is inaccurate"
    #assert in_order != out_order
    # the exact expected sequence of region references in the output
    assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
 # Run the machine-based reading-order CLI in directory mode (-di) over the
 # test resources and check the number of files written.
 def test_run_eynollah_mbreorder_directory(tmp_path, pytestconfig, caplog):
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    args = [
        '-m', MODELS_LAYOUT,
        '-di', str(indir),
        '-o', str(outdir),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(mbreorder_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # collected for the log assertion below, which is disabled for now
    logmsgs = [logrec.message for logrec in caplog.records]
    # FIXME: mbreorder has no logging!
    #assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2
    assert len(list(outdir.iterdir())) == 2
# Run the OCR CLI on a single image (with its XML taken from the same
# directory via -dx), once per option set, and check that text was produced.
# For "-doit" the render output directory is only known at run time, so it is
# spliced in after the flag inside the test.
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        ["-doit", #str(outrenderfile.parent)],
         ],
        ["-trocr"],
    ], ids=str)
 def test_run_eynollah_ocr_filename(tmp_path, pytestconfig, caplog, options):
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png')
    outrenderfile.parent.mkdir()
    # NOTE(review): MODELS_OCR is not defined among the module-level constants
    # visible in this file -- confirm it is provided before this test runs
    args = [
        '-m', MODELS_OCR,
        '-i', str(infile),
        '-dx', str(infile.parent),
        '-o', str(outfile.parent),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.DEBUG)
    runner = CliRunner()
    # Work on a copy: the list object comes from the parametrize() call above,
    # and inserting into it in place would leak the tmp_path-specific argument
    # into that shared parameter.
    options = list(options)
    if "-doit" in options:
        options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(ocr_cli, args + options, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # collected for the log assertion below, which is disabled for now
    logmsgs = [logrec.message for logrec in caplog.records]
    # FIXME: ocr has no logging!
    #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
    assert outfile.exists()
    if "-doit" in options:
        assert outrenderfile.exists()
    #in_tree = page_from_file(str(infile)).etree
    #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    out_tree = page_from_file(str(outfile)).etree
    # text of the last TextEquiv of every TextLine
    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
 # Run the OCR CLI in directory mode (-di for images, -dx for XML) over the
 # test resources and check the number of files written.
 def test_run_eynollah_ocr_directory(tmp_path, pytestconfig, caplog):
    indir = testdir.joinpath('resources')
    outdir = tmp_path
    # NOTE(review): MODELS_OCR is not defined among the module-level constants
    # visible in this file -- confirm it is provided before this test runs
    args = [
        '-m', MODELS_OCR,
        '-di', str(indir),
        '-dx', str(indir),
        '-o', str(outdir),
    ]
    # pass DEBUG log level to the CLI only when pytest itself runs verbosely
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.INFO)
    runner = CliRunner()
    # capture only records from loggers named 'eynollah...'
    with caplog.filtering(only_eynollah):
        result = runner.invoke(ocr_cli, args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # collected for the log assertion below, which is disabled for now
    logmsgs = [logrec.message for logrec in caplog.records]
    # FIXME: ocr has no logging!
    #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
    assert len(list(outdir.iterdir())) == 2