refactor cli tests

This commit is contained in:
kba 2025-10-29 16:20:30 +01:00
parent ef999c8f0a
commit b6f82c72b9
15 changed files with 453 additions and 592 deletions

View file

@ -0,0 +1,36 @@
from typing import List
from click import Command
import pytest
import logging
from click.testing import CliRunner, Result
@pytest.fixture
def run_eynollah_ok_and_check_logs(
    pytestconfig,
    caplog,
    model_dir,
    eynollah_log_filter,
):
    """
    Provide a helper that runs an eynollah Click command and checks its logs.

    The returned callable creates a Click runner for `cli`, injects the model
    path (`-m`) and, when pytest runs verbosely, the DEBUG log level (`-l`)
    into `args`, invokes the command, asserts a zero exit code and finally
    checks that every fragment in `expected_logs` is the prefix of at least
    one captured log message. The Click `Result` is returned to the caller.
    """
    def _run_click_ok_logs(cli: Command, args: List[str], expected_logs: List[str]) -> Result:
        # Concatenation builds a new list, so the caller's `args` is not mutated.
        args = ['-m', model_dir] + args
        if pytestconfig.getoption('verbose') > 0:
            args.extend(['-l', 'DEBUG'])
        caplog.set_level(logging.INFO)
        runner = CliRunner()
        with caplog.filtering(eynollah_log_filter):
            result = runner.invoke(cli, args, catch_exceptions=False)
        assert result.exit_code == 0, result.stdout
        if expected_logs:
            logmsgs = [logrec.message for logrec in caplog.records]
            # Check each fragment on its own: a single flat any() over all
            # (fragment, message) pairs would pass as soon as one fragment matched.
            # str() keeps path-like fragments from raising in str.startswith.
            missing = [
                str(needle)
                for needle in expected_logs
                if not any(logmsg.startswith(str(needle)) for logmsg in logmsgs)
            ]
            assert not missing, f'{missing} not in {logmsgs}'
        return result
    return _run_click_ok_logs

View file

@ -0,0 +1,58 @@
import pytest
from PIL import Image
from eynollah.cli import (
binarization as binarization_cli,
)
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["--no-patches"],
    ],
    ids=str,
)
def test_run_eynollah_binarization_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    tests_dir,
    options,
):
    """Binarize one image file and verify the output keeps the input's size."""
    source_image = tests_dir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    target_image = tmp_path / 'kant_aufklaerung_1784_0020.png'
    cli_args = ['-i', str(source_image), '-o', str(target_image)] + options
    run_eynollah_ok_and_check_logs(binarization_cli, cli_args, ['Predicting'])
    assert target_image.exists()
    # Read the dimensions one image at a time, input first, then output.
    sizes = []
    for image_path in (source_image, target_image):
        with Image.open(image_path) as img:
            sizes.append(img.size)
    original_size, binarized_size = sizes
    assert original_size == binarized_size
def test_run_eynollah_binarization_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    image_resources,
):
    """Binarize a whole input directory and expect two entries in the output directory."""
    cli_args = ['-di', str(resources_dir), '-o', str(tmp_path)]
    # One 'Predicting <name>' fragment for each of the first two image resources.
    wanted_logs = [f'Predicting {image.name}' for image in (image_resources[0], image_resources[1])]
    run_eynollah_ok_and_check_logs(binarization_cli, cli_args, wanted_logs)
    produced = list(tmp_path.iterdir())
    assert len(produced) == 2

View file

@ -0,0 +1,57 @@
import pytest
from PIL import Image
from eynollah.cli import (
enhancement as enhancement_cli,
)
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["-sos"],
    ],
    ids=str,
)
def test_run_eynollah_enhancement_filename(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
    options,
):
    """
    Enhance one image file and compare input and output dimensions.

    The sizes are expected to be equal exactly when `-sos` is among the options.
    """
    source_image = resources_dir / 'kant_aufklaerung_1784_0020.tif'
    target_image = tmp_path / 'kant_aufklaerung_1784_0020.png'
    # The CLI receives the output *directory*; the file name is derived by the tool.
    cli_args = ['-i', str(source_image), '-o', str(target_image.parent)] + options
    run_eynollah_ok_and_check_logs(enhancement_cli, cli_args, ['Image was enhanced'])
    sizes = []
    for image_path in (source_image, target_image):
        with Image.open(image_path) as img:
            sizes.append(img.size)
    original_size, enhanced_size = sizes
    sizes_match = original_size == enhanced_size
    keeps_scale = "-sos" in options
    assert sizes_match == keeps_scale
def test_run_eynollah_enhancement_directory(
    tmp_path,
    resources_dir,
    image_resources,
    run_eynollah_ok_and_check_logs,
):
    """Enhance a whole input directory and expect two entries in the output directory."""
    cli_args = ['-di', str(resources_dir), '-o', str(tmp_path)]
    # One 'Image <resource> was enhanced' fragment for each of the first two resources.
    wanted_logs = [f'Image {image} was enhanced' for image in (image_resources[0], image_resources[1])]
    run_eynollah_ok_and_check_logs(enhancement_cli, cli_args, wanted_logs)
    produced = list(tmp_path.iterdir())
    assert len(produced) == 2

View file

@ -0,0 +1,109 @@
import pytest
from eynollah.cli import (
layout as layout_cli,
)
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        #["--allow_scaling", "--curved-line"],
        ["--allow_scaling", "--curved-line", "--full-layout"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
         "--textline_light", "--light_version"],
        # -ep ...
        # -eoi ...
        # FIXME: find out whether OCR extra was installed, otherwise skip these
        ["--do_ocr"],
        ["--do_ocr", "--light_version", "--textline_light"],
        ["--do_ocr", "--transformer_ocr"],
        #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
        ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
        # --skip_layout_and_reading_order
    ], ids=str)
def test_run_eynollah_layout_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """
    Run layout analysis on a single image with various option combinations.

    Checks that the PAGE-XML output exists, that a log message starts with the
    input path, and that the result has at least two text regions, at least
    two separator regions and exactly 31 text lines.
    """
    infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    run_eynollah_ok_and_check_logs(
        layout_cli,
        [
            '-i', str(infile),
            # the CLI takes the output directory, not the output file
            '-o', str(outfile.parent),
        ] + options,
        [
            str(infile)
        ]
    )
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    lines = tree.xpath("//page:TextLine", namespaces=NS)
    assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line
@pytest.mark.parametrize(
    "options",
    [
        ["--tables"],
        ["--tables", "--full-layout"],
        ["--tables", "--full-layout", "--textline_light", "--light_version"],
    ], ids=str)
def test_run_eynollah_layout_filename2(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
    options,
):
    """
    Run layout analysis with table detection on a single image.

    Checks that the PAGE-XML output exists, that a log message starts with the
    input path, and that text, table and separator regions as well as text
    lines were detected in at least the minimal expected numbers.
    """
    infile = resources_dir / 'euler_rechenkunst01_1738_0025.tif'
    outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
    run_eynollah_ok_and_check_logs(
        layout_cli,
        [
            '-i', str(infile),
            # the CLI takes the output directory, not the output file
            '-o', str(outfile.parent),
        ] + options,
        [
            # must be a str: log messages are matched with str.startswith,
            # which raises TypeError for path objects
            str(infile)
        ]
    )
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    regions = tree.xpath("//page:TableRegion", namespaces=NS)
    # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
    assert len(regions) >= 1, "result is inaccurate"
    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    lines = tree.xpath("//page:TextLine", namespaces=NS)
    assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line
def test_run_eynollah_layout_directory(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """Run layout analysis on a whole input directory and expect two entries in the output directory."""
    cli_args = ['-di', str(resources_dir), '-o', str(tmp_path)]
    wanted_logs = ['Job done in', 'All jobs done in']
    run_eynollah_ok_and_check_logs(layout_cli, cli_args, wanted_logs)
    produced = list(tmp_path.iterdir())
    assert len(produced) == 2

View file

@ -0,0 +1,53 @@
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
from eynollah.cli import (
machine_based_reading_order as mbreorder_cli,
)
def test_run_eynollah_mbreorder_filename(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """Run machine-based reading order on one PAGE-XML file and check the resulting region order."""
    source_xml = resources_dir / 'kant_aufklaerung_1784_0020.xml'
    target_xml = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    cli_args = ['-i', str(source_xml), '-o', str(target_xml.parent)]
    # FIXME: mbreorder has no logging!
    no_expected_logs = []
    run_eynollah_ok_and_check_logs(mbreorder_cli, cli_args, no_expected_logs)
    assert target_xml.exists()
    # The exact expected order is asserted; looser checks (at least two refs,
    # order differing from the input file) are intentionally not used here.
    result_tree = page_from_file(str(target_xml)).etree
    region_refs = result_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    assert region_refs == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
def test_run_eynollah_mbreorder_directory(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """
    Run machine-based reading order on a whole input directory.

    Only the number of entries created in the output directory is checked.
    """
    outdir = tmp_path
    run_eynollah_ok_and_check_logs(
        mbreorder_cli,
        [
            '-di', str(resources_dir),
            '-o', str(outdir),
        ],
        [
            # FIXME: mbreorder has no logging!
        ]
    )
    assert len(list(outdir.iterdir())) == 2

View file

@ -0,0 +1,67 @@
import pytest
from eynollah.cli import (
ocr as ocr_cli,
)
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["-doit"],  # the render output directory is inserted inside the test
        ["-trocr"],
    ], ids=str)
def test_run_eynollah_ocr_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """
    Run OCR on a single image, using the PAGE-XML next to it as layout input.

    Checks that the output PAGE-XML exists (plus the rendered image when
    `-doit` is given) and that a plausible amount of text was recognized.
    """
    infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    outrenderfile = tmp_path / 'render' / 'kant_aufklaerung_1784_0020.png'
    outrenderfile.parent.mkdir()
    # Work on a copy: the parametrized list is a shared object owned by the
    # decorator, and inserting into it in place would leak into later uses.
    options = list(options)
    if "-doit" in options:
        # `-doit` needs the render directory as its value, which depends on tmp_path
        options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
    run_eynollah_ok_and_check_logs(
        ocr_cli,
        [
            '-i', str(infile),
            '-dx', str(infile.parent),
            '-o', str(outfile.parent),
        ] + options,
        [
            # FIXME: ocr has no logging!
        ]
    )
    assert outfile.exists()
    if "-doit" in options:
        assert outrenderfile.exists()
    out_tree = page_from_file(str(outfile)).etree
    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
def test_run_eynollah_ocr_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
):
    """Run OCR on a whole input directory and expect two entries in the output directory."""
    # The same directory is passed as image input (-di) and as XML input (-dx).
    source_dir = str(resources_dir)
    cli_args = ['-di', source_dir, '-dx', source_dir, '-o', str(tmp_path)]
    # FIXME: ocr has no logging!
    no_expected_logs = []
    run_eynollah_ok_and_check_logs(ocr_cli, cli_args, no_expected_logs)
    produced = list(tmp_path.iterdir())
    assert len(produced) == 2

View file

@ -0,0 +1,10 @@
import pytest
from PIL import Image
from eynollah.cli import (
layout as layout_cli,
binarization as binarization_cli,
enhancement as enhancement_cli,
)
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS