eynollah/tests/test_run.py

316 lines
13 KiB
Python
Raw Normal View History

2021-02-04 15:21:14 +01:00
from os import environ
from pathlib import Path
import pytest
import logging
from PIL import Image
from eynollah.cli import (
layout as layout_cli,
binarization as binarization_cli,
enhancement as enhancement_cli,
machine_based_reading_order as mbreorder_cli,
2025-09-25 19:53:19 +02:00
ocr as ocr_cli,
)
from click.testing import CliRunner
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
2021-02-04 15:21:14 +01:00
testdir = Path(__file__).parent.resolve()
2025-09-25 21:47:15 +02:00
MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_5_0').resolve()))
MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_5_0').resolve()))
MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve()))
2021-02-04 15:21:14 +01:00
def test_run_eynollah_layout_filename(tmp_path, subtests, pytestconfig, caplog):
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
args = [
2025-09-25 21:47:15 +02:00
'-m', MODELS_LAYOUT,
'-i', str(infile),
'-o', str(outfile.parent),
# subtests write to same location
'--overwrite',
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'eynollah'
runner = CliRunner()
for options in [
[], # defaults
["--allow_scaling", "--curved-line"],
["--allow_scaling", "--curved-line", "--full-layout"],
["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
"--textline_light", "--light_version"],
# -ep ...
# -eoi ...
# --do_ocr
# --skip_layout_and_reading_order
]:
with subtests.test(#msg="test CLI",
options=options):
with caplog.filtering(only_eynollah):
2025-04-06 18:24:56 +00:00
result = runner.invoke(layout_cli, args + options, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert str(infile) in logmsgs
assert outfile.exists()
tree = page_from_file(str(outfile)).etree
regions = tree.xpath("//page:TextRegion", namespaces=NS)
assert len(regions) >= 2, "result is inaccurate"
regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
assert len(regions) >= 2, "result is inaccurate"
lines = tree.xpath("//page:TextLine", namespaces=NS)
assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line
def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog):
indir = testdir.joinpath('resources')
outdir = tmp_path
args = [
2025-09-25 21:47:15 +02:00
'-m', MODELS_LAYOUT,
'-di', str(indir),
'-o', str(outdir),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'eynollah'
runner = CliRunner()
with caplog.filtering(only_eynollah):
2025-09-25 19:53:19 +02:00
result = runner.invoke(layout_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2
assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in'))
assert len(list(outdir.iterdir())) == 2
def test_run_eynollah_binarization_filename(tmp_path, subtests, pytestconfig, caplog):
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
args = [
2025-09-25 21:47:15 +02:00
'-m', MODELS_BIN,
2025-07-23 16:44:17 +02:00
'-i', str(infile),
'-o', str(outfile),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'SbbBinarizer'
runner = CliRunner()
for options in [
[], # defaults
["--no-patches"],
]:
with subtests.test(#msg="test CLI",
options=options):
with caplog.filtering(only_eynollah):
2025-09-25 19:53:19 +02:00
result = runner.invoke(binarization_cli, args + options, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting'))
assert outfile.exists()
with Image.open(infile) as original_img:
original_size = original_img.size
with Image.open(outfile) as binarized_img:
binarized_size = binarized_img.size
assert original_size == binarized_size
def test_run_eynollah_binarization_directory(tmp_path, subtests, pytestconfig, caplog):
indir = testdir.joinpath('resources')
outdir = tmp_path
args = [
2025-09-25 21:47:15 +02:00
'-m', MODELS_BIN,
'-di', str(indir),
2025-07-23 16:44:17 +02:00
'-o', str(outdir),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'SbbBinarizer'
runner = CliRunner()
with caplog.filtering(only_eynollah):
2025-09-25 19:53:19 +02:00
result = runner.invoke(binarization_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2
assert len(list(outdir.iterdir())) == 2
def test_run_eynollah_enhancement_filename(tmp_path, subtests, pytestconfig, caplog):
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
args = [
2025-09-25 21:47:15 +02:00
'-m', MODELS_LAYOUT,
'-i', str(infile),
'-o', str(outfile.parent),
# subtests write to same location
'--overwrite',
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'enhancement'
runner = CliRunner()
for options in [
[], # defaults
["-sos"],
]:
with subtests.test(#msg="test CLI",
options=options):
with caplog.filtering(only_eynollah):
2025-09-25 19:53:19 +02:00
result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs
assert outfile.exists()
with Image.open(infile) as original_img:
original_size = original_img.size
with Image.open(outfile) as enhanced_img:
enhanced_size = enhanced_img.size
assert (original_size == enhanced_size) == ("-sos" in options)
def test_run_eynollah_enhancement_directory(tmp_path, subtests, pytestconfig, caplog):
indir = testdir.joinpath('resources')
outdir = tmp_path
args = [
2025-09-25 21:47:15 +02:00
'-m', MODELS_LAYOUT,
'-di', str(indir),
'-o', str(outdir),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'enhancement'
runner = CliRunner()
with caplog.filtering(only_eynollah):
2025-09-25 19:53:19 +02:00
result = runner.invoke(enhancement_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2
assert len(list(outdir.iterdir())) == 2
def test_run_eynollah_mbreorder_filename(tmp_path, subtests, pytestconfig, caplog):
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml')
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
args = [
2025-09-25 21:47:15 +02:00
'-m', MODELS_LAYOUT,
'-i', str(infile),
'-o', str(outfile.parent),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'mbreorder'
runner = CliRunner()
with caplog.filtering(only_eynollah):
2025-09-25 19:53:19 +02:00
result = runner.invoke(mbreorder_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
# FIXME: mbreorder has no logging!
#assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
assert outfile.exists()
#in_tree = page_from_file(str(infile)).etree
#in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
out_tree = page_from_file(str(outfile)).etree
out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
#assert len(out_order) >= 2, "result is inaccurate"
#assert in_order != out_order
assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
def test_run_eynollah_mbreorder_directory(tmp_path, subtests, pytestconfig, caplog):
indir = testdir.joinpath('resources')
outdir = tmp_path
args = [
2025-09-25 21:47:15 +02:00
'-m', MODELS_LAYOUT,
'-di', str(indir),
'-o', str(outdir),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'mbreorder'
runner = CliRunner()
with caplog.filtering(only_eynollah):
2025-09-25 19:53:19 +02:00
result = runner.invoke(mbreorder_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
# FIXME: mbreorder has no logging!
#assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2
assert len(list(outdir.iterdir())) == 2
2025-09-25 19:53:19 +02:00
def test_run_eynollah_ocr_filename(tmp_path, subtests, pytestconfig, caplog):
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png')
2025-09-25 19:53:19 +02:00
outrenderfile.parent.mkdir()
args = [
2025-09-25 21:47:15 +02:00
'-m', MODELS_OCR,
2025-09-25 19:53:19 +02:00
'-i', str(infile),
'-dx', str(infile.parent),
'-o', str(outfile.parent),
# subtests write to same location
'--overwrite',
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.DEBUG)
def only_eynollah(logrec):
return logrec.name == 'eynollah'
runner = CliRunner()
for options in [
2025-09-26 12:54:29 +02:00
# kba Fri Sep 26 12:53:49 CEST 2025
# Disabled until NHWC/NCHW error in https://github.com/qurator-spk/eynollah/actions/runs/18019655200/job/51273541895 debugged
2025-09-26 12:54:29 +02:00
# [], # defaults
# ["-doit", str(outrenderfile.parent)],
2025-09-25 19:53:19 +02:00
["-trocr"],
]:
with subtests.test(#msg="test CLI",
options=options):
with caplog.filtering(only_eynollah):
result = runner.invoke(ocr_cli, args + options, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
# FIXME: ocr has no logging!
#assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
assert outfile.exists()
if "-doit" in options:
assert outrenderfile.exists()
#in_tree = page_from_file(str(infile)).etree
#in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
out_tree = page_from_file(str(outfile)).etree
out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
# kba Fri Sep 26 12:53:49 CEST 2025
# Disabled until NHWC/NCHW error in https://github.com/qurator-spk/eynollah/actions/runs/18019655200/job/51273541895 debugged
# def test_run_eynollah_ocr_directory(tmp_path, subtests, pytestconfig, caplog):
# indir = testdir.joinpath('resources')
# outdir = tmp_path
# args = [
# '-m', MODELS_OCR,
# '-di', str(indir),
# '-dx', str(indir),
# '-o', str(outdir),
# ]
# if pytestconfig.getoption('verbose') > 0:
# args.extend(['-l', 'DEBUG'])
# caplog.set_level(logging.INFO)
# def only_eynollah(logrec):
# return logrec.name == 'eynollah'
# runner = CliRunner()
# with caplog.filtering(only_eynollah):
# result = runner.invoke(ocr_cli, args, catch_exceptions=False)
# assert result.exit_code == 0, result.stdout
# logmsgs = [logrec.message for logrec in caplog.records]
# # FIXME: ocr has no logging!
# #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
# assert len(list(outdir.iterdir())) == 2