# eynollah/tests/cli_tests/test_ocr.py
# (65 lines, 1.9 KiB, Python; last changed 2025-10-29 16:20:30 +01:00)

import pytest
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@pytest.mark.parametrize(
    "options",
    [
        ["-trocr"],
        [],  # defaults
        ["-doit"],  # the render directory argument is inserted inside the test
    ],
    ids=str,
)
def test_run_eynollah_ocr_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """Run the ``ocr`` subcommand on a single image and check its results.

    The command is invoked through ``run_eynollah_ok_and_check_logs`` with
    ``-i`` set to the sample image, ``-dx`` set to the image's directory and
    ``-o`` set to ``tmp_path``, followed by the parametrized ``options``.

    The test then asserts that:

    * the output XML file exists in ``tmp_path``,
    * when ``-doit`` was requested, the rendered PNG exists in the
      ``render`` subdirectory,
    * the output contains at least two text line results whose combined
      length exceeds 100 characters.
    """
    infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    outrenderfile = tmp_path / 'render' / 'kant_aufklaerung_1784_0020.png'
    outrenderfile.parent.mkdir()
    # Work on a copy: the list object belongs to the parametrize decorator,
    # so inserting into it in place would leak into any later use of the
    # same parameter set (e.g. a rerun of this test).
    options = list(options)
    if "-doit" in options:
        # "-doit" takes the render output directory as its value, which is
        # only known once tmp_path is available.
        options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
    run_eynollah_ok_and_check_logs(
        'ocr',
        [
            '-i', str(infile),
            '-dx', str(infile.parent),
            '-o', str(outfile.parent),
        ] + options,
        [
            # FIXME: ocr has no logging!
        ]
    )
    assert outfile.exists()
    if "-doit" in options:
        assert outrenderfile.exists()
    #in_tree = page_from_file(str(infile)).etree
    #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    out_tree = page_from_file(str(outfile)).etree
    # Take the text of the last TextEquiv of every TextLine.
    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)


def test_run_eynollah_ocr_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
):
    """Run the ``ocr`` subcommand in directory mode and count its results.

    The command is invoked through ``run_eynollah_ok_and_check_logs`` with
    both ``-di`` and ``-dx`` set to ``resources_dir`` and ``-o`` set to
    ``tmp_path``. The test passes when exactly two entries exist in the
    output directory afterwards.
    """
    outdir = tmp_path
    run_eynollah_ok_and_check_logs(
        'ocr',
        [
            '-di', str(resources_dir),
            '-dx', str(resources_dir),
            '-o', str(outdir),
        ],
        [
            # FIXME: ocr has no logging!
        ]
    )
    # Keep the listing so a failure shows which entries were actually written.
    outputs = sorted(outdir.iterdir())
    assert len(outputs) == 2, ("unexpected number of results", outputs)