mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-11-09 22:24:13 +01:00
105 lines
3.7 KiB
Python
105 lines
3.7 KiB
Python
import pytest
|
|
from ocrd_modelfactory import page_from_file
|
|
from ocrd_models.constants import NAMESPACES as NS
|
|
|
|
@pytest.mark.parametrize(
|
|
"options",
|
|
[
|
|
[], # defaults
|
|
#["--allow_scaling", "--curved-line"],
|
|
["--allow_scaling", "--curved-line", "--full-layout"],
|
|
["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
|
|
["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
|
|
"--textline_light", "--light_version"],
|
|
# -ep ...
|
|
# -eoi ...
|
|
# FIXME: find out whether OCR extra was installed, otherwise skip these
|
|
["--do_ocr"],
|
|
["--do_ocr", "--light_version", "--textline_light"],
|
|
["--do_ocr", "--transformer_ocr"],
|
|
#["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
|
|
["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
|
|
# --skip_layout_and_reading_order
|
|
], ids=str)
|
|
def test_run_eynollah_layout_filename(
|
|
tmp_path,
|
|
run_eynollah_ok_and_check_logs,
|
|
resources_dir,
|
|
options,
|
|
):
|
|
infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
|
|
outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
|
|
run_eynollah_ok_and_check_logs(
|
|
'layout',
|
|
[
|
|
'-i', str(infile),
|
|
'-o', str(outfile.parent),
|
|
] + options,
|
|
[
|
|
str(infile)
|
|
]
|
|
)
|
|
assert outfile.exists()
|
|
tree = page_from_file(str(outfile)).etree
|
|
regions = tree.xpath("//page:TextRegion", namespaces=NS)
|
|
assert len(regions) >= 2, "result is inaccurate"
|
|
regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
|
|
assert len(regions) >= 2, "result is inaccurate"
|
|
lines = tree.xpath("//page:TextLine", namespaces=NS)
|
|
assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line
|
|
|
|
@pytest.mark.parametrize(
|
|
"options",
|
|
[
|
|
["--tables"],
|
|
["--tables", "--full-layout"],
|
|
["--tables", "--full-layout", "--textline_light", "--light_version"],
|
|
], ids=str)
|
|
def test_run_eynollah_layout_filename2(
|
|
tmp_path,
|
|
resources_dir,
|
|
run_eynollah_ok_and_check_logs,
|
|
options,
|
|
):
|
|
infile = resources_dir / 'euler_rechenkunst01_1738_0025.tif'
|
|
outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
|
|
run_eynollah_ok_and_check_logs(
|
|
'layout',
|
|
[
|
|
'-i', str(infile),
|
|
'-o', str(outfile.parent),
|
|
] + options,
|
|
[
|
|
str(infile)
|
|
]
|
|
)
|
|
assert outfile.exists()
|
|
tree = page_from_file(str(outfile)).etree
|
|
regions = tree.xpath("//page:TextRegion", namespaces=NS)
|
|
assert len(regions) >= 2, "result is inaccurate"
|
|
regions = tree.xpath("//page:TableRegion", namespaces=NS)
|
|
# model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
|
|
assert len(regions) >= 1, "result is inaccurate"
|
|
regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
|
|
assert len(regions) >= 2, "result is inaccurate"
|
|
lines = tree.xpath("//page:TextLine", namespaces=NS)
|
|
assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line
|
|
|
|
def test_run_eynollah_layout_directory(
|
|
tmp_path,
|
|
resources_dir,
|
|
run_eynollah_ok_and_check_logs,
|
|
):
|
|
outdir = tmp_path
|
|
run_eynollah_ok_and_check_logs(
|
|
'layout',
|
|
[
|
|
'-di', str(resources_dir),
|
|
'-o', str(outdir),
|
|
],
|
|
[
|
|
'Job done in',
|
|
'All jobs done in',
|
|
]
|
|
)
|
|
assert len(list(outdir.iterdir())) == 2
|