eynollah/tests/cli_tests/test_layout.py

128 lines
4.4 KiB
Python

import pytest
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@pytest.mark.parametrize(
"options",
[
[], # defaults
#["--allow_scaling", "--curved-line"],
["--allow_scaling", "--curved-line", "--full-layout"],
["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
"--textline_light", "--light_version"],
# -ep ...
# -eoi ...
# FIXME: find out whether OCR extra was installed, otherwise skip these
["--do_ocr"],
["--do_ocr", "--light_version", "--textline_light"],
["--do_ocr", "--transformer_ocr"],
#["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
# --skip_layout_and_reading_order
], ids=str)
def test_run_eynollah_layout_filename(
tmp_path,
run_eynollah_ok_and_check_logs,
resources_dir,
options,
):
infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
run_eynollah_ok_and_check_logs(
'layout',
[
'-i', str(infile),
'-o', str(outfile.parent),
] + options,
[
str(infile)
]
)
assert outfile.exists()
tree = page_from_file(str(outfile)).etree
regions = tree.xpath("//page:TextRegion", namespaces=NS)
assert len(regions) >= 2, "result is inaccurate"
regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
assert len(regions) >= 2, "result is inaccurate"
lines = tree.xpath("//page:TextLine", namespaces=NS)
assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line
@pytest.mark.parametrize(
"options",
[
["--tables"],
["--tables", "--full-layout"],
["--tables", "--full-layout", "--textline_light", "--light_version"],
], ids=str)
def test_run_eynollah_layout_filename2(
tmp_path,
resources_dir,
run_eynollah_ok_and_check_logs,
options,
):
infile = resources_dir / 'euler_rechenkunst01_1738_0025.tif'
outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
run_eynollah_ok_and_check_logs(
'layout',
[
'-i', str(infile),
'-o', str(outfile.parent),
] + options,
[
str(infile)
]
)
assert outfile.exists()
tree = page_from_file(str(outfile)).etree
regions = tree.xpath("//page:TextRegion", namespaces=NS)
assert len(regions) >= 2, "result is inaccurate"
regions = tree.xpath("//page:TableRegion", namespaces=NS)
# model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
assert len(regions) >= 1, "result is inaccurate"
regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
assert len(regions) >= 2, "result is inaccurate"
lines = tree.xpath("//page:TextLine", namespaces=NS)
assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line
def test_run_eynollah_layout_directory(
tmp_path,
resources_dir,
run_eynollah_ok_and_check_logs,
):
outdir = tmp_path
run_eynollah_ok_and_check_logs(
'layout',
[
'-di', str(resources_dir),
'-o', str(outdir),
],
[
'Job done in',
'All jobs done in',
]
)
assert len(list(outdir.iterdir())) == 2
# def test_run_eynollah_layout_marginalia(
# tmp_path,
# resources_dir,
# run_eynollah_ok_and_check_logs,
# ):
# outdir = tmp_path
# outfile = outdir / 'estor_rechtsgelehrsamkeit02_1758_0880_800px.xml'
# run_eynollah_ok_and_check_logs(
# 'layout',
# [
# '-i', str(resources_dir / 'estor_rechtsgelehrsamkeit02_1758_0880_800px.jpg'),
# '-o', str(outdir),
# ],
# [
# 'Job done in',
# 'All jobs done in',
# ]
# )
# assert outfile.exists()
# tree = page_from_file(str(outfile)).etree
# regions = tree.xpath('//page:TextRegion[type="marginalia"]', namespaces=NS)
# assert len(regions) == 5, "expected 5 marginalia regions"