Merge branch 'main' into ro-fixes and resolve conflicts…

major conflicts resolved manually:

- branches for non-`light` segmentation already removed in main
- Keras/TF setup and no TF1 sessions, esp. in new ModelZoo
- changes to binarizer and its CLI (`mode`, `overwrite`, `run_single()`)
- writer: `build...` w/ kwargs instead of positional
- training for segmentation/binarization/enhancement tasks:
  * drop unused `generate_data_from_folder()`
  * simplify `preprocess_imgs()`: turn `preprocess_img()`, `get_patches()`
    and `get_patches_num_scale_new()` into generators, only writing
    result files in the caller (top-level loop) instead of passing
    output directories and file counter
- training for new OCR task:
  * `train`: put keys into additional `config_params` where they belong,
    resp. (conditioned under existing keys), and w/ better documentation
  * `train`: add new keys as kwargs to `run()` to make usable
  * `utils`: instead of custom data loader `data_gen_ocr()`, re-use
    existing `preprocess_imgs()` (for cfg capture and top-level loop),
    but extended w/ new kwargs and calling new `preprocess_img_ocr()`;
    the latter as single-image generator (also much simplified)
  * `train`: use tf.data loader pipeline from that generator w/ standard
    mechanisms for batching, shuffling, prefetching etc.
  * `utils` and `train`: instead of `vectorize_label`, use `Dataset.padded_batch`
  * add TensorBoard callback and re-use our checkpoint callback
  * also use standard Keras top-level loop for training

still problematic (substantially unresolved):
- `Patches` now only w/ fixed implicit size
  (ignoring training config params)
- `PatchEncoder` now only w/ fixed implicit num patches and projection dim
  (ignoring training config params)
This commit is contained in:
Robert Sachunsky 2026-02-07 14:05:56 +01:00
commit 27f43c175f
77 changed files with 5597 additions and 4952 deletions

View file

@ -0,0 +1,47 @@
from typing import List
import pytest
import logging
from click.testing import CliRunner, Result
from eynollah.cli import main as eynollah_cli
@pytest.fixture
def run_eynollah_ok_and_check_logs(
    pytestconfig,
    caplog,
    model_dir,
    eynollah_subcommands,
    eynollah_log_filter,
):
    """
    Provide a helper that runs an eynollah CLI subcommand and checks its logs.

    The returned callable prepends the model directory (``-m``) and, when
    pytest itself runs verbosely, a DEBUG log level (``-l DEBUG``) to the
    given arguments, invokes the Click entry point, asserts a zero exit code
    and finally compares the captured log messages with ``expected_logs``.

    NOTE(review): the log check succeeds as soon as *any* captured message
    starts with *any* of the expected prefixes -- it does not require every
    entry of ``expected_logs`` to show up. An empty ``expected_logs`` skips
    the log check entirely.
    """

    def _run_click_ok_logs(
        subcommand: 'str',
        args: List[str],
        expected_logs: List[str],
    ) -> Result:
        assert subcommand in eynollah_subcommands, f'subcommand {subcommand} must be one of {eynollah_subcommands}'
        # global options go first, then the subcommand with its own arguments
        cli_args = ['-m', model_dir, subcommand, *args]
        if pytestconfig.getoption('verbose') > 0:
            cli_args = ['-l', 'DEBUG', *cli_args]
        caplog.set_level(logging.INFO)
        with caplog.filtering(eynollah_log_filter):
            # let exceptions propagate so pytest reports the real traceback
            result = CliRunner().invoke(eynollah_cli, cli_args, catch_exceptions=False)
        assert result.exit_code == 0, result.stdout
        if not expected_logs:
            return result
        logmsgs = [record.message for record in caplog.records]
        assert any(
            logmsg.startswith(needle)
            for needle in expected_logs
            for logmsg in logmsgs
        ), f'{expected_logs} not in {logmsgs}'
        return result

    return _run_click_ok_logs

View file

@ -0,0 +1,53 @@
import pytest
from PIL import Image
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["--no-patches"],
    ], ids=str)
def test_run_eynollah_binarization_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """Binarize one image into an explicit output file and compare image sizes."""
    infile = resources_dir / '2files/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.png'
    cli_args = ['-i', str(infile), '-o', str(outfile)] + options
    expected_logs = ['Loaded model']
    run_eynollah_ok_and_check_logs('binarization', cli_args, expected_logs)
    assert outfile.exists()
    # the binarized result must have the same pixel dimensions as the input
    with Image.open(infile) as original_img:
        size_before = original_img.size
    with Image.open(outfile) as binarized_img:
        size_after = binarized_img.size
    assert size_before == size_after
def test_run_eynollah_binarization_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    image_resources,
):
    """Binarize the `2files` input directory and expect two entries in the output directory."""
    outdir = tmp_path
    first_image = image_resources[0]
    second_image = image_resources[1]
    cli_args = ['-di', str(resources_dir / '2files'), '-o', str(outdir)]
    expected_logs = [
        f'Binarizing [ 1/2] {first_image.name}',
        f'Binarizing [ 2/2] {second_image.name}',
    ]
    run_eynollah_ok_and_check_logs('binarization', cli_args, expected_logs)
    produced = list(outdir.iterdir())
    assert len(produced) == 2

View file

@ -0,0 +1,52 @@
import pytest
from PIL import Image
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["-sos"],
    ], ids=str)
def test_run_eynollah_enhancement_filename(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
    options,
):
    """
    Enhance one image into an output directory and compare image sizes.

    The output size must equal the input size exactly when `-sos` is among
    the options, and must differ otherwise.
    """
    infile = resources_dir / '2files/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.png'
    # `-o` takes the output directory here, not the output file itself
    cli_args = ['-i', str(infile), '-o', str(outfile.parent)] + options
    expected_logs = ['Image was enhanced']
    run_eynollah_ok_and_check_logs('enhancement', cli_args, expected_logs)
    with Image.open(infile) as original_img:
        size_before = original_img.size
    with Image.open(outfile) as enhanced_img:
        size_after = enhanced_img.size
    same_size = size_before == size_after
    assert same_size == ("-sos" in options)
def test_run_eynollah_enhancement_directory(
    tmp_path,
    resources_dir,
    image_resources,
    run_eynollah_ok_and_check_logs,
):
    """Enhance the `2files` input directory and expect two entries in the output directory."""
    outdir = tmp_path
    first_image = image_resources[0]
    second_image = image_resources[1]
    cli_args = ['-di', str(resources_dir/ '2files'), '-o', str(outdir)]
    expected_logs = [
        f'Image {first_image} was enhanced',
        f'Image {second_image} was enhanced',
    ]
    run_eynollah_ok_and_check_logs('enhancement', cli_args, expected_logs)
    produced = list(outdir.iterdir())
    assert len(produced) == 2

View file

@ -0,0 +1,119 @@
import pytest
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        #["--allow_scaling", "--curved-line"],
        ["--allow_scaling", "--curved-line", "--full-layout"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
        # -ep ...
        # -eoi ...
        # --skip_layout_and_reading_order
    ], ids=str)
def test_run_eynollah_layout_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """Run `layout` on one image and sanity-check region and line counts of the result."""
    infile = resources_dir / '2files/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    cli_args = ['-i', str(infile), '-o', str(outfile.parent)] + options
    # some log message is expected to start with the input path
    expected_logs = [str(infile)]
    run_eynollah_ok_and_check_logs('layout', cli_args, expected_logs)
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    text_regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(text_regions) >= 2, "result is inaccurate"
    separators = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(separators) >= 2, "result is inaccurate"
    text_lines = tree.xpath("//page:TextLine", namespaces=NS)
    # 29 paragraph lines, 1 page and 1 catch-word line
    assert len(text_lines) == 31, "result is inaccurate"
@pytest.mark.parametrize(
    "options",
    [
        ["--tables"],
        ["--tables", "--full-layout"],
    ], ids=str)
def test_run_eynollah_layout_filename2(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
    options,
):
    """Run `layout` with table detection on one image and sanity-check the result."""
    infile = resources_dir / '2files/euler_rechenkunst01_1738_0025.tif'
    outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
    cli_args = ['-i', str(infile), '-o', str(outfile.parent)] + options
    # some log message is expected to start with the input path
    expected_logs = [str(infile)]
    run_eynollah_ok_and_check_logs('layout', cli_args, expected_logs)
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    text_regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(text_regions) >= 2, "result is inaccurate"
    tables = tree.xpath("//page:TableRegion", namespaces=NS)
    # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
    assert len(tables) >= 1, "result is inaccurate"
    separators = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(separators) >= 2, "result is inaccurate"
    text_lines = tree.xpath("//page:TextLine", namespaces=NS)
    # mostly table (if detected correctly), but 1 page and 1 catch-word line
    assert len(text_lines) >= 2, "result is inaccurate"
def test_run_eynollah_layout_directory(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """Run `layout` on the `2files` input directory and expect two entries in the output directory."""
    outdir = tmp_path
    cli_args = ['-di', str(resources_dir / '2files'), '-o', str(outdir)]
    expected_logs = ['Job done in', 'All jobs done in']
    run_eynollah_ok_and_check_logs('layout', cli_args, expected_logs)
    produced = list(outdir.iterdir())
    assert len(produced) == 2
# def test_run_eynollah_layout_marginalia(
# tmp_path,
# resources_dir,
# run_eynollah_ok_and_check_logs,
# ):
# outdir = tmp_path
# outfile = outdir / 'estor_rechtsgelehrsamkeit02_1758_0880_800px.xml'
# run_eynollah_ok_and_check_logs(
# 'layout',
# [
# '-i', str(resources_dir / 'estor_rechtsgelehrsamkeit02_1758_0880_800px.jpg'),
# '-o', str(outdir),
# ],
# [
# 'Job done in',
# 'All jobs done in',
# ]
# )
# assert outfile.exists()
# tree = page_from_file(str(outfile)).etree
# regions = tree.xpath('//page:TextRegion[@type="marginalia"]', namespaces=NS)
# assert len(regions) == 5, "expected 5 marginalia regions"

View file

@ -0,0 +1,47 @@
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
def test_run_eynollah_mbreorder_filename(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """Run `machine-based-reading-order` on one PAGE file and check the resulting region order."""
    infile = resources_dir / '2files/kant_aufklaerung_1784_0020.xml'
    outfile = tmp_path /'kant_aufklaerung_1784_0020.xml'
    cli_args = ['-i', str(infile), '-o', str(outfile.parent)]
    expected_logs = []  # FIXME: mbreorder has no logging!
    run_eynollah_ok_and_check_logs('machine-based-reading-order', cli_args, expected_logs)
    assert outfile.exists()
    # disabled: comparison against the reading order of the input file
    #in_tree = page_from_file(str(infile)).etree
    #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    result_tree = page_from_file(str(outfile)).etree
    result_order = result_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    #assert len(out_order) >= 2, "result is inaccurate"
    #assert in_order != out_order
    assert result_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
def test_run_eynollah_mbreorder_directory(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """Run `machine-based-reading-order` on the `2files` directory and expect two output entries."""
    outdir = tmp_path
    cli_args = ['-di', str(resources_dir / '2files'), '-o', str(outdir)]
    expected_logs = []  # FIXME: mbreorder has no logging!
    run_eynollah_ok_and_check_logs('machine-based-reading-order', cli_args, expected_logs)
    produced = list(outdir.iterdir())
    assert len(produced) == 2

View file

@ -0,0 +1,64 @@
import pytest
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@pytest.mark.parametrize(
    "options",
    [
        ["-trocr"],
        [],  # defaults
        ["-doit"],  # the render directory argument is filled in by the test
    ], ids=str)
def test_run_eynollah_ocr_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """
    Run `ocr` on one image (with its PAGE file next to it) and check the text result.

    With `-doit`, a `render` directory below `tmp_path` is passed as the
    option's value and a rendered PNG is expected there as well.
    """
    infile = resources_dir / '2files/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    outrenderfile = tmp_path / 'render' / 'kant_aufklaerung_1784_0020.png'
    outrenderfile.parent.mkdir()
    # Work on a copy: `options` is the very list object owned by the
    # parametrize decorator, so inserting into it in place would leak the
    # temporary path into any later run of the same parameter set.
    options = list(options)
    if "-doit" in options:
        options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
    run_eynollah_ok_and_check_logs(
        'ocr',
        [
            '-i', str(infile),
            '-dx', str(infile.parent),
            '-o', str(outfile.parent),
        ] + options,
        [
            # FIXME: ocr has no logging!
        ]
    )
    assert outfile.exists()
    if "-doit" in options:
        assert outrenderfile.exists()
    out_tree = page_from_file(str(outfile)).etree
    # only the last TextEquiv of each line is considered
    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
def test_run_eynollah_ocr_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
):
    """Run `ocr` on the `2files` directory (images and XML) and expect two output entries."""
    outdir = tmp_path
    indir = str(resources_dir / '2files')
    # the same directory serves as image input (-di) and XML input (-dx)
    cli_args = ['-di', indir, '-dx', indir, '-o', str(outdir)]
    expected_logs = []  # FIXME: ocr has no logging!
    run_eynollah_ok_and_check_logs('ocr', cli_args, expected_logs)
    produced = list(outdir.iterdir())
    assert len(produced) == 2