mirror of
https://github.com/qurator-spk/eynollah.git
synced 2026-02-21 00:41:56 +01:00
Merge branch 'main' into ro-fixes and resolve conflicts…
major conflicts resolved manually:
- branches for non-`light` segmentation already removed in main
- Keras/TF setup and no TF1 sessions, esp. in new ModelZoo
- changes to binarizer and its CLI (`mode`, `overwrite`, `run_single()`)
- writer: `build...` w/ kwargs instead of positional
- training for segmentation/binarization/enhancement tasks:
* drop unused `generate_data_from_folder()`
* simplify `preprocess_imgs()`: turn `preprocess_img()`, `get_patches()`
and `get_patches_num_scale_new()` into generators, only writing
result files in the caller (top-level loop) instead of passing
output directories and file counter
- training for new OCR task:
* `train`: put keys into additional `config_params` where they belong,
resp. (conditioned under existing keys), and w/ better documentation
* `train`: add new keys as kwargs to `run()` to make usable
* `utils`: instead of custom data loader `data_gen_ocr()`, re-use
existing `preprocess_imgs()` (for cfg capture and top-level loop),
but extended w/ new kwargs and calling new `preprocess_img_ocr()`;
the latter as single-image generator (also much simplified)
* `train`: use tf.data loader pipeline from that generator w/ standard
mechanisms for batching, shuffling, prefetching etc.
* `utils` and `train`: instead of `vectorize_label`, use `Dataset.padded_batch`
* add TensorBoard callback and re-use our checkpoint callback
* also use standard Keras top-level loop for training
still problematic (substantially unresolved):
- `Patches` now only w/ fixed implicit size
(ignoring training config params)
- `PatchEncoder` now only w/ fixed implicit num patches and projection dim
(ignoring training config params)
This commit is contained in:
commit
27f43c175f
77 changed files with 5597 additions and 4952 deletions
47
tests/cli_tests/conftest.py
Normal file
47
tests/cli_tests/conftest.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
from typing import List
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from click.testing import CliRunner, Result
|
||||
from eynollah.cli import main as eynollah_cli
|
||||
|
||||
|
||||
@pytest.fixture
def run_eynollah_ok_and_check_logs(
    pytestconfig,
    caplog,
    model_dir,
    eynollah_subcommands,
    eynollah_log_filter,
):
    """
    Provide a helper that runs an eynollah CLI subcommand and checks the result.

    The returned callable builds the command line as ``-m <model_dir>
    <subcommand> <args...>`` (prefixed with ``-l DEBUG`` when pytest runs in
    verbose mode), invokes it through a Click ``CliRunner`` and asserts a zero
    exit code. If ``expected_logs`` is non-empty, it also asserts that at
    least one captured log message starts with one of the given fragments.
    """

    def _run_click_ok_logs(
        subcommand: 'str',
        args: List[str],
        expected_logs: List[str],
    ) -> Result:
        assert subcommand in eynollah_subcommands, f'subcommand {subcommand} must be one of {eynollah_subcommands}'
        # global options first, then the subcommand and its own arguments
        cmdline = ['-m', model_dir, subcommand]
        cmdline.extend(args)
        if pytestconfig.getoption('verbose') > 0:
            cmdline[:0] = ['-l', 'DEBUG']
        caplog.set_level(logging.INFO)
        # exceptions are not caught, so they surface directly in the test
        with caplog.filtering(eynollah_log_filter):
            result = CliRunner().invoke(eynollah_cli, cmdline, catch_exceptions=False)
        assert result.exit_code == 0, result.stdout
        if expected_logs:
            logmsgs = [record.message for record in caplog.records]
            found = any(
                logmsg.startswith(needle)
                for needle in expected_logs
                for logmsg in logmsgs
            )
            assert found, f'{expected_logs} not in {logmsgs}'
        return result

    return _run_click_ok_logs
|
||||
|
||||
53
tests/cli_tests/test_binarization.py
Normal file
53
tests/cli_tests/test_binarization.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["--no-patches"],
    ], ids=str)
def test_run_eynollah_binarization_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """Binarize a single image file and check that the result keeps its size."""
    infile = resources_dir / '2files/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.png'
    cli_args = ['-i', str(infile), '-o', str(outfile)] + options
    run_eynollah_ok_and_check_logs('binarization', cli_args, ['Loaded model'])
    assert outfile.exists()
    # the binarized image must have the same dimensions as the input
    with Image.open(infile) as original_img, Image.open(outfile) as binarized_img:
        assert original_img.size == binarized_img.size
|
||||
|
||||
def test_run_eynollah_binarization_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    image_resources,
):
    """Binarize a whole input directory and check that two results are written."""
    outdir = tmp_path
    cli_args = [
        '-di', str(resources_dir / '2files'),
        '-o', str(outdir),
    ]
    progress_logs = [
        f'Binarizing [ 1/2] {image_resources[0].name}',
        f'Binarizing [ 2/2] {image_resources[1].name}',
    ]
    run_eynollah_ok_and_check_logs('binarization', cli_args, progress_logs)
    produced = list(outdir.iterdir())
    assert len(produced) == 2
|
||||
52
tests/cli_tests/test_enhance.py
Normal file
52
tests/cli_tests/test_enhance.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["-sos"],
    ], ids=str)
def test_run_eynollah_enhancement_filename(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
    options,
):
    """
    Enhance a single image file and compare input and output dimensions.

    The sizes are expected to be equal exactly when ``-sos`` is passed.
    """
    infile = resources_dir / '2files/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.png'
    # the CLI receives the output directory, not the output file
    cli_args = ['-i', str(infile), '-o', str(outfile.parent)] + options
    run_eynollah_ok_and_check_logs('enhancement', cli_args, ['Image was enhanced'])
    with Image.open(infile) as original_img:
        original_size = original_img.size
    with Image.open(outfile) as enhanced_img:
        enhanced_size = enhanced_img.size
    same_size = original_size == enhanced_size
    scale_kept = "-sos" in options
    assert same_size == scale_kept
|
||||
|
||||
def test_run_eynollah_enhancement_directory(
    tmp_path,
    resources_dir,
    image_resources,
    run_eynollah_ok_and_check_logs,
):
    """Enhance a whole input directory and check that two results are written."""
    outdir = tmp_path
    cli_args = [
        '-di', str(resources_dir / '2files'),
        '-o', str(outdir),
    ]
    enhanced_logs = [
        f'Image {image_resources[0]} was enhanced',
        f'Image {image_resources[1]} was enhanced',
    ]
    run_eynollah_ok_and_check_logs('enhancement', cli_args, enhanced_logs)
    produced = list(outdir.iterdir())
    assert len(produced) == 2
|
||||
119
tests/cli_tests/test_layout.py
Normal file
119
tests/cli_tests/test_layout.py
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
import pytest
|
||||
from ocrd_modelfactory import page_from_file
|
||||
from ocrd_models.constants import NAMESPACES as NS
|
||||
|
||||
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        # not covered yet: ["--allow_scaling", "--curved-line"]
        ["--allow_scaling", "--curved-line", "--full-layout"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
        # not covered yet: -ep ..., -eoi ..., --skip_layout_and_reading_order
    ], ids=str)
def test_run_eynollah_layout_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """Run layout analysis on a single image and sanity-check the PAGE-XML result."""
    infile = resources_dir / '2files/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    # the CLI receives the output directory, not the output file
    cli_args = ['-i', str(infile), '-o', str(outfile.parent)] + options
    run_eynollah_ok_and_check_logs('layout', cli_args, [str(infile)])
    assert outfile.exists()
    page_tree = page_from_file(str(outfile)).etree

    def count(xpath):
        return len(page_tree.xpath(xpath, namespaces=NS))

    assert count("//page:TextRegion") >= 2, "result is inaccurate"
    assert count("//page:SeparatorRegion") >= 2, "result is inaccurate"
    # 29 paragraph lines, 1 page and 1 catch-word line
    assert count("//page:TextLine") == 31, "result is inaccurate"
|
||||
|
||||
@pytest.mark.parametrize(
    "options",
    [
        ["--tables"],
        ["--tables", "--full-layout"],
    ], ids=str)
def test_run_eynollah_layout_filename2(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
    options,
):
    """Run layout analysis with table detection on a single image and check the result."""
    infile = resources_dir / '2files/euler_rechenkunst01_1738_0025.tif'
    outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
    # the CLI receives the output directory, not the output file
    cli_args = ['-i', str(infile), '-o', str(outfile.parent)] + options
    run_eynollah_ok_and_check_logs('layout', cli_args, [str(infile)])
    assert outfile.exists()
    page_tree = page_from_file(str(outfile)).etree

    def count(xpath):
        return len(page_tree.xpath(xpath, namespaces=NS))

    assert count("//page:TextRegion") >= 2, "result is inaccurate"
    # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
    assert count("//page:TableRegion") >= 1, "result is inaccurate"
    assert count("//page:SeparatorRegion") >= 2, "result is inaccurate"
    # mostly table (if detected correctly), but 1 page and 1 catch-word line
    assert count("//page:TextLine") >= 2, "result is inaccurate"
|
||||
|
||||
def test_run_eynollah_layout_directory(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """Run layout analysis on a whole input directory and check that two results are written."""
    outdir = tmp_path
    cli_args = [
        '-di', str(resources_dir / '2files'),
        '-o', str(outdir),
    ]
    completion_logs = ['Job done in', 'All jobs done in']
    run_eynollah_ok_and_check_logs('layout', cli_args, completion_logs)
    produced = list(outdir.iterdir())
    assert len(produced) == 2
|
||||
|
||||
# def test_run_eynollah_layout_marginalia(
|
||||
# tmp_path,
|
||||
# resources_dir,
|
||||
# run_eynollah_ok_and_check_logs,
|
||||
# ):
|
||||
# outdir = tmp_path
|
||||
# outfile = outdir / 'estor_rechtsgelehrsamkeit02_1758_0880_800px.xml'
|
||||
# run_eynollah_ok_and_check_logs(
|
||||
# 'layout',
|
||||
# [
|
||||
# '-i', str(resources_dir / 'estor_rechtsgelehrsamkeit02_1758_0880_800px.jpg'),
|
||||
# '-o', str(outdir),
|
||||
# ],
|
||||
# [
|
||||
# 'Job done in',
|
||||
# 'All jobs done in',
|
||||
# ]
|
||||
# )
|
||||
# assert outfile.exists()
|
||||
# tree = page_from_file(str(outfile)).etree
|
||||
# regions = tree.xpath('//page:TextRegion[@type="marginalia"]', namespaces=NS)
|
||||
# assert len(regions) == 5, "expected 5 marginalia regions"
|
||||
47
tests/cli_tests/test_mbreorder.py
Normal file
47
tests/cli_tests/test_mbreorder.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
from ocrd_modelfactory import page_from_file
|
||||
from ocrd_models.constants import NAMESPACES as NS
|
||||
|
||||
def test_run_eynollah_mbreorder_filename(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """Run machine-based reading order on a single PAGE-XML file and check the resulting order."""
    infile = resources_dir / '2files/kant_aufklaerung_1784_0020.xml'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    # the CLI receives the output directory, not the output file
    cli_args = ['-i', str(infile), '-o', str(outfile.parent)]
    # FIXME: mbreorder has no logging!
    expected_logs = []
    run_eynollah_ok_and_check_logs('machine-based-reading-order', cli_args, expected_logs)
    assert outfile.exists()
    # The result is compared against a fixed order; a comparison with the
    # input file's order is not done here.
    out_tree = page_from_file(str(outfile)).etree
    out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
|
||||
|
||||
def test_run_eynollah_mbreorder_directory(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """Run machine-based reading order on a whole directory and check that two results are written."""
    outdir = tmp_path
    cli_args = [
        '-di', str(resources_dir / '2files'),
        '-o', str(outdir),
    ]
    # FIXME: mbreorder has no logging!
    expected_logs = []
    run_eynollah_ok_and_check_logs('machine-based-reading-order', cli_args, expected_logs)
    produced = list(outdir.iterdir())
    assert len(produced) == 2
|
||||
|
||||
64
tests/cli_tests/test_ocr.py
Normal file
64
tests/cli_tests/test_ocr.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
import pytest
|
||||
from ocrd_modelfactory import page_from_file
|
||||
from ocrd_models.constants import NAMESPACES as NS
|
||||
|
||||
@pytest.mark.parametrize(
    "options",
    [
        ["-trocr"],
        [],  # defaults
        ["-doit"],  # the render directory is filled in by the test itself
    ], ids=str)
def test_run_eynollah_ocr_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """
    Run OCR on a single image and sanity-check the recognised text.

    When ``-doit`` is among the options, a render directory below
    ``tmp_path`` is passed as its value and the rendered image is expected
    to exist afterwards.
    """
    infile = resources_dir / '2files/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    outrenderfile = tmp_path / 'render' / 'kant_aufklaerung_1784_0020.png'
    outrenderfile.parent.mkdir()
    render = "-doit" in options
    if render:
        # Build a new list rather than inserting in place: `options` is the
        # very list object held by the parametrize decorator, so mutating it
        # would leak the tmp_path-specific value into any repeated run.
        pos = options.index("-doit") + 1
        options = options[:pos] + [str(outrenderfile.parent)] + options[pos:]
    run_eynollah_ok_and_check_logs(
        'ocr',
        [
            '-i', str(infile),
            '-dx', str(infile.parent),
            '-o', str(outfile.parent),
        ] + options,
        [
            # FIXME: ocr has no logging!
        ]
    )
    assert outfile.exists()
    if render:
        assert outrenderfile.exists()
    out_tree = page_from_file(str(outfile)).etree
    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
|
||||
|
||||
def test_run_eynollah_ocr_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
):
    """Run OCR on a whole input directory and check that two results are written."""
    outdir = tmp_path
    indir = str(resources_dir / '2files')
    # the same directory is passed for both -di and -dx
    cli_args = [
        '-di', indir,
        '-dx', indir,
        '-o', str(outdir),
    ]
    # FIXME: ocr has no logging!
    expected_logs = []
    run_eynollah_ok_and_check_logs('ocr', cli_args, expected_logs)
    produced = list(outdir.iterdir())
    assert len(produced) == 2
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue