From ec1fd93dad864e0267b68d7528cd5ba5978da957 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 23 Oct 2025 11:58:23 +0200 Subject: [PATCH] wip --- .github/workflows/test-eynollah.yml | 6 +- Makefile | 45 +--- src/eynollah/cli_models.py | 6 +- src/eynollah/model_zoo/default_specs.py | 4 +- tests/test_run.py | 102 -------- tests/test_run_layout.py | 330 ++++++++++++++++++++++++ 6 files changed, 349 insertions(+), 144 deletions(-) create mode 100644 tests/test_run_layout.py diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 5b22fd1..dae190a 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -31,7 +31,7 @@ jobs: src: "./src" - name: Try to restore models_eynollah - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v4 id: all_model_cache with: path: models_eynollah @@ -40,8 +40,8 @@ jobs: - name: Download models if: steps.all_model_cache.outputs.cache-hit != 'true' run: | - make models - ls -la models_eynollah + make models + ls -la models_eynollah - uses: actions/cache/save@v4 if: steps.all_model_cache.outputs.cache-hit != 'true' diff --git a/Makefile b/Makefile index b1cbcc4..1e7f2dd 100644 --- a/Makefile +++ b/Makefile @@ -14,17 +14,9 @@ WGET = wget -O #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz #SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1 -SEG_MODEL := https://zenodo.org/records/17295988/files/models_layout_v0_6_0.tar.gz?download=1 -SEG_MODELFILE = $(notdir $(patsubst %?download=1,%,$(SEG_MODEL))) -SEG_MODELNAME = $(SEG_MODELFILE:%.tar.gz=%) - -BIN_MODEL := https://zenodo.org/records/17295988/files/models_binarization_v0_6_0.tar.gz?download=1 -BIN_MODELFILE = $(notdir $(BIN_MODEL)) -BIN_MODELNAME := default-2021-03-09 - -OCR_MODEL := 
https://zenodo.org/records/17295988/files/models_ocr_v0_6_0.tar.gz?download=1 -OCR_MODELFILE = $(notdir $(patsubst %?download=1,%,$(OCR_MODEL))) -OCR_MODELNAME = $(OCR_MODELFILE:%.tar.gz=%) +EYNOLLAH_MODELS_URL := https://zenodo.org/records/17295988/files/models_all_v0_7_0.zip +EYNOLLAH_MODELS_ZIP = $(notdir $(EYNOLLAH_MODELS_URL)) +EYNOLLAH_MODELS_DIR = $(EYNOLLAH_MODELS_ZIP:%.zip=%) PYTEST_ARGS ?= -vv --isolate @@ -49,33 +41,23 @@ help: @echo " EXTRAS comma-separated list of features (like 'OCR,plotting') for 'install' [$(EXTRAS)]" @echo " DOCKER_TAG Docker image tag for 'docker' [$(DOCKER_TAG)]" @echo " PYTEST_ARGS pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. [$(PYTEST_ARGS)]" - @echo " SEG_MODEL URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]" - @echo " BIN_MODEL URL of 'models' archive to download for binarization 'test' [$(BIN_MODEL)]" - @echo " OCR_MODEL URL of 'models' archive to download for binarization 'test' [$(OCR_MODEL)]" + @echo " EYNOLLAH_MODELS_URL URL of archive of all models [$(EYNOLLAH_MODELS_URL)]" @echo "" # END-EVAL # Download and extract models to $(PWD)/models_layout_v0_6_0 -models: $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME) +models: $(EYNOLLAH_MODELS_DIR) # do not download these files if we already have the directories -.INTERMEDIATE: $(BIN_MODELFILE) $(SEG_MODELFILE) $(OCR_MODELFILE) +.INTERMEDIATE: $(EYNOLLAH_MODELS_ZIP) -$(BIN_MODELFILE): - $(WGET) $@ $(BIN_MODEL) -$(SEG_MODELFILE): - $(WGET) $@ $(SEG_MODEL) -$(OCR_MODELFILE): - $(WGET) $@ $(OCR_MODEL) +$(EYNOLLAH_MODELS_ZIP): + $(WGET) $@ $(EYNOLLAH_MODELS_URL) -$(BIN_MODELNAME): $(BIN_MODELFILE) - tar zxf $< -$(SEG_MODELNAME): $(SEG_MODELFILE) - tar zxf $< -$(OCR_MODELNAME): $(OCR_MODELFILE) - tar zxf $< +$(EYNOLLAH_MODELS_DIR): $(EYNOLLAH_MODELS_ZIP) + unzip $< build: $(PIP) install build @@ -89,13 +71,7 @@ install: install-dev: $(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)]) -ifeq (OCR,$(findstring OCR, $(EXTRAS))) 
-deps-test: $(OCR_MODELNAME) -endif -deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME) +deps-test: $(EYNOLLAH_MODELS_DIR) $(PIP) install -r requirements-test.txt -ifeq (OCR,$(findstring OCR, $(EXTRAS))) - ln -rs $(OCR_MODELNAME)/* $(SEG_MODELNAME)/ -endif smoke-test: TMPDIR != mktemp -d diff --git a/src/eynollah/cli_models.py b/src/eynollah/cli_models.py index 595c499..a299d19 100644 --- a/src/eynollah/cli_models.py +++ b/src/eynollah/cli_models.py @@ -85,9 +85,9 @@ def package( copies.add((src, dist_dir)) mkdirs.add(dist_dir) for dir in mkdirs: - print(f"mkdir -p {dir}") + print(f"mkdir -vp {dir}") for (src, dst) in copies: - print(f"cp -r {src} {dst}") + print(f"cp -vr {src} {dst}") for dir in mkdirs: zip_path = Path(f'../{dir.parent.name}.zip') - print(f"(cd {dir}/..; zip -r {zip_path} models_eynollah)") + print(f"(cd {dir}/..; zip -vr {zip_path} models_eynollah)") diff --git a/src/eynollah/model_zoo/default_specs.py b/src/eynollah/model_zoo/default_specs.py index e06c829..a57f7f1 100644 --- a/src/eynollah/model_zoo/default_specs.py +++ b/src/eynollah/model_zoo/default_specs.py @@ -14,7 +14,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([ category="enhancement", variant='', filename="models_eynollah/eynollah-enhancement_20210425", - dists=['enhancement', 'layout'], + dists=['enhancement', 'layout', 'ci'], dist_url=dist_url("enhancement"), type=KerasModel, ), @@ -23,7 +23,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([ category="binarization", variant='', filename="models_eynollah/eynollah-binarization-hybrid_20230504", - dists=['layout', 'binarization'], + dists=['layout', 'binarization', ], dist_url=dist_url("binarization"), type=KerasModel, ), diff --git a/tests/test_run.py b/tests/test_run.py index a410d34..359b0f0 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -16,114 +16,13 @@ from ocrd_models.constants import NAMESPACES as NS testdir = Path(__file__).parent.resolve() MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 
'models_layout_v0_6_0').resolve())) MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_6_0').resolve())) MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve())) def only_eynollah(logrec): return logrec.name.startswith('eynollah') -@pytest.mark.parametrize( - "options", - [ - [], # defaults - #["--allow_scaling", "--curved-line"], - ["--allow_scaling", "--curved-line", "--full-layout"], - ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"], - ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based", - "--textline_light", "--light_version"], - # -ep ... - # -eoi ... - # FIXME: find out whether OCR extra was installed, otherwise skip these - ["--do_ocr"], - ["--do_ocr", "--light_version", "--textline_light"], - ["--do_ocr", "--transformer_ocr"], - #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"], - ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"], - # --skip_layout_and_reading_order - ], ids=str) -def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') - outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml' - args = [ - '-m', MODELS_LAYOUT, - '-i', str(infile), - '-o', str(outfile.parent), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(layout_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert str(infile) in logmsgs - assert outfile.exists() - tree = page_from_file(str(outfile)).etree - regions = tree.xpath("//page:TextRegion", namespaces=NS) - assert len(regions) >= 2, "result is inaccurate" - regions = 
tree.xpath("//page:SeparatorRegion", namespaces=NS) - assert len(regions) >= 2, "result is inaccurate" - lines = tree.xpath("//page:TextLine", namespaces=NS) - assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line - -@pytest.mark.parametrize( - "options", - [ - ["--tables"], - ["--tables", "--full-layout"], - ["--tables", "--full-layout", "--textline_light", "--light_version"], - ], ids=str) -def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options): - infile = testdir.joinpath('resources/euler_rechenkunst01_1738_0025.tif') - outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml' - args = [ - '-m', MODELS_LAYOUT, - '-i', str(infile), - '-o', str(outfile.parent), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(layout_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert str(infile) in logmsgs - assert outfile.exists() - tree = page_from_file(str(outfile)).etree - regions = tree.xpath("//page:TextRegion", namespaces=NS) - assert len(regions) >= 2, "result is inaccurate" - regions = tree.xpath("//page:TableRegion", namespaces=NS) - # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP - assert len(regions) >= 1, "result is inaccurate" - regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) - assert len(regions) >= 2, "result is inaccurate" - lines = tree.xpath("//page:TextLine", namespaces=NS) - assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line - -def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog): - indir = testdir.joinpath('resources') - outdir = tmp_path - args = [ - '-m', MODELS_LAYOUT, - '-di', str(indir), - 
'-o', str(outdir), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(layout_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2 - assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in')) - assert len(list(outdir.iterdir())) == 2 - @pytest.mark.parametrize( "options", [ diff --git a/tests/test_run_layout.py b/tests/test_run_layout.py new file mode 100644 index 0000000..29cebc4 --- /dev/null +++ b/tests/test_run_layout.py @@ -0,0 +1,332 @@ +from os import environ +from pathlib import Path +import pytest +import logging +from PIL import Image +from eynollah.cli import ( + layout as layout_cli, + binarization as binarization_cli, + enhancement as enhancement_cli, + machine_based_reading_order as mbreorder_cli, + ocr as ocr_cli, +) +from click.testing import CliRunner +from ocrd_modelfactory import page_from_file +from ocrd_models.constants import NAMESPACES as NS + +testdir = Path(__file__).parent.resolve() + +MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_6_0').resolve())) +MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_6_0').resolve())) +MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve())) + +def only_eynollah(logrec): + return logrec.name.startswith('eynollah') + +@pytest.mark.parametrize( + "options", + [ + [], # defaults + #["--allow_scaling", "--curved-line"], + ["--allow_scaling", "--curved-line", "--full-layout"], + ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"], + ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based", + "--textline_light", "--light_version"], + # -ep ... + # -eoi ... 
+ # FIXME: find out whether OCR extra was installed, otherwise skip these + ["--do_ocr"], + ["--do_ocr", "--light_version", "--textline_light"], + ["--do_ocr", "--transformer_ocr"], + #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"], + ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"], + # --skip_layout_and_reading_order + ], ids=str) +def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options): + infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') + outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml' + args = [ + '-m', MODELS_LAYOUT, + '-i', str(infile), + '-o', str(outfile.parent), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(layout_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert str(infile) in logmsgs + assert outfile.exists() + tree = page_from_file(str(outfile)).etree + regions = tree.xpath("//page:TextRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + lines = tree.xpath("//page:TextLine", namespaces=NS) + assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line + +@pytest.mark.parametrize( + "options", + [ + ["--tables"], + ["--tables", "--full-layout"], + ["--tables", "--full-layout", "--textline_light", "--light_version"], + ], ids=str) +def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options): + infile = testdir.joinpath('resources/euler_rechenkunst01_1738_0025.tif') + outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml' + args = [ + '-m', MODELS_LAYOUT, + '-i', str(infile), + '-o', 
str(outfile.parent), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(layout_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert str(infile) in logmsgs + assert outfile.exists() + tree = page_from_file(str(outfile)).etree + regions = tree.xpath("//page:TextRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + regions = tree.xpath("//page:TableRegion", namespaces=NS) + # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP + assert len(regions) >= 1, "result is inaccurate" + regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + lines = tree.xpath("//page:TextLine", namespaces=NS) + assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line + +def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog): + indir = testdir.joinpath('resources') + outdir = tmp_path + args = [ + '-m', MODELS_LAYOUT, + '-di', str(indir), + '-o', str(outdir), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(layout_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2 + assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in')) + assert len(list(outdir.iterdir())) == 2 + +@pytest.mark.parametrize( + "options", + [ + [], # defaults + ["--no-patches"], + ], ids=str) +def 
test_run_eynollah_binarization_filename(tmp_path, pytestconfig, caplog, options): + infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') + args = [ + '-m', MODELS_BIN, + '-i', str(infile), + '-o', str(outfile), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(binarization_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting')) + assert outfile.exists() + with Image.open(infile) as original_img: + original_size = original_img.size + with Image.open(outfile) as binarized_img: + binarized_size = binarized_img.size + assert original_size == binarized_size + +def test_run_eynollah_binarization_directory(tmp_path, pytestconfig, caplog): + indir = testdir.joinpath('resources') + outdir = tmp_path + args = [ + '-m', MODELS_BIN, + '-di', str(indir), + '-o', str(outdir), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(binarization_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2 + assert len(list(outdir.iterdir())) == 2 + +@pytest.mark.parametrize( + "options", + [ + [], # defaults + ["-sos"], + ], ids=str) +def test_run_eynollah_enhancement_filename(tmp_path, pytestconfig, caplog, options): + infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') + args = [ + '-m', MODELS_LAYOUT, + 
'-i', str(infile), + '-o', str(outfile.parent), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs + assert outfile.exists() + with Image.open(infile) as original_img: + original_size = original_img.size + with Image.open(outfile) as enhanced_img: + enhanced_size = enhanced_img.size + assert (original_size == enhanced_size) == ("-sos" in options) + +def test_run_eynollah_enhancement_directory(tmp_path, pytestconfig, caplog): + indir = testdir.joinpath('resources') + outdir = tmp_path + args = [ + '-m', MODELS_LAYOUT, + '-di', str(indir), + '-o', str(outdir), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(enhancement_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2 + assert len(list(outdir.iterdir())) == 2 + +def test_run_eynollah_mbreorder_filename(tmp_path, pytestconfig, caplog): + infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml') + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') + args = [ + '-m', MODELS_LAYOUT, + '-i', str(infile), + '-o', str(outfile.parent), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(mbreorder_cli, args, catch_exceptions=False) + 
assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + # FIXME: mbreorder has no logging! + #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs + assert outfile.exists() + #in_tree = page_from_file(str(infile)).etree + #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) + out_tree = page_from_file(str(outfile)).etree + out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) + #assert len(out_order) >= 2, "result is inaccurate" + #assert in_order != out_order + assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3'] + +def test_run_eynollah_mbreorder_directory(tmp_path, pytestconfig, caplog): + indir = testdir.joinpath('resources') + outdir = tmp_path + args = [ + '-m', MODELS_LAYOUT, + '-di', str(indir), + '-o', str(outdir), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(mbreorder_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + # FIXME: mbreorder has no logging! 
+ #assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2 + assert len(list(outdir.iterdir())) == 2 + +@pytest.mark.parametrize( + "options", + [ + [], # defaults + ["-doit", #str(outrenderfile.parent)], + ], + ["-trocr"], + ], ids=str) +def test_run_eynollah_ocr_filename(tmp_path, pytestconfig, caplog, options): + infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') + outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png') + outrenderfile.parent.mkdir() + args = [ + '-m', MODELS_OCR, + '-i', str(infile), + '-dx', str(infile.parent), + '-o', str(outfile.parent), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.DEBUG) + runner = CliRunner() + if "-doit" in options: + options.insert(options.index("-doit") + 1, str(outrenderfile.parent)) + with caplog.filtering(only_eynollah): + result = runner.invoke(ocr_cli, args + options, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + # FIXME: ocr has no logging! 
+ #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs + assert outfile.exists() + if "-doit" in options: + assert outrenderfile.exists() + #in_tree = page_from_file(str(infile)).etree + #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) + out_tree = page_from_file(str(outfile)).etree + out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS) + assert len(out_texts) >= 2, ("result is inaccurate", out_texts) + assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts) + +def test_run_eynollah_ocr_directory(tmp_path, pytestconfig, caplog): + indir = testdir.joinpath('resources') + outdir = tmp_path + args = [ + '-m', MODELS_OCR, + '-di', str(indir), + '-dx', str(indir), + '-o', str(outdir), + ] + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(only_eynollah): + result = runner.invoke(ocr_cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + logmsgs = [logrec.message for logrec in caplog.records] + # FIXME: ocr has no logging! + #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs + assert len(list(outdir.iterdir())) == 2