from os import environ from pathlib import Path import logging from PIL import Image from eynollah.cli import ( layout as layout_cli, binarization as binarization_cli, enhancement as enhancement_cli, machine_based_reading_order as mbreorder_cli, ocr as ocr_cli, ) from click.testing import CliRunner from ocrd_modelfactory import page_from_file from ocrd_models.constants import NAMESPACES as NS testdir = Path(__file__).parent.resolve() MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_5_0').resolve())) MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_5_0').resolve())) MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve())) def test_run_eynollah_layout_filename(tmp_path, subtests, pytestconfig, caplog): infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml' args = [ '-m', MODELS_LAYOUT, '-i', str(infile), '-o', str(outfile.parent), # subtests write to same location '--overwrite', ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'eynollah' runner = CliRunner() for options in [ [], # defaults ["--allow_scaling", "--curved-line"], ["--allow_scaling", "--curved-line", "--full-layout"], ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"], ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based", "--textline_light", "--light_version"], # -ep ... # -eoi ... # --do_ocr # --skip_layout_and_reading_order ]: with subtests.test(#msg="test CLI", options=options): with caplog.filtering(only_eynollah): result = runner.invoke(layout_cli, args + options, catch_exceptions=False) assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] assert str(infile) in logmsgs assert outfile.exists() tree = page_from_file(str(outfile)).etree regions = tree.xpath("//page:TextRegion", namespaces=NS) assert len(regions) >= 2, "result is inaccurate" regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) assert len(regions) >= 2, "result is inaccurate" lines = tree.xpath("//page:TextLine", namespaces=NS) assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path args = [ '-m', MODELS_LAYOUT, '-di', str(indir), '-o', str(outdir), ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'eynollah' runner = CliRunner() with caplog.filtering(only_eynollah): result = runner.invoke(layout_cli, args, catch_exceptions=False) assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2 assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in')) assert len(list(outdir.iterdir())) == 2 def test_run_eynollah_binarization_filename(tmp_path, subtests, pytestconfig, caplog): infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') args = [ '-m', MODELS_BIN, '-i', str(infile), '-o', str(outfile), ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'SbbBinarizer' runner = CliRunner() for options in [ [], # defaults ["--no-patches"], ]: with subtests.test(#msg="test CLI", options=options): with caplog.filtering(only_eynollah): result = runner.invoke(binarization_cli, args + options, catch_exceptions=False) assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting')) assert outfile.exists() with Image.open(infile) as original_img: original_size = original_img.size with Image.open(outfile) as binarized_img: binarized_size = binarized_img.size assert original_size == binarized_size def test_run_eynollah_binarization_directory(tmp_path, subtests, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path args = [ '-m', MODELS_BIN, '-di', str(indir), '-o', str(outdir), ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'SbbBinarizer' runner = CliRunner() with caplog.filtering(only_eynollah): result = runner.invoke(binarization_cli, args, catch_exceptions=False) assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2 assert len(list(outdir.iterdir())) == 2 def test_run_eynollah_enhancement_filename(tmp_path, subtests, pytestconfig, caplog): infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') args = [ '-m', MODELS_LAYOUT, '-i', str(infile), '-o', str(outfile.parent), # subtests write to same location '--overwrite', ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'enhancement' runner = CliRunner() for options in [ [], # defaults ["-sos"], ]: with subtests.test(#msg="test CLI", options=options): with caplog.filtering(only_eynollah): result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False) assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs assert outfile.exists() with Image.open(infile) as original_img: original_size = original_img.size with Image.open(outfile) as enhanced_img: enhanced_size = enhanced_img.size assert (original_size == enhanced_size) == ("-sos" in options) def test_run_eynollah_enhancement_directory(tmp_path, subtests, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path args = [ '-m', MODELS_LAYOUT, '-di', str(indir), '-o', str(outdir), ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'enhancement' runner = CliRunner() with caplog.filtering(only_eynollah): result = runner.invoke(enhancement_cli, args, catch_exceptions=False) assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2 assert len(list(outdir.iterdir())) == 2 def test_run_eynollah_mbreorder_filename(tmp_path, subtests, pytestconfig, caplog): infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml') outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') args = [ '-m', MODELS_LAYOUT, '-i', str(infile), '-o', str(outfile.parent), ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'mbreorder' runner = CliRunner() with caplog.filtering(only_eynollah): result = runner.invoke(mbreorder_cli, args, catch_exceptions=False) assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] # FIXME: mbreorder has no logging! #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs assert outfile.exists() #in_tree = page_from_file(str(infile)).etree #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) out_tree = page_from_file(str(outfile)).etree out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) #assert len(out_order) >= 2, "result is inaccurate" #assert in_order != out_order assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3'] def test_run_eynollah_mbreorder_directory(tmp_path, subtests, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path args = [ '-m', MODELS_LAYOUT, '-di', str(indir), '-o', str(outdir), ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'mbreorder' runner = CliRunner() with caplog.filtering(only_eynollah): result = runner.invoke(mbreorder_cli, args, catch_exceptions=False) assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] # FIXME: mbreorder has no logging! #assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2 assert len(list(outdir.iterdir())) == 2 def test_run_eynollah_ocr_filename(tmp_path, subtests, pytestconfig, caplog): infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png') outrenderfile.parent.mkdir() args = [ '-m', MODELS_OCR, '-i', str(infile), '-dx', str(infile.parent), '-o', str(outfile.parent), # subtests write to same location '--overwrite', ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) caplog.set_level(logging.DEBUG) def only_eynollah(logrec): return logrec.name == 'eynollah' runner = CliRunner() for options in [ [], # defaults ["-doit", str(outrenderfile.parent)], ["-trocr"], ]: with subtests.test(#msg="test CLI", options=options): with caplog.filtering(only_eynollah): result = runner.invoke(ocr_cli, args + options, catch_exceptions=False) assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] # FIXME: ocr has no logging! #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs assert outfile.exists() if "-doit" in options: assert outrenderfile.exists() #in_tree = page_from_file(str(infile)).etree #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) out_tree = page_from_file(str(outfile)).etree out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS) assert len(out_texts) >= 2, ("result is inaccurate", out_texts) assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts) def test_run_eynollah_ocr_directory(tmp_path, subtests, pytestconfig, caplog): indir = testdir.joinpath('resources') outdir = tmp_path args = [ '-m', MODELS_OCR, '-di', str(indir), '-dx', str(indir), '-o', str(outdir), ] if pytestconfig.getoption('verbose') > 0: args.extend(['-l', 'DEBUG']) caplog.set_level(logging.INFO) def only_eynollah(logrec): return logrec.name == 'eynollah' runner = CliRunner() with caplog.filtering(only_eynollah): result = runner.invoke(ocr_cli, args, catch_exceptions=False) assert result.exit_code == 0, result.stdout logmsgs = [logrec.message for logrec in caplog.records] # FIXME: ocr has no logging! #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs assert len(list(outdir.iterdir())) == 2