From b6f82c72b9025d2663baa76c6ddf70d225c4da3b Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 29 Oct 2025 16:20:30 +0100 Subject: [PATCH] refactor cli tests --- src/eynollah/eynollah_ocr.py | 42 ++-- src/eynollah/image_enhancer.py | 17 +- src/eynollah/model_zoo/model_zoo.py | 5 +- src/eynollah/sbb_binarize.py | 7 +- tests/__init__.py | 0 tests/cli_tests/conftest.py | 36 +++ tests/cli_tests/test_binarization.py | 58 +++++ tests/cli_tests/test_enhance.py | 57 +++++ tests/cli_tests/test_layout.py | 109 +++++++++ tests/cli_tests/test_mbreorder.py | 53 +++++ tests/cli_tests/test_ocr.py | 67 ++++++ tests/cli_tests/test_run.py | 10 + tests/conftest.py | 25 ++ tests/test_run.py | 229 ------------------- tests/test_run_layout.py | 330 --------------------------- 15 files changed, 453 insertions(+), 592 deletions(-) delete mode 100644 tests/__init__.py create mode 100644 tests/cli_tests/conftest.py create mode 100644 tests/cli_tests/test_binarization.py create mode 100644 tests/cli_tests/test_enhance.py create mode 100644 tests/cli_tests/test_layout.py create mode 100644 tests/cli_tests/test_mbreorder.py create mode 100644 tests/cli_tests/test_ocr.py create mode 100644 tests/cli_tests/test_run.py create mode 100644 tests/conftest.py delete mode 100644 tests/test_run.py delete mode 100644 tests/test_run_layout.py diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py index 41643de..3aafd8e 100644 --- a/src/eynollah/eynollah_ocr.py +++ b/src/eynollah/eynollah_ocr.py @@ -59,7 +59,7 @@ class Eynollah_ocr: export_textline_images_and_text: bool=False, do_not_mask_with_textline_contour: bool=False, pref_of_dataset=None, - min_conf_value_of_textline_text : float=0.3, + min_conf_value_of_textline_text : Optional[float]=None, logger: Optional[Logger]=None, ): self.tr_ocr = tr_ocr @@ -69,7 +69,7 @@ class Eynollah_ocr: self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour # prefix or dataset self.pref_of_dataset = pref_of_dataset - self.logger = logger if logger else getLogger('eynollah') + self.logger = logger if logger else getLogger('eynollah.ocr') self.model_zoo = EynollahModelZoo(basedir=dir_models) # TODO: Properly document what 'export_textline_images_and_text' is about @@ -77,21 +77,15 @@ class Eynollah_ocr: self.logger.info("export_textline_images_and_text was set, so no actual models are loaded") return - self.min_conf_value_of_textline_text = min_conf_value_of_textline_text + self.min_conf_value_of_textline_text = min_conf_value_of_textline_text if min_conf_value_of_textline_text else 0.3 self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size if tr_ocr: - self.model_zoo.load_model('trocr_processor', '') - if model_name: - self.model_zoo.load_model('ocr', 'tr', model_name) - else: - self.model_zoo.load_model('ocr', 'tr') + self.model_zoo.load_model('trocr_processor') + self.model_zoo.load_model('ocr', 'tr', model_path_override=model_name) self.model_zoo.get('ocr').to(self.device) else: - if model_name: - self.model_zoo.load_model('ocr', '', model_name) - else: - self.model_zoo.load_model('ocr', '') + self.model_zoo.load_model('ocr', '', model_path_override=model_name) self.model_zoo.load_model('num_to_char') self.end_character = len(self.model_zoo.load_model('characters')) + 2 @@ -206,10 +200,10 @@ class Eynollah_ocr: cropped_lines = [] indexer_b_s = 0 - pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values + pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_zoo.get('ocr').generate( pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('processor').batch_decode( + generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged @@ -229,10 +223,10 @@ class Eynollah_ocr: cropped_lines = [] indexer_b_s = 0 - pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values + pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_zoo.get('ocr').generate( pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('processor').batch_decode( + generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged @@ -249,10 +243,10 @@ class Eynollah_ocr: cropped_lines = [] indexer_b_s = 0 - pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values + pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_zoo.get('ocr').generate( pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('processor').batch_decode( + generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged @@ -267,10 +261,10 @@ class Eynollah_ocr: cropped_lines = [] indexer_b_s = 0 - pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values + pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_zoo.get('ocr').generate( pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('processor').batch_decode( + generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged @@ -284,9 +278,9 @@ class Eynollah_ocr: cropped_lines = [] indexer_b_s = 0 - pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values + pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_zoo.get('ocr').generate(pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('processor').batch_decode(generated_ids_merged, skip_special_tokens=True) + generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged @@ -301,10 +295,10 @@ class Eynollah_ocr: ####n_start = i*self.b_s ####n_end = (i+1)*self.b_s ####imgs = cropped_lines[n_start:n_end] - ####pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values + ####pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values ####generated_ids_merged = self.model_ocr.generate( #### pixel_values_merged.to(self.device)) - ####generated_text_merged = self.model_zoo.get('processor').batch_decode( + ####generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( #### generated_ids_merged, skip_special_tokens=True) ####extracted_texts = extracted_texts + generated_text_merged diff --git a/src/eynollah/image_enhancer.py b/src/eynollah/image_enhancer.py index cec8877..74b4865 100644 --- a/src/eynollah/image_enhancer.py +++ b/src/eynollah/image_enhancer.py @@ -50,7 +50,7 @@ class Enhancer: else: self.num_col_lower = num_col_lower - self.logger = logger if logger else getLogger('enhancement') + self.logger = logger if logger else getLogger('eynollah.enhance') self.model_zoo = EynollahModelZoo(basedir=dir_models) for v in ['binarization', 'enhancement', 'col_classifier', 'page']: self.model_zoo.load_model(v) @@ -142,7 +142,7 @@ class Enhancer: index_y_d = img_h - img_height_model img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :] - label_p_pred = self.model_zoo.get('enhancement', Model).predict(img_patch, verbose=0) + label_p_pred = self.model_zoo.get('enhancement', Model).predict(img_patch, verbose='0') seg = label_p_pred[0, :, :, :] * 255 if i == 0 and j == 0: @@ -667,7 +667,7 @@ class Enhancer: t0 = time.time() img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(light_version=False) - return img_res + return img_res, is_image_enhanced def run(self, @@ -705,9 +705,18 @@ class Enhancer: self.logger.warning("will skip input for existing output file '%s'", self.output_filename) continue - image_enhanced = self.run_single() + did_resize = False + image_enhanced, did_enhance = self.run_single() if self.save_org_scale: image_enhanced = resize_image(image_enhanced, self.h_org, self.w_org) + did_resize = True + + self.logger.info( + "Image %s was %senhanced%s.", + img_filename, + '' if did_enhance else 'not ', + 'and resized' if did_resize else '' + ) cv2.imwrite(self.output_filename, image_enhanced) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index dada98f..32fdd0e 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -84,10 +84,13 @@ class EynollahModelZoo: self, model_category: str, model_variant: str = '', + model_path_override: Optional[str] = None, ) -> AnyModel: """ Load any model """ + if model_path_override: + self.override_models((model_category, model_variant, model_path_override)) model_path = self.model_path(model_category, model_variant) if model_path.suffix == '.h5' and Path(model_path.stem).exists(): # prefer SavedModel over HDF5 format if it exists @@ -183,5 +186,5 @@ class EynollahModelZoo: Ensure that a loaded models is not referenced by ``self._loaded`` anymore """ if hasattr(self, '_loaded') and getattr(self, '_loaded'): - for needle in self._loaded.keys(): + for needle in list(self._loaded.keys()): del self._loaded[needle] diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index da165ea..1bcf9d9 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -322,8 +322,7 @@ class SbbBinarizer: image = cv2.imread(image_path) img_last = 0 for n, (model_file, model) in enumerate(self.models.items()): - self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.models.keys()))) - + self.log.info('Predicting %s with model %s [%s/%s]', image_path if image_path else '[image]', model_file, n + 1, len(self.models.keys())) res = self.predict(model, image, use_patches) img_fin = np.zeros((res.shape[0], res.shape[1], 3)) @@ -348,11 +347,11 @@ class SbbBinarizer: ls_imgs = list(filter(is_image_filename, os.listdir(dir_in))) for image_name in ls_imgs: image_stem = image_name.split('.')[0] - print(image_name,'image_name') + # print(image_name,'image_name') image = cv2.imread(os.path.join(dir_in,image_name) ) img_last = 0 for n, (model_file, model) in enumerate(self.models.items()): - self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.models.keys()))) + self.log.info('Predicting %s with model %s [%s/%s]', image_name, model_file, n + 1, len(self.models.keys())) res = self.predict(model, image, use_patches) diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/cli_tests/conftest.py b/tests/cli_tests/conftest.py new file mode 100644 index 0000000..c54f47b --- /dev/null +++ b/tests/cli_tests/conftest.py @@ -0,0 +1,36 @@ +from typing import List +from click import Command +import pytest +import logging + +from click.testing import CliRunner, Result + +@pytest.fixture +def run_eynollah_ok_and_check_logs( + pytestconfig, + caplog, + model_dir, + eynollah_log_filter, +): + """ + Generates a Click Runner for `cli`, injects model_path and logging level + to `args`, runs the command and checks whether the logs generated contain + every fragment in `expected_logs` + """ + + def _run_click_ok_logs(cli: Command, args: List[str], expected_logs: List[str]) -> Result: + args = ['-m', model_dir] + args + if pytestconfig.getoption('verbose') > 0: + args.extend(['-l', 'DEBUG']) + caplog.set_level(logging.INFO) + runner = CliRunner() + with caplog.filtering(eynollah_log_filter): + result = runner.invoke(cli, args, catch_exceptions=False) + assert result.exit_code == 0, result.stdout + if expected_logs: + logmsgs = [logrec.message for logrec in caplog.records] + assert any(logmsg.startswith(needle) for needle in expected_logs for logmsg in logmsgs), f'{expected_logs} not in {logmsgs}' + return result + + return _run_click_ok_logs + diff --git a/tests/cli_tests/test_binarization.py b/tests/cli_tests/test_binarization.py new file mode 100644 index 0000000..4672a4f --- /dev/null +++ b/tests/cli_tests/test_binarization.py @@ -0,0 +1,58 @@ +import pytest +from PIL import Image +from eynollah.cli import ( + binarization as binarization_cli, +) +from ocrd_modelfactory import page_from_file +from ocrd_models.constants import NAMESPACES as NS + +@pytest.mark.parametrize( + "options", + [ + [], # defaults + ["--no-patches"], + ], ids=str) +def test_run_eynollah_binarization_filename( + tmp_path, + run_eynollah_ok_and_check_logs, + tests_dir, + options, +): + infile = tests_dir.joinpath('resources/kant_aufklaerung_1784_0020.tif') + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') + run_eynollah_ok_and_check_logs( + binarization_cli, + [ + '-i', str(infile), + '-o', str(outfile), + ] + options, + [ + 'Predicting' + ] + ) + assert outfile.exists() + with Image.open(infile) as original_img: + original_size = original_img.size + with Image.open(outfile) as binarized_img: + binarized_size = binarized_img.size + assert original_size == binarized_size + +def test_run_eynollah_binarization_directory( + tmp_path, + run_eynollah_ok_and_check_logs, + resources_dir, + image_resources, +): + outdir = tmp_path + run_eynollah_ok_and_check_logs( + binarization_cli, + [ + '-di', str(resources_dir), + '-o', str(outdir), + ], + [ + f'Predicting {image_resources[0].name}', + f'Predicting {image_resources[1].name}', + ] + ) + assert len(list(outdir.iterdir())) == 2 diff --git a/tests/cli_tests/test_enhance.py b/tests/cli_tests/test_enhance.py new file mode 100644 index 0000000..590c07f --- /dev/null +++ b/tests/cli_tests/test_enhance.py @@ -0,0 +1,57 @@ +import pytest +from PIL import Image +from eynollah.cli import ( + enhancement as enhancement_cli, +) +from ocrd_modelfactory import page_from_file +from ocrd_models.constants import NAMESPACES as NS + +@pytest.mark.parametrize( + "options", + [ + [], # defaults + ["-sos"], + ], ids=str) +def test_run_eynollah_enhancement_filename( + tmp_path, + resources_dir, + run_eynollah_ok_and_check_logs, + options, +): + infile = resources_dir / 'kant_aufklaerung_1784_0020.tif' + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') + run_eynollah_ok_and_check_logs( + enhancement_cli, + [ + '-i', str(infile), + '-o', str(outfile.parent), + ] + options, + [ + 'Image was enhanced', + ] + ) + with Image.open(infile) as original_img: + original_size = original_img.size + with Image.open(outfile) as enhanced_img: + enhanced_size = enhanced_img.size + assert (original_size == enhanced_size) == ("-sos" in options) + +def test_run_eynollah_enhancement_directory( + tmp_path, + resources_dir, + image_resources, + run_eynollah_ok_and_check_logs, +): + outdir = tmp_path + run_eynollah_ok_and_check_logs( + enhancement_cli, + [ + '-di', str(resources_dir), + '-o', str(outdir), + ], + [ + f'Image {image_resources[0]} was enhanced', + f'Image {image_resources[1]} was enhanced', + ] + ) + assert len(list(outdir.iterdir())) == 2 diff --git a/tests/cli_tests/test_layout.py b/tests/cli_tests/test_layout.py new file mode 100644 index 0000000..db7b88c --- /dev/null +++ b/tests/cli_tests/test_layout.py @@ -0,0 +1,109 @@ +import pytest +from eynollah.cli import ( + layout as layout_cli, +) +from ocrd_modelfactory import page_from_file +from ocrd_models.constants import NAMESPACES as NS + +@pytest.mark.parametrize( + "options", + [ + [], # defaults + #["--allow_scaling", "--curved-line"], + ["--allow_scaling", "--curved-line", "--full-layout"], + ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"], + ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based", + "--textline_light", "--light_version"], + # -ep ... + # -eoi ... + # FIXME: find out whether OCR extra was installed, otherwise skip these + ["--do_ocr"], + ["--do_ocr", "--light_version", "--textline_light"], + ["--do_ocr", "--transformer_ocr"], + #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"], + ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"], + # --skip_layout_and_reading_order + ], ids=str) +def test_run_eynollah_layout_filename( + tmp_path, + run_eynollah_ok_and_check_logs, + resources_dir, + options, +): + outdir = tmp_path + infile = resources_dir / 'kant_aufklaerung_1784_0020.tif' + outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml' + run_eynollah_ok_and_check_logs( + layout_cli, + [ + '-i', str(infile), + '-o', str(outfile.parent), + ] + options, + [ + str(infile) + ] + ) + assert outfile.exists() + tree = page_from_file(str(outfile)).etree + regions = tree.xpath("//page:TextRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + lines = tree.xpath("//page:TextLine", namespaces=NS) + assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line + +@pytest.mark.parametrize( + "options", + [ + ["--tables"], + ["--tables", "--full-layout"], + ["--tables", "--full-layout", "--textline_light", "--light_version"], + ], ids=str) +def test_run_eynollah_layout_filename2( + tmp_path, + resources_dir, + run_eynollah_ok_and_check_logs, + options, +): + infile = resources_dir / 'euler_rechenkunst01_1738_0025.tif' + outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml' + run_eynollah_ok_and_check_logs( + layout_cli, + [ + '-i', str(infile), + '-o', str(outfile.parent), + ] + options, + [ + infile + ] + ) + assert outfile.exists() + tree = page_from_file(str(outfile)).etree + regions = tree.xpath("//page:TextRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + regions = tree.xpath("//page:TableRegion", namespaces=NS) + # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP + assert len(regions) >= 1, "result is inaccurate" + regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) + assert len(regions) >= 2, "result is inaccurate" + lines = tree.xpath("//page:TextLine", namespaces=NS) + assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line + +def test_run_eynollah_layout_directory( + tmp_path, + resources_dir, + run_eynollah_ok_and_check_logs, +): + outdir = tmp_path + run_eynollah_ok_and_check_logs( + layout_cli, + [ + '-di', str(resources_dir), + '-o', str(outdir), + ], + [ + 'Job done in', + 'All jobs done in', + ] + ) + assert len(list(outdir.iterdir())) == 2 diff --git a/tests/cli_tests/test_mbreorder.py b/tests/cli_tests/test_mbreorder.py new file mode 100644 index 0000000..7fb246d --- /dev/null +++ b/tests/cli_tests/test_mbreorder.py @@ -0,0 +1,53 @@ +from ocrd_modelfactory import page_from_file +from ocrd_models.constants import NAMESPACES as NS + +from eynollah.cli import ( + machine_based_reading_order as mbreorder_cli, +) + + +def test_run_eynollah_mbreorder_filename( + tmp_path, + resources_dir, + run_eynollah_ok_and_check_logs, +): + infile = resources_dir / 'kant_aufklaerung_1784_0020.xml' + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') + run_eynollah_ok_and_check_logs( + mbreorder_cli, + [ + '-i', str(infile), + '-o', str(outfile.parent), + ], + [ + # FIXME: mbreorder has no logging! + ] + ) + assert outfile.exists() + #in_tree = page_from_file(str(infile)).etree + #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) + out_tree = page_from_file(str(outfile)).etree + out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) + #assert len(out_order) >= 2, "result is inaccurate" + #assert in_order != out_order + assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3'] + +def test_run_eynollah_mbreorder_directory( + tmp_path, + resources_dir, + run_eynollah_ok_and_check_logs, +): + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') + outdir = tmp_path + run_eynollah_ok_and_check_logs( + mbreorder_cli, + [ + '-di', str(resources_dir), + '-o', str(outdir), + ], + [ + # FIXME: mbreorder has no logging! + ] + ) + assert len(list(outdir.iterdir())) == 2 + diff --git a/tests/cli_tests/test_ocr.py b/tests/cli_tests/test_ocr.py new file mode 100644 index 0000000..747d978 --- /dev/null +++ b/tests/cli_tests/test_ocr.py @@ -0,0 +1,67 @@ +import pytest +from eynollah.cli import ( + ocr as ocr_cli, +) +from ocrd_modelfactory import page_from_file +from ocrd_models.constants import NAMESPACES as NS + +@pytest.mark.parametrize( + "options", + [ + [], # defaults + ["-doit", #str(outrenderfile.parent)], + ], + ["-trocr"], + ], ids=str) +def test_run_eynollah_ocr_filename( + tmp_path, + run_eynollah_ok_and_check_logs, + resources_dir, + options, +): + infile = resources_dir / 'kant_aufklaerung_1784_0020.tif' + outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') + outrenderfile = tmp_path / 'render' / 'kant_aufklaerung_1784_0020.png' + outrenderfile.parent.mkdir() + if "-doit" in options: + options.insert(options.index("-doit") + 1, str(outrenderfile.parent)) + run_eynollah_ok_and_check_logs( + ocr_cli, + [ + '-i', str(infile), + '-dx', str(infile.parent), + '-o', str(outfile.parent), + ] + options, + [ + # FIXME: ocr has no logging! + ] + ) + assert outfile.exists() + if "-doit" in options: + assert outrenderfile.exists() + #in_tree = page_from_file(str(infile)).etree + #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) + out_tree = page_from_file(str(outfile)).etree + out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS) + assert len(out_texts) >= 2, ("result is inaccurate", out_texts) + assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts) + +def test_run_eynollah_ocr_directory( + tmp_path, + run_eynollah_ok_and_check_logs, + resources_dir, +): + outdir = tmp_path + run_eynollah_ok_and_check_logs( + ocr_cli, + [ + '-di', str(resources_dir), + '-dx', str(resources_dir), + '-o', str(outdir), + ], + [ + # FIXME: ocr has no logging! + ] + ) + assert len(list(outdir.iterdir())) == 2 + diff --git a/tests/cli_tests/test_run.py b/tests/cli_tests/test_run.py new file mode 100644 index 0000000..122bab5 --- /dev/null +++ b/tests/cli_tests/test_run.py @@ -0,0 +1,10 @@ +import pytest +from PIL import Image +from eynollah.cli import ( + layout as layout_cli, + binarization as binarization_cli, + enhancement as enhancement_cli, +) +from ocrd_modelfactory import page_from_file +from ocrd_models.constants import NAMESPACES as NS + diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..e73d0e3 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,25 @@ +from glob import glob +import os +import pytest +from pathlib import Path + + +@pytest.fixture() +def tests_dir(): + return Path(__file__).parent.resolve() + +@pytest.fixture() +def model_dir(tests_dir): + return os.environ.get('EYNOLLAH_MODELS_DIR', str(tests_dir.joinpath('..').resolve())) + +@pytest.fixture() +def resources_dir(tests_dir): + return tests_dir / 'resources' + +@pytest.fixture() +def image_resources(resources_dir): + return [Path(x) for x in glob(str(resources_dir / '*.tif'))] + +@pytest.fixture() +def eynollah_log_filter(): + return lambda logrec: logrec.name.startswith('eynollah') diff --git a/tests/test_run.py b/tests/test_run.py deleted file mode 100644 index 6d97fbb..0000000 --- a/tests/test_run.py +++ /dev/null @@ -1,229 +0,0 @@ -from os import environ -from pathlib import Path -import pytest -import logging -from PIL import Image -from eynollah.cli import ( - layout as layout_cli, - binarization as binarization_cli, - enhancement as enhancement_cli, - machine_based_reading_order as mbreorder_cli, - ocr as ocr_cli, -) -from click.testing import CliRunner -from ocrd_modelfactory import page_from_file -from ocrd_models.constants import NAMESPACES as NS - -testdir = Path(__file__).parent.resolve() - -MODELS_DIR = environ.get('EYNOLLAH_MODELS_DIR', str(testdir.joinpath('..').resolve())) - -def only_eynollah(logrec): - return logrec.name.startswith('eynollah') - -@pytest.mark.parametrize( - "options", - [ - [], # defaults - ["--no-patches"], - ], ids=str) -def test_run_eynollah_binarization_filename(tmp_path, pytestconfig, caplog, options): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') - outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') - args = [ - '-m', MODELS_DIR, - '-i', str(infile), - '-o', str(outfile), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(binarization_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting')) - assert outfile.exists() - with Image.open(infile) as original_img: - original_size = original_img.size - with Image.open(outfile) as binarized_img: - binarized_size = binarized_img.size - assert original_size == binarized_size - -def test_run_eynollah_binarization_directory(tmp_path, pytestconfig, caplog): - indir = testdir.joinpath('resources') - outdir = tmp_path - args = [ - '-m', MODELS_DIR, - '-di', str(indir), - '-o', str(outdir), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(binarization_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2 - assert len(list(outdir.iterdir())) == 2 - -@pytest.mark.parametrize( - "options", - [ - [], # defaults - ["-sos"], - ], ids=str) -def test_run_eynollah_enhancement_filename(tmp_path, pytestconfig, caplog, options): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') - outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') - args = [ - '-m', MODELS_DIR, - '-i', str(infile), - '-o', str(outfile.parent), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs - assert outfile.exists() - with Image.open(infile) as original_img: - original_size = original_img.size - with Image.open(outfile) as enhanced_img: - enhanced_size = enhanced_img.size - assert (original_size == enhanced_size) == ("-sos" in options) - -def test_run_eynollah_enhancement_directory(tmp_path, pytestconfig, caplog): - indir = testdir.joinpath('resources') - outdir = tmp_path - args = [ - '-m', MODELS_DIR, - '-di', str(indir), - '-o', str(outdir), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(enhancement_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2 - assert len(list(outdir.iterdir())) == 2 - -def test_run_eynollah_mbreorder_filename(tmp_path, pytestconfig, caplog): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml') - outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') - args = [ - '-m', MODELS_DIR, - '-i', str(infile), - '-o', str(outfile.parent), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(mbreorder_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - # FIXME: mbreorder has no logging! - #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs - assert outfile.exists() - #in_tree = page_from_file(str(infile)).etree - #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) - out_tree = page_from_file(str(outfile)).etree - out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) - #assert len(out_order) >= 2, "result is inaccurate" - #assert in_order != out_order - assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3'] - -def test_run_eynollah_mbreorder_directory(tmp_path, pytestconfig, caplog): - indir = testdir.joinpath('resources') - outdir = tmp_path - args = [ - '-m', MODELS_DIR, - '-di', str(indir), - '-o', str(outdir), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(mbreorder_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - # FIXME: mbreorder has no logging! - #assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2 - assert len(list(outdir.iterdir())) == 2 - -@pytest.mark.parametrize( - "options", - [ - [], # defaults - ["-doit", #str(outrenderfile.parent)], - ], - ["-trocr"], - ], ids=str) -def test_run_eynollah_ocr_filename(tmp_path, pytestconfig, caplog, options): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') - outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') - outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png') - outrenderfile.parent.mkdir() - args = [ - '-m', MODELS_DIR, - '-i', str(infile), - '-dx', str(infile.parent), - '-o', str(outfile.parent), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.DEBUG) - runner = CliRunner() - if "-doit" in options: - options.insert(options.index("-doit") + 1, str(outrenderfile.parent)) - with caplog.filtering(only_eynollah): - result = runner.invoke(ocr_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - # FIXME: ocr has no logging! - #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs - assert outfile.exists() - if "-doit" in options: - assert outrenderfile.exists() - #in_tree = page_from_file(str(infile)).etree - #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) - out_tree = page_from_file(str(outfile)).etree - out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS) - assert len(out_texts) >= 2, ("result is inaccurate", out_texts) - assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts) - -def test_run_eynollah_ocr_directory(tmp_path, pytestconfig, caplog): - indir = testdir.joinpath('resources') - outdir = tmp_path - args = [ - '-m', MODELS_DIR, - '-di', str(indir), - '-dx', str(indir), - '-o', str(outdir), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(ocr_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - # FIXME: ocr has no logging! - #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs - assert len(list(outdir.iterdir())) == 2 diff --git a/tests/test_run_layout.py b/tests/test_run_layout.py deleted file mode 100644 index 29cebc4..0000000 --- a/tests/test_run_layout.py +++ /dev/null @@ -1,330 +0,0 @@ -from os import environ -from pathlib import Path -import pytest -import logging -from PIL import Image -from eynollah.cli import ( - layout as layout_cli, - binarization as binarization_cli, - enhancement as enhancement_cli, - machine_based_reading_order as mbreorder_cli, - ocr as ocr_cli, -) -from click.testing import CliRunner -from ocrd_modelfactory import page_from_file -from ocrd_models.constants import NAMESPACES as NS - -testdir = Path(__file__).parent.resolve() - -MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_6_0').resolve())) - -def only_eynollah(logrec): - return logrec.name.startswith('eynollah') - -@pytest.mark.parametrize( - "options", - [ - [], # defaults - #["--allow_scaling", "--curved-line"], - ["--allow_scaling", "--curved-line", "--full-layout"], - ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"], - ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based", - "--textline_light", "--light_version"], - # -ep ... - # -eoi ... - # FIXME: find out whether OCR extra was installed, otherwise skip these - ["--do_ocr"], - ["--do_ocr", "--light_version", "--textline_light"], - ["--do_ocr", "--transformer_ocr"], - #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"], - ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"], - # --skip_layout_and_reading_order - ], ids=str) -def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') - outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml' - args = [ - '-m', MODELS_LAYOUT, - '-i', str(infile), - '-o', str(outfile.parent), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(layout_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert str(infile) in logmsgs - assert outfile.exists() - tree = page_from_file(str(outfile)).etree - regions = tree.xpath("//page:TextRegion", namespaces=NS) - assert len(regions) >= 2, "result is inaccurate" - regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) - assert len(regions) >= 2, "result is inaccurate" - lines = tree.xpath("//page:TextLine", namespaces=NS) - assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line - -@pytest.mark.parametrize( - "options", - [ - ["--tables"], - ["--tables", "--full-layout"], - ["--tables", "--full-layout", "--textline_light", "--light_version"], - ], ids=str) -def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options): - infile = testdir.joinpath('resources/euler_rechenkunst01_1738_0025.tif') - outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml' - args = [ - '-m', MODELS_LAYOUT, - '-i', str(infile), - '-o', str(outfile.parent), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(layout_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert str(infile) in logmsgs - assert outfile.exists() - tree = page_from_file(str(outfile)).etree - regions = tree.xpath("//page:TextRegion", namespaces=NS) - assert len(regions) >= 2, "result is inaccurate" - regions = tree.xpath("//page:TableRegion", namespaces=NS) - # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP - assert len(regions) >= 1, "result is inaccurate" - regions = tree.xpath("//page:SeparatorRegion", namespaces=NS) - assert len(regions) >= 2, "result is inaccurate" - lines = tree.xpath("//page:TextLine", namespaces=NS) - assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line - -def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog): - indir = testdir.joinpath('resources') - outdir = tmp_path - args = [ - '-m', MODELS_LAYOUT, - '-di', str(indir), - '-o', str(outdir), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(layout_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2 - assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in')) - assert len(list(outdir.iterdir())) == 2 - -@pytest.mark.parametrize( - "options", - [ - [], # defaults - ["--no-patches"], - ], ids=str) -def test_run_eynollah_binarization_filename(tmp_path, pytestconfig, caplog, options): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') - outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') - args = [ - '-m', MODELS_BIN, - '-i', str(infile), - '-o', str(outfile), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(binarization_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting')) - assert outfile.exists() - with Image.open(infile) as original_img: - original_size = original_img.size - with Image.open(outfile) as binarized_img: - binarized_size = binarized_img.size - assert original_size == binarized_size - -def test_run_eynollah_binarization_directory(tmp_path, pytestconfig, caplog): - indir = testdir.joinpath('resources') - outdir = tmp_path - args = [ - '-m', MODELS_BIN, - '-di', str(indir), - '-o', str(outdir), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(binarization_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2 - assert len(list(outdir.iterdir())) == 2 - -@pytest.mark.parametrize( - "options", - [ - [], # defaults - ["-sos"], - ], ids=str) -def test_run_eynollah_enhancement_filename(tmp_path, pytestconfig, caplog, options): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') - outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png') - args = [ - '-m', MODELS_LAYOUT, - '-i', str(infile), - '-o', str(outfile.parent), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs - assert outfile.exists() - with Image.open(infile) as original_img: - original_size = original_img.size - with Image.open(outfile) as enhanced_img: - enhanced_size = enhanced_img.size - assert (original_size == enhanced_size) == ("-sos" in options) - -def test_run_eynollah_enhancement_directory(tmp_path, pytestconfig, caplog): - indir = testdir.joinpath('resources') - outdir = tmp_path - args = [ - '-m', MODELS_LAYOUT, - '-di', str(indir), - '-o', str(outdir), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(enhancement_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2 - assert len(list(outdir.iterdir())) == 2 - -def test_run_eynollah_mbreorder_filename(tmp_path, pytestconfig, caplog): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml') - outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') - args = [ - '-m', MODELS_LAYOUT, - '-i', str(infile), - '-o', str(outfile.parent), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(mbreorder_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - # FIXME: mbreorder has no logging! - #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs - assert outfile.exists() - #in_tree = page_from_file(str(infile)).etree - #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) - out_tree = page_from_file(str(outfile)).etree - out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) - #assert len(out_order) >= 2, "result is inaccurate" - #assert in_order != out_order - assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3'] - -def test_run_eynollah_mbreorder_directory(tmp_path, pytestconfig, caplog): - indir = testdir.joinpath('resources') - outdir = tmp_path - args = [ - '-m', MODELS_LAYOUT, - '-di', str(indir), - '-o', str(outdir), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(mbreorder_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - # FIXME: mbreorder has no logging! - #assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2 - assert len(list(outdir.iterdir())) == 2 - -@pytest.mark.parametrize( - "options", - [ - [], # defaults - ["-doit", #str(outrenderfile.parent)], - ], - ["-trocr"], - ], ids=str) -def test_run_eynollah_ocr_filename(tmp_path, pytestconfig, caplog, options): - infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif') - outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml') - outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png') - outrenderfile.parent.mkdir() - args = [ - '-m', MODELS_OCR, - '-i', str(infile), - '-dx', str(infile.parent), - '-o', str(outfile.parent), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.DEBUG) - runner = CliRunner() - if "-doit" in options: - options.insert(options.index("-doit") + 1, str(outrenderfile.parent)) - with caplog.filtering(only_eynollah): - result = runner.invoke(ocr_cli, args + options, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - # FIXME: ocr has no logging! - #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs - assert outfile.exists() - if "-doit" in options: - assert outrenderfile.exists() - #in_tree = page_from_file(str(infile)).etree - #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS) - out_tree = page_from_file(str(outfile)).etree - out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS) - assert len(out_texts) >= 2, ("result is inaccurate", out_texts) - assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts) - -def test_run_eynollah_ocr_directory(tmp_path, pytestconfig, caplog): - indir = testdir.joinpath('resources') - outdir = tmp_path - args = [ - '-m', MODELS_OCR, - '-di', str(indir), - '-dx', str(indir), - '-o', str(outdir), - ] - if pytestconfig.getoption('verbose') > 0: - args.extend(['-l', 'DEBUG']) - caplog.set_level(logging.INFO) - runner = CliRunner() - with caplog.filtering(only_eynollah): - result = runner.invoke(ocr_cli, args, catch_exceptions=False) - assert result.exit_code == 0, result.stdout - logmsgs = [logrec.message for logrec in caplog.records] - # FIXME: ocr has no logging! - #assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs - assert len(list(outdir.iterdir())) == 2