mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-11-09 22:24:13 +01:00
refactor cli tests
This commit is contained in:
parent
ef999c8f0a
commit
b6f82c72b9
15 changed files with 453 additions and 592 deletions
|
|
@ -59,7 +59,7 @@ class Eynollah_ocr:
|
||||||
export_textline_images_and_text: bool=False,
|
export_textline_images_and_text: bool=False,
|
||||||
do_not_mask_with_textline_contour: bool=False,
|
do_not_mask_with_textline_contour: bool=False,
|
||||||
pref_of_dataset=None,
|
pref_of_dataset=None,
|
||||||
min_conf_value_of_textline_text : float=0.3,
|
min_conf_value_of_textline_text : Optional[float]=None,
|
||||||
logger: Optional[Logger]=None,
|
logger: Optional[Logger]=None,
|
||||||
):
|
):
|
||||||
self.tr_ocr = tr_ocr
|
self.tr_ocr = tr_ocr
|
||||||
|
|
@ -69,7 +69,7 @@ class Eynollah_ocr:
|
||||||
self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
|
self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
|
||||||
# prefix or dataset
|
# prefix or dataset
|
||||||
self.pref_of_dataset = pref_of_dataset
|
self.pref_of_dataset = pref_of_dataset
|
||||||
self.logger = logger if logger else getLogger('eynollah')
|
self.logger = logger if logger else getLogger('eynollah.ocr')
|
||||||
self.model_zoo = EynollahModelZoo(basedir=dir_models)
|
self.model_zoo = EynollahModelZoo(basedir=dir_models)
|
||||||
|
|
||||||
# TODO: Properly document what 'export_textline_images_and_text' is about
|
# TODO: Properly document what 'export_textline_images_and_text' is about
|
||||||
|
|
@ -77,21 +77,15 @@ class Eynollah_ocr:
|
||||||
self.logger.info("export_textline_images_and_text was set, so no actual models are loaded")
|
self.logger.info("export_textline_images_and_text was set, so no actual models are loaded")
|
||||||
return
|
return
|
||||||
|
|
||||||
self.min_conf_value_of_textline_text = min_conf_value_of_textline_text
|
self.min_conf_value_of_textline_text = min_conf_value_of_textline_text if min_conf_value_of_textline_text else 0.3
|
||||||
self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size
|
self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size
|
||||||
|
|
||||||
if tr_ocr:
|
if tr_ocr:
|
||||||
self.model_zoo.load_model('trocr_processor', '')
|
self.model_zoo.load_model('trocr_processor')
|
||||||
if model_name:
|
self.model_zoo.load_model('ocr', 'tr', model_path_override=model_name)
|
||||||
self.model_zoo.load_model('ocr', 'tr', model_name)
|
|
||||||
else:
|
|
||||||
self.model_zoo.load_model('ocr', 'tr')
|
|
||||||
self.model_zoo.get('ocr').to(self.device)
|
self.model_zoo.get('ocr').to(self.device)
|
||||||
else:
|
else:
|
||||||
if model_name:
|
self.model_zoo.load_model('ocr', '', model_path_override=model_name)
|
||||||
self.model_zoo.load_model('ocr', '', model_name)
|
|
||||||
else:
|
|
||||||
self.model_zoo.load_model('ocr', '')
|
|
||||||
self.model_zoo.load_model('num_to_char')
|
self.model_zoo.load_model('num_to_char')
|
||||||
self.end_character = len(self.model_zoo.load_model('characters')) + 2
|
self.end_character = len(self.model_zoo.load_model('characters')) + 2
|
||||||
|
|
||||||
|
|
@ -206,10 +200,10 @@ class Eynollah_ocr:
|
||||||
cropped_lines = []
|
cropped_lines = []
|
||||||
indexer_b_s = 0
|
indexer_b_s = 0
|
||||||
|
|
||||||
pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
|
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
||||||
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
||||||
pixel_values_merged.to(self.device))
|
pixel_values_merged.to(self.device))
|
||||||
generated_text_merged = self.model_zoo.get('processor').batch_decode(
|
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
||||||
generated_ids_merged, skip_special_tokens=True)
|
generated_ids_merged, skip_special_tokens=True)
|
||||||
|
|
||||||
extracted_texts = extracted_texts + generated_text_merged
|
extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
@ -229,10 +223,10 @@ class Eynollah_ocr:
|
||||||
cropped_lines = []
|
cropped_lines = []
|
||||||
indexer_b_s = 0
|
indexer_b_s = 0
|
||||||
|
|
||||||
pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
|
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
||||||
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
||||||
pixel_values_merged.to(self.device))
|
pixel_values_merged.to(self.device))
|
||||||
generated_text_merged = self.model_zoo.get('processor').batch_decode(
|
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
||||||
generated_ids_merged, skip_special_tokens=True)
|
generated_ids_merged, skip_special_tokens=True)
|
||||||
|
|
||||||
extracted_texts = extracted_texts + generated_text_merged
|
extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
@ -249,10 +243,10 @@ class Eynollah_ocr:
|
||||||
cropped_lines = []
|
cropped_lines = []
|
||||||
indexer_b_s = 0
|
indexer_b_s = 0
|
||||||
|
|
||||||
pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
|
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
||||||
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
||||||
pixel_values_merged.to(self.device))
|
pixel_values_merged.to(self.device))
|
||||||
generated_text_merged = self.model_zoo.get('processor').batch_decode(
|
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
||||||
generated_ids_merged, skip_special_tokens=True)
|
generated_ids_merged, skip_special_tokens=True)
|
||||||
|
|
||||||
extracted_texts = extracted_texts + generated_text_merged
|
extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
@ -267,10 +261,10 @@ class Eynollah_ocr:
|
||||||
cropped_lines = []
|
cropped_lines = []
|
||||||
indexer_b_s = 0
|
indexer_b_s = 0
|
||||||
|
|
||||||
pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
|
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
||||||
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
generated_ids_merged = self.model_zoo.get('ocr').generate(
|
||||||
pixel_values_merged.to(self.device))
|
pixel_values_merged.to(self.device))
|
||||||
generated_text_merged = self.model_zoo.get('processor').batch_decode(
|
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
||||||
generated_ids_merged, skip_special_tokens=True)
|
generated_ids_merged, skip_special_tokens=True)
|
||||||
|
|
||||||
extracted_texts = extracted_texts + generated_text_merged
|
extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
@ -284,9 +278,9 @@ class Eynollah_ocr:
|
||||||
cropped_lines = []
|
cropped_lines = []
|
||||||
indexer_b_s = 0
|
indexer_b_s = 0
|
||||||
|
|
||||||
pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
|
pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
||||||
generated_ids_merged = self.model_zoo.get('ocr').generate(pixel_values_merged.to(self.device))
|
generated_ids_merged = self.model_zoo.get('ocr').generate(pixel_values_merged.to(self.device))
|
||||||
generated_text_merged = self.model_zoo.get('processor').batch_decode(generated_ids_merged, skip_special_tokens=True)
|
generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(generated_ids_merged, skip_special_tokens=True)
|
||||||
|
|
||||||
extracted_texts = extracted_texts + generated_text_merged
|
extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
||||||
|
|
@ -301,10 +295,10 @@ class Eynollah_ocr:
|
||||||
####n_start = i*self.b_s
|
####n_start = i*self.b_s
|
||||||
####n_end = (i+1)*self.b_s
|
####n_end = (i+1)*self.b_s
|
||||||
####imgs = cropped_lines[n_start:n_end]
|
####imgs = cropped_lines[n_start:n_end]
|
||||||
####pixel_values_merged = self.model_zoo.get('processor')(imgs, return_tensors="pt").pixel_values
|
####pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values
|
||||||
####generated_ids_merged = self.model_ocr.generate(
|
####generated_ids_merged = self.model_ocr.generate(
|
||||||
#### pixel_values_merged.to(self.device))
|
#### pixel_values_merged.to(self.device))
|
||||||
####generated_text_merged = self.model_zoo.get('processor').batch_decode(
|
####generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(
|
||||||
#### generated_ids_merged, skip_special_tokens=True)
|
#### generated_ids_merged, skip_special_tokens=True)
|
||||||
|
|
||||||
####extracted_texts = extracted_texts + generated_text_merged
|
####extracted_texts = extracted_texts + generated_text_merged
|
||||||
|
|
|
||||||
|
|
@ -50,7 +50,7 @@ class Enhancer:
|
||||||
else:
|
else:
|
||||||
self.num_col_lower = num_col_lower
|
self.num_col_lower = num_col_lower
|
||||||
|
|
||||||
self.logger = logger if logger else getLogger('enhancement')
|
self.logger = logger if logger else getLogger('eynollah.enhance')
|
||||||
self.model_zoo = EynollahModelZoo(basedir=dir_models)
|
self.model_zoo = EynollahModelZoo(basedir=dir_models)
|
||||||
for v in ['binarization', 'enhancement', 'col_classifier', 'page']:
|
for v in ['binarization', 'enhancement', 'col_classifier', 'page']:
|
||||||
self.model_zoo.load_model(v)
|
self.model_zoo.load_model(v)
|
||||||
|
|
@ -142,7 +142,7 @@ class Enhancer:
|
||||||
index_y_d = img_h - img_height_model
|
index_y_d = img_h - img_height_model
|
||||||
|
|
||||||
img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :]
|
img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :]
|
||||||
label_p_pred = self.model_zoo.get('enhancement', Model).predict(img_patch, verbose=0)
|
label_p_pred = self.model_zoo.get('enhancement', Model).predict(img_patch, verbose='0')
|
||||||
seg = label_p_pred[0, :, :, :] * 255
|
seg = label_p_pred[0, :, :, :] * 255
|
||||||
|
|
||||||
if i == 0 and j == 0:
|
if i == 0 and j == 0:
|
||||||
|
|
@ -667,7 +667,7 @@ class Enhancer:
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(light_version=False)
|
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(light_version=False)
|
||||||
|
|
||||||
return img_res
|
return img_res, is_image_enhanced
|
||||||
|
|
||||||
|
|
||||||
def run(self,
|
def run(self,
|
||||||
|
|
@ -705,9 +705,18 @@ class Enhancer:
|
||||||
self.logger.warning("will skip input for existing output file '%s'", self.output_filename)
|
self.logger.warning("will skip input for existing output file '%s'", self.output_filename)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
image_enhanced = self.run_single()
|
did_resize = False
|
||||||
|
image_enhanced, did_enhance = self.run_single()
|
||||||
if self.save_org_scale:
|
if self.save_org_scale:
|
||||||
image_enhanced = resize_image(image_enhanced, self.h_org, self.w_org)
|
image_enhanced = resize_image(image_enhanced, self.h_org, self.w_org)
|
||||||
|
did_resize = True
|
||||||
|
|
||||||
|
self.logger.info(
|
||||||
|
"Image %s was %senhanced%s.",
|
||||||
|
img_filename,
|
||||||
|
'' if did_enhance else 'not ',
|
||||||
|
'and resized' if did_resize else ''
|
||||||
|
)
|
||||||
|
|
||||||
cv2.imwrite(self.output_filename, image_enhanced)
|
cv2.imwrite(self.output_filename, image_enhanced)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -84,10 +84,13 @@ class EynollahModelZoo:
|
||||||
self,
|
self,
|
||||||
model_category: str,
|
model_category: str,
|
||||||
model_variant: str = '',
|
model_variant: str = '',
|
||||||
|
model_path_override: Optional[str] = None,
|
||||||
) -> AnyModel:
|
) -> AnyModel:
|
||||||
"""
|
"""
|
||||||
Load any model
|
Load any model
|
||||||
"""
|
"""
|
||||||
|
if model_path_override:
|
||||||
|
self.override_models((model_category, model_variant, model_path_override))
|
||||||
model_path = self.model_path(model_category, model_variant)
|
model_path = self.model_path(model_category, model_variant)
|
||||||
if model_path.suffix == '.h5' and Path(model_path.stem).exists():
|
if model_path.suffix == '.h5' and Path(model_path.stem).exists():
|
||||||
# prefer SavedModel over HDF5 format if it exists
|
# prefer SavedModel over HDF5 format if it exists
|
||||||
|
|
@ -183,5 +186,5 @@ class EynollahModelZoo:
|
||||||
Ensure that a loaded model is not referenced by ``self._loaded`` anymore
|
Ensure that a loaded model is not referenced by ``self._loaded`` anymore
|
||||||
"""
|
"""
|
||||||
if hasattr(self, '_loaded') and getattr(self, '_loaded'):
|
if hasattr(self, '_loaded') and getattr(self, '_loaded'):
|
||||||
for needle in self._loaded.keys():
|
for needle in list(self._loaded.keys()):
|
||||||
del self._loaded[needle]
|
del self._loaded[needle]
|
||||||
|
|
|
||||||
|
|
@ -322,8 +322,7 @@ class SbbBinarizer:
|
||||||
image = cv2.imread(image_path)
|
image = cv2.imread(image_path)
|
||||||
img_last = 0
|
img_last = 0
|
||||||
for n, (model_file, model) in enumerate(self.models.items()):
|
for n, (model_file, model) in enumerate(self.models.items()):
|
||||||
self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.models.keys())))
|
self.log.info('Predicting %s with model %s [%s/%s]', image_path if image_path else '[image]', model_file, n + 1, len(self.models.keys()))
|
||||||
|
|
||||||
res = self.predict(model, image, use_patches)
|
res = self.predict(model, image, use_patches)
|
||||||
|
|
||||||
img_fin = np.zeros((res.shape[0], res.shape[1], 3))
|
img_fin = np.zeros((res.shape[0], res.shape[1], 3))
|
||||||
|
|
@ -348,11 +347,11 @@ class SbbBinarizer:
|
||||||
ls_imgs = list(filter(is_image_filename, os.listdir(dir_in)))
|
ls_imgs = list(filter(is_image_filename, os.listdir(dir_in)))
|
||||||
for image_name in ls_imgs:
|
for image_name in ls_imgs:
|
||||||
image_stem = image_name.split('.')[0]
|
image_stem = image_name.split('.')[0]
|
||||||
print(image_name,'image_name')
|
# print(image_name,'image_name')
|
||||||
image = cv2.imread(os.path.join(dir_in,image_name) )
|
image = cv2.imread(os.path.join(dir_in,image_name) )
|
||||||
img_last = 0
|
img_last = 0
|
||||||
for n, (model_file, model) in enumerate(self.models.items()):
|
for n, (model_file, model) in enumerate(self.models.items()):
|
||||||
self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.models.keys())))
|
self.log.info('Predicting %s with model %s [%s/%s]', image_name, model_file, n + 1, len(self.models.keys()))
|
||||||
|
|
||||||
res = self.predict(model, image, use_patches)
|
res = self.predict(model, image, use_patches)
|
||||||
|
|
||||||
|
|
|
||||||
36
tests/cli_tests/conftest.py
Normal file
36
tests/cli_tests/conftest.py
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
from typing import List
|
||||||
|
from click import Command
|
||||||
|
import pytest
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from click.testing import CliRunner, Result
|
||||||
|
|
||||||
|
@pytest.fixture
def run_eynollah_ok_and_check_logs(
    pytestconfig,
    caplog,
    model_dir,
    eynollah_log_filter,
):
    """
    Return a helper that runs a Click command and verifies its log output.

    The returned callable takes ``(cli, args, expected_logs)``. It prepends
    ``-m <model_dir>`` to `args`, appends ``-l DEBUG`` when pytest itself runs
    verbosely, invokes `cli` through a ``CliRunner`` and asserts that the exit
    code is 0. If `expected_logs` is non-empty, every fragment in it must be
    the prefix of at least one captured log message. The Click ``Result`` is
    returned to the caller.
    """

    def _run_click_ok_logs(cli: Command, args: List[str], expected_logs: List[str]) -> Result:
        args = ['-m', model_dir] + args
        if pytestconfig.getoption('verbose') > 0:
            args.extend(['-l', 'DEBUG'])
        caplog.set_level(logging.INFO)
        runner = CliRunner()
        # only records accepted by the filter end up in caplog.records
        with caplog.filtering(eynollah_log_filter):
            result = runner.invoke(cli, args, catch_exceptions=False)
        assert result.exit_code == 0, result.stdout
        if expected_logs:
            logmsgs = [logrec.message for logrec in caplog.records]
            # str() so that callers may pass Path objects as fragments;
            # str.startswith() raises TypeError for anything but str/tuple
            missing = [
                needle for needle in map(str, expected_logs)
                if not any(logmsg.startswith(needle) for logmsg in logmsgs)
            ]
            # every fragment has to be found, not just any one of them
            assert not missing, f'{missing} not in {logmsgs}'
        return result

    return _run_click_ok_logs
|
||||||
|
|
||||||
58
tests/cli_tests/test_binarization.py
Normal file
58
tests/cli_tests/test_binarization.py
Normal file
|
|
@ -0,0 +1,58 @@
|
||||||
|
import pytest
|
||||||
|
from PIL import Image
|
||||||
|
from eynollah.cli import (
|
||||||
|
binarization as binarization_cli,
|
||||||
|
)
|
||||||
|
from ocrd_modelfactory import page_from_file
|
||||||
|
from ocrd_models.constants import NAMESPACES as NS
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        ["--no-patches"],
    ], ids=str)
def test_run_eynollah_binarization_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    tests_dir,
    options,
):
    """
    Binarize a single image file and compare the output size with the input.

    Runs the binarization CLI once per option set, expects a log message
    starting with ``Predicting`` and requires the written PNG to have the
    same pixel dimensions as the source TIFF.
    """
    infile = tests_dir / 'resources' / 'kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.png'
    cli_args = ['-i', str(infile), '-o', str(outfile)]
    run_eynollah_ok_and_check_logs(
        binarization_cli,
        cli_args + options,
        ['Predicting'],
    )
    assert outfile.exists()
    with Image.open(infile) as original_img:
        size_before = original_img.size
    with Image.open(outfile) as binarized_img:
        size_after = binarized_img.size
    assert size_before == size_after
|
||||||
|
|
||||||
|
def test_run_eynollah_binarization_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    image_resources,
):
    """
    Binarize a whole input directory into `tmp_path`.

    Expects ``Predicting <image name>`` log messages for the first two
    entries of `image_resources` and exactly two entries in the output
    directory afterwards.
    """
    cli_args = ['-di', str(resources_dir), '-o', str(tmp_path)]
    expected_logs = [
        f'Predicting {image_resources[0].name}',
        f'Predicting {image_resources[1].name}',
    ]
    run_eynollah_ok_and_check_logs(binarization_cli, cli_args, expected_logs)
    produced = list(tmp_path.iterdir())
    assert len(produced) == 2
|
||||||
57
tests/cli_tests/test_enhance.py
Normal file
57
tests/cli_tests/test_enhance.py
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
import pytest
|
||||||
|
from PIL import Image
|
||||||
|
from eynollah.cli import (
|
||||||
|
enhancement as enhancement_cli,
|
||||||
|
)
|
||||||
|
from ocrd_modelfactory import page_from_file
|
||||||
|
from ocrd_models.constants import NAMESPACES as NS
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        ["-sos"],
    ], ids=str)
def test_run_eynollah_enhancement_filename(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
    options,
):
    """
    Enhance a single image file and check the size of the result.

    The output image must have the size of the input image exactly when
    ``-sos`` is among the options, and a different size otherwise.
    """
    image_name = 'kant_aufklaerung_1784_0020'
    infile = resources_dir / f'{image_name}.tif'
    outfile = tmp_path / f'{image_name}.png'
    run_eynollah_ok_and_check_logs(
        enhancement_cli,
        ['-i', str(infile), '-o', str(tmp_path)] + options,
        ['Image was enhanced'],
    )
    with Image.open(infile) as original_img:
        original_size = original_img.size
    with Image.open(outfile) as enhanced_img:
        enhanced_size = enhanced_img.size
    size_is_kept = original_size == enhanced_size
    size_should_be_kept = "-sos" in options
    assert size_is_kept == size_should_be_kept
|
||||||
|
|
||||||
|
def test_run_eynollah_enhancement_directory(
    tmp_path,
    resources_dir,
    image_resources,
    run_eynollah_ok_and_check_logs,
):
    """
    Enhance a whole input directory into `tmp_path`.

    Expects an ``Image <path> was enhanced`` log message for the first two
    entries of `image_resources` and exactly two entries in the output
    directory afterwards.
    """
    cli_args = ['-di', str(resources_dir), '-o', str(tmp_path)]
    expected_logs = [
        f'Image {image_resources[0]} was enhanced',
        f'Image {image_resources[1]} was enhanced',
    ]
    run_eynollah_ok_and_check_logs(enhancement_cli, cli_args, expected_logs)
    produced = list(tmp_path.iterdir())
    assert len(produced) == 2
|
||||||
109
tests/cli_tests/test_layout.py
Normal file
109
tests/cli_tests/test_layout.py
Normal file
|
|
@ -0,0 +1,109 @@
|
||||||
|
import pytest
|
||||||
|
from eynollah.cli import (
|
||||||
|
layout as layout_cli,
|
||||||
|
)
|
||||||
|
from ocrd_modelfactory import page_from_file
|
||||||
|
from ocrd_models.constants import NAMESPACES as NS
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        #["--allow_scaling", "--curved-line"],
        ["--allow_scaling", "--curved-line", "--full-layout"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
         "--textline_light", "--light_version"],
        # -ep ...
        # -eoi ...
        # FIXME: find out whether OCR extra was installed, otherwise skip these
        ["--do_ocr"],
        ["--do_ocr", "--light_version", "--textline_light"],
        ["--do_ocr", "--transformer_ocr"],
        #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
        ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
        # --skip_layout_and_reading_order
    ], ids=str)
def test_run_eynollah_layout_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """
    Run layout analysis on a single image and inspect the PAGE-XML output.

    A log message starting with the input path is expected. The result must
    contain at least two text regions, at least two separator regions and
    exactly 31 text lines.
    """
    outdir = tmp_path
    infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
    outfile = outdir / 'kant_aufklaerung_1784_0020.xml'
    run_eynollah_ok_and_check_logs(
        layout_cli,
        ['-i', str(infile), '-o', str(outdir)] + options,
        [str(infile)],
    )
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree

    def count(element):
        # number of PAGE elements of the given type anywhere in the document
        return len(tree.xpath(f"//page:{element}", namespaces=NS))

    assert count("TextRegion") >= 2, "result is inaccurate"
    assert count("SeparatorRegion") >= 2, "result is inaccurate"
    # 29 paragraph lines, 1 page and 1 catch-word line
    assert count("TextLine") == 31, "result is inaccurate"
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "options",
    [
        ["--tables"],
        ["--tables", "--full-layout"],
        ["--tables", "--full-layout", "--textline_light", "--light_version"],
    ], ids=str)
def test_run_eynollah_layout_filename2(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
    options,
):
    """
    Run layout analysis with table detection on a single image.

    A log message starting with the input path is expected. The PAGE-XML
    result must contain at least two text regions, one table region, two
    separator regions and two text lines.
    """
    infile = resources_dir / 'euler_rechenkunst01_1738_0025.tif'
    outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
    run_eynollah_ok_and_check_logs(
        layout_cli,
        [
            '-i', str(infile),
            '-o', str(outfile.parent),
        ] + options,
        [
            # must be a str: the log check uses str.startswith(), which
            # raises TypeError when handed a Path object
            str(infile)
        ]
    )
    assert outfile.exists()
    tree = page_from_file(str(outfile)).etree
    regions = tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    regions = tree.xpath("//page:TableRegion", namespaces=NS)
    # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
    assert len(regions) >= 1, "result is inaccurate"
    regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(regions) >= 2, "result is inaccurate"
    lines = tree.xpath("//page:TextLine", namespaces=NS)
    assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line
|
||||||
|
|
||||||
|
def test_run_eynollah_layout_directory(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """
    Run layout analysis on a whole input directory.

    Expects log messages starting with ``Job done in`` and
    ``All jobs done in`` and exactly two entries in the output directory.
    """
    cli_args = ['-di', str(resources_dir), '-o', str(tmp_path)]
    expected_logs = ['Job done in', 'All jobs done in']
    run_eynollah_ok_and_check_logs(layout_cli, cli_args, expected_logs)
    produced = list(tmp_path.iterdir())
    assert len(produced) == 2
|
||||||
53
tests/cli_tests/test_mbreorder.py
Normal file
53
tests/cli_tests/test_mbreorder.py
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
from ocrd_modelfactory import page_from_file
|
||||||
|
from ocrd_models.constants import NAMESPACES as NS
|
||||||
|
|
||||||
|
from eynollah.cli import (
|
||||||
|
machine_based_reading_order as mbreorder_cli,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_eynollah_mbreorder_filename(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """
    Run machine-based reading order on a single PAGE-XML file.

    The output file must exist and its ordered group must reference the
    regions in the order ``r_1_1, r_2_1, r_2_2, r_2_3``.
    """
    page_name = 'kant_aufklaerung_1784_0020.xml'
    infile = resources_dir / page_name
    outfile = tmp_path / page_name
    cli_args = ['-i', str(infile), '-o', str(tmp_path)]
    # FIXME: mbreorder has no logging!
    expected_logs = []
    run_eynollah_ok_and_check_logs(mbreorder_cli, cli_args, expected_logs)
    assert outfile.exists()
    #in_tree = page_from_file(str(infile)).etree
    #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    out_tree = page_from_file(str(outfile)).etree
    out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    #assert len(out_order) >= 2, "result is inaccurate"
    #assert in_order != out_order
    assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
|
||||||
|
|
||||||
|
def test_run_eynollah_mbreorder_directory(
    tmp_path,
    resources_dir,
    run_eynollah_ok_and_check_logs,
):
    """
    Run machine-based reading order on a whole input directory.

    Only the number of entries written to the output directory is checked
    (exactly two); no log fragments are expected.
    """
    outdir = tmp_path
    run_eynollah_ok_and_check_logs(
        mbreorder_cli,
        [
            '-di', str(resources_dir),
            '-o', str(outdir),
        ],
        [
            # FIXME: mbreorder has no logging!
        ]
    )
    assert len(list(outdir.iterdir())) == 2
|
||||||
|
|
||||||
67
tests/cli_tests/test_ocr.py
Normal file
67
tests/cli_tests/test_ocr.py
Normal file
|
|
@ -0,0 +1,67 @@
|
||||||
|
import pytest
|
||||||
|
from eynollah.cli import (
|
||||||
|
ocr as ocr_cli,
|
||||||
|
)
|
||||||
|
from ocrd_modelfactory import page_from_file
|
||||||
|
from ocrd_models.constants import NAMESPACES as NS
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "options",
    [
        [], # defaults
        ["-doit"], # the render directory argument is inserted by the test itself
        ["-trocr"],
    ], ids=str)
def test_run_eynollah_ocr_filename(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
    options,
):
    """
    Run OCR on a single image, using the PAGE-XML files next to it.

    The output PAGE-XML must exist and contain at least two text lines with
    more than 100 characters of text in total. With ``-doit`` a rendered
    image must additionally be written to the render directory.
    """
    infile = resources_dir / 'kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    outrenderfile = tmp_path / 'render' / 'kant_aufklaerung_1784_0020.png'
    outrenderfile.parent.mkdir()
    # work on a copy: the list object belongs to the parametrize decorator and
    # inserting into it in place would leak the path into any later reuse
    options = list(options)
    if "-doit" in options:
        options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
    run_eynollah_ok_and_check_logs(
        ocr_cli,
        [
            '-i', str(infile),
            '-dx', str(infile.parent),
            '-o', str(outfile.parent),
        ] + options,
        [
            # FIXME: ocr has no logging!
        ]
    )
    assert outfile.exists()
    if "-doit" in options:
        assert outrenderfile.exists()
    #in_tree = page_from_file(str(infile)).etree
    #in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    out_tree = page_from_file(str(outfile)).etree
    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
|
||||||
|
|
||||||
|
def test_run_eynollah_ocr_directory(
    tmp_path,
    run_eynollah_ok_and_check_logs,
    resources_dir,
):
    """
    Run OCR on a whole input directory.

    The resources directory serves as both image and PAGE-XML input; exactly
    two entries are expected in the output directory afterwards.
    """
    input_dir = str(resources_dir)
    cli_args = ['-di', input_dir, '-dx', input_dir, '-o', str(tmp_path)]
    # FIXME: ocr has no logging!
    expected_logs = []
    run_eynollah_ok_and_check_logs(ocr_cli, cli_args, expected_logs)
    produced = list(tmp_path.iterdir())
    assert len(produced) == 2
|
||||||
|
|
||||||
10
tests/cli_tests/test_run.py
Normal file
10
tests/cli_tests/test_run.py
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
import pytest
|
||||||
|
from PIL import Image
|
||||||
|
from eynollah.cli import (
|
||||||
|
layout as layout_cli,
|
||||||
|
binarization as binarization_cli,
|
||||||
|
enhancement as enhancement_cli,
|
||||||
|
)
|
||||||
|
from ocrd_modelfactory import page_from_file
|
||||||
|
from ocrd_models.constants import NAMESPACES as NS
|
||||||
|
|
||||||
25
tests/conftest.py
Normal file
25
tests/conftest.py
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
from glob import glob
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def tests_dir():
    """Resolved path of the directory that contains this conftest file."""
    here = Path(__file__).parent
    return here.resolve()
|
||||||
|
|
||||||
|
@pytest.fixture()
def model_dir(tests_dir):
    """
    Directory to load models from.

    Taken from the ``EYNOLLAH_MODELS_DIR`` environment variable; falls back
    to the parent directory of `tests_dir`.
    """
    fallback = str((tests_dir / '..').resolve())
    return os.environ.get('EYNOLLAH_MODELS_DIR', fallback)
|
||||||
|
|
||||||
|
@pytest.fixture()
def resources_dir(tests_dir):
    """Path of the ``resources`` directory below `tests_dir`."""
    return tests_dir.joinpath('resources')
|
||||||
|
|
||||||
|
@pytest.fixture()
def image_resources(resources_dir):
    """
    Paths of all ``*.tif`` files directly inside `resources_dir`.

    The list is sorted because ``glob`` returns matches in arbitrary order
    and tests address entries by index.
    """
    return sorted(Path(x) for x in glob(str(resources_dir / '*.tif')))
|
||||||
|
|
||||||
|
@pytest.fixture()
def eynollah_log_filter():
    """Log filter that accepts only records whose logger name starts with ``eynollah``."""

    def _only_eynollah(logrec):
        return logrec.name.startswith('eynollah')

    return _only_eynollah
|
||||||
|
|
@ -1,229 +0,0 @@
|
||||||
from os import environ
|
|
||||||
from pathlib import Path
|
|
||||||
import pytest
|
|
||||||
import logging
|
|
||||||
from PIL import Image
|
|
||||||
from eynollah.cli import (
|
|
||||||
layout as layout_cli,
|
|
||||||
binarization as binarization_cli,
|
|
||||||
enhancement as enhancement_cli,
|
|
||||||
machine_based_reading_order as mbreorder_cli,
|
|
||||||
ocr as ocr_cli,
|
|
||||||
)
|
|
||||||
from click.testing import CliRunner
|
|
||||||
from ocrd_modelfactory import page_from_file
|
|
||||||
from ocrd_models.constants import NAMESPACES as NS
|
|
||||||
|
|
||||||
testdir = Path(__file__).parent.resolve()
|
|
||||||
|
|
||||||
MODELS_DIR = environ.get('EYNOLLAH_MODELS_DIR', str(testdir.joinpath('..').resolve()))
|
|
||||||
|
|
||||||
def only_eynollah(logrec):
    """Tell whether ``logrec`` was emitted by a logger whose name starts with 'eynollah'."""
    logger_name = logrec.name
    return logger_name.startswith('eynollah')
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["--no-patches"],
    ], ids=str)
def test_run_eynollah_binarization_filename(tmp_path, pytestconfig, caplog, options):
    """Binarize a single image via the CLI and check the output keeps the input's pixel size."""
    infile = testdir / 'resources/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.png'
    cli_args = ['-m', MODELS_DIR, '-i', str(infile), '-o', str(outfile)]
    # Forward pytest's verbosity to the CLI as debug logging.
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(binarization_cli, [*cli_args, *options], catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    assert any(message.startswith('Predicting') for message in messages)
    assert outfile.exists()
    with Image.open(infile) as original_img:
        original_size = original_img.size
    with Image.open(outfile) as binarized_img:
        binarized_size = binarized_img.size
    assert original_size == binarized_size
|
|
||||||
|
|
||||||
def test_run_eynollah_binarization_directory(tmp_path, pytestconfig, caplog):
    """Binarize the whole resources directory and expect two predictions and two output files."""
    indir = testdir / 'resources'
    outdir = tmp_path
    cli_args = ['-m', MODELS_DIR, '-di', str(indir), '-o', str(outdir)]
    # Forward pytest's verbosity to the CLI as debug logging.
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(binarization_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    assert sum(1 for message in messages if message.startswith('Predicting')) == 2
    assert len(list(outdir.iterdir())) == 2
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["-sos"],
    ], ids=str)
def test_run_eynollah_enhancement_filename(tmp_path, pytestconfig, caplog, options):
    """Enhance a single image via the CLI; the size must be kept exactly when '-sos' is given."""
    infile = testdir / 'resources/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.png'
    # The CLI receives the output directory, not the output file.
    cli_args = ['-m', MODELS_DIR, '-i', str(infile), '-o', str(outfile.parent)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(enhancement_cli, [*cli_args, *options], catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    assert any(message.startswith('Image was enhanced') for message in messages), messages
    assert outfile.exists()
    with Image.open(infile) as original_img:
        original_size = original_img.size
    with Image.open(outfile) as enhanced_img:
        enhanced_size = enhanced_img.size
    size_kept = original_size == enhanced_size
    assert size_kept == ("-sos" in options)
|
|
||||||
|
|
||||||
def test_run_eynollah_enhancement_directory(tmp_path, pytestconfig, caplog):
    """Enhance the whole resources directory and expect two enhancement messages and two outputs."""
    indir = testdir / 'resources'
    outdir = tmp_path
    cli_args = ['-m', MODELS_DIR, '-di', str(indir), '-o', str(outdir)]
    # Forward pytest's verbosity to the CLI as debug logging.
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(enhancement_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    assert sum(1 for message in messages if message.startswith('Image was enhanced')) == 2
    assert len(list(outdir.iterdir())) == 2
|
|
||||||
|
|
||||||
def test_run_eynollah_mbreorder_filename(tmp_path, pytestconfig, caplog):
    """Run machine-based reading order on one PAGE-XML file and check the resulting region order."""
    infile = testdir / 'resources/kant_aufklaerung_1784_0020.xml'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    # The CLI receives the output directory, not the output file.
    cli_args = ['-m', MODELS_DIR, '-i', str(infile), '-o', str(outfile.parent)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(mbreorder_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # FIXME: mbreorder has no logging, so there is no message to assert on yet.
    messages = [record.message for record in caplog.records]
    assert outfile.exists()
    page_tree = page_from_file(str(outfile)).etree
    region_refs = page_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    assert region_refs == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
|
|
||||||
|
|
||||||
def test_run_eynollah_mbreorder_directory(tmp_path, pytestconfig, caplog):
    """Run machine-based reading order on the resources directory and expect two output files."""
    indir = testdir / 'resources'
    outdir = tmp_path
    cli_args = ['-m', MODELS_DIR, '-di', str(indir), '-o', str(outdir)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(mbreorder_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # FIXME: mbreorder has no logging, so there is no message to assert on yet.
    messages = [record.message for record in caplog.records]
    assert len(list(outdir.iterdir())) == 2
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["-doit"],  # the render directory is inserted inside the test
        ["-trocr"],
    ], ids=str)
def test_run_eynollah_ocr_filename(tmp_path, pytestconfig, caplog, options):
    """Run OCR on a single image via the CLI and check that a plausible amount of text is written.

    With '-doit' the test additionally passes a render directory and expects
    a rendered PNG to be produced there.
    """
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png')
    outrenderfile.parent.mkdir()
    args = [
        '-m', MODELS_DIR,
        '-i', str(infile),
        '-dx', str(infile.parent),
        '-o', str(outfile.parent),
    ]
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.DEBUG)
    runner = CliRunner()
    # pytest hands parametrized values to the test without copying them, so
    # inserting into `options` in place would leak into any later run of the
    # same parameter set. Work on a private copy instead.
    options = list(options)
    if "-doit" in options:
        options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
    with caplog.filtering(only_eynollah):
        result = runner.invoke(ocr_cli, args + options, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # FIXME: ocr has no logging, so there is no message to assert on yet.
    assert outfile.exists()
    if "-doit" in options:
        assert outrenderfile.exists()
    out_tree = page_from_file(str(outfile)).etree
    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
|
|
||||||
|
|
||||||
def test_run_eynollah_ocr_directory(tmp_path, pytestconfig, caplog):
    """Run OCR on the resources directory (images and XML) and expect two output files."""
    indir = testdir / 'resources'
    outdir = tmp_path
    # The same directory serves as image input (-di) and XML input (-dx).
    cli_args = ['-m', MODELS_DIR, '-di', str(indir), '-dx', str(indir), '-o', str(outdir)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(ocr_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # FIXME: ocr has no logging, so there is no message to assert on yet.
    messages = [record.message for record in caplog.records]
    assert len(list(outdir.iterdir())) == 2
|
|
||||||
|
|
@ -1,330 +0,0 @@
|
||||||
from os import environ
|
|
||||||
from pathlib import Path
|
|
||||||
import pytest
|
|
||||||
import logging
|
|
||||||
from PIL import Image
|
|
||||||
from eynollah.cli import (
|
|
||||||
layout as layout_cli,
|
|
||||||
binarization as binarization_cli,
|
|
||||||
enhancement as enhancement_cli,
|
|
||||||
machine_based_reading_order as mbreorder_cli,
|
|
||||||
ocr as ocr_cli,
|
|
||||||
)
|
|
||||||
from click.testing import CliRunner
|
|
||||||
from ocrd_modelfactory import page_from_file
|
|
||||||
from ocrd_models.constants import NAMESPACES as NS
|
|
||||||
|
|
||||||
# Resolved directory containing this test module.
testdir = Path(__file__).parent.resolve()

# Layout model directory: MODELS_LAYOUT from the environment if set,
# otherwise the 'models_layout_v0_6_0' directory next to the tests directory.
MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_6_0').resolve()))

# The binarization and OCR tests in this module pass MODELS_BIN / MODELS_OCR
# to the CLI, but the module never defined them, so those tests died with a
# NameError. Define them with an environment override.
# NOTE(review): falling back to the layout model directory is an assumption —
# confirm the intended default directories.
MODELS_BIN = environ.get('MODELS_BIN', MODELS_LAYOUT)
MODELS_OCR = environ.get('MODELS_OCR', MODELS_LAYOUT)
|
|
||||||
|
|
||||||
def only_eynollah(logrec):
    """Tell whether ``logrec`` was emitted by a logger whose name starts with 'eynollah'."""
    logger_name = logrec.name
    return logger_name.startswith('eynollah')
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        #["--allow_scaling", "--curved-line"],
        ["--allow_scaling", "--curved-line", "--full-layout"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
        ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
         "--textline_light", "--light_version"],
        # -ep ...
        # -eoi ...
        # FIXME: find out whether OCR extra was installed, otherwise skip these
        ["--do_ocr"],
        ["--do_ocr", "--light_version", "--textline_light"],
        ["--do_ocr", "--transformer_ocr"],
        #["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
        ["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
        # --skip_layout_and_reading_order
    ], ids=str)
def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options):
    """Run layout analysis on one image and check region and text line counts in the PAGE-XML result."""
    infile = testdir / 'resources/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    # The CLI receives the output directory, not the output file.
    cli_args = ['-m', MODELS_LAYOUT, '-i', str(infile), '-o', str(outfile.parent)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(layout_cli, [*cli_args, *options], catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    # The input path itself must appear as one complete log message.
    assert str(infile) in messages
    assert outfile.exists()
    page_tree = page_from_file(str(outfile)).etree
    text_regions = page_tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(text_regions) >= 2, "result is inaccurate"
    separator_regions = page_tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(separator_regions) >= 2, "result is inaccurate"
    text_lines = page_tree.xpath("//page:TextLine", namespaces=NS)
    # 29 paragraph lines, 1 page and 1 catch-word line
    assert len(text_lines) == 31, "result is inaccurate"
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "options",
    [
        ["--tables"],
        ["--tables", "--full-layout"],
        ["--tables", "--full-layout", "--textline_light", "--light_version"],
    ], ids=str)
def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options):
    """Run layout analysis with table detection on one image and check the detected region types."""
    infile = testdir / 'resources/euler_rechenkunst01_1738_0025.tif'
    outfile = tmp_path.joinpath('euler_rechenkunst01_1738_0025.xml')
    # The CLI receives the output directory, not the output file.
    cli_args = ['-m', MODELS_LAYOUT, '-i', str(infile), '-o', str(outfile.parent)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(layout_cli, [*cli_args, *options], catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    # The input path itself must appear as one complete log message.
    assert str(infile) in messages
    assert outfile.exists()
    page_tree = page_from_file(str(outfile)).etree
    text_regions = page_tree.xpath("//page:TextRegion", namespaces=NS)
    assert len(text_regions) >= 2, "result is inaccurate"
    table_regions = page_tree.xpath("//page:TableRegion", namespaces=NS)
    # model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
    assert len(table_regions) >= 1, "result is inaccurate"
    separator_regions = page_tree.xpath("//page:SeparatorRegion", namespaces=NS)
    assert len(separator_regions) >= 2, "result is inaccurate"
    text_lines = page_tree.xpath("//page:TextLine", namespaces=NS)
    # mostly table (if detected correctly), but 1 page and 1 catch-word line
    assert len(text_lines) >= 2, "result is inaccurate"
|
|
||||||
|
|
||||||
def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog):
    """Run layout analysis on the resources directory; expect two finished jobs and two outputs."""
    indir = testdir / 'resources'
    outdir = tmp_path
    cli_args = ['-m', MODELS_LAYOUT, '-di', str(indir), '-o', str(outdir)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(layout_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    # One 'Job done in' message per input, plus one overall summary message.
    assert sum(1 for message in messages if message.startswith('Job done in')) == 2
    assert any(message.startswith('All jobs done in') for message in messages)
    assert len(list(outdir.iterdir())) == 2
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["--no-patches"],
    ], ids=str)
def test_run_eynollah_binarization_filename(tmp_path, pytestconfig, caplog, options):
    """Binarize a single image via the CLI and check the output keeps the input's pixel size."""
    infile = testdir / 'resources/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.png'
    cli_args = ['-m', MODELS_BIN, '-i', str(infile), '-o', str(outfile)]
    # Forward pytest's verbosity to the CLI as debug logging.
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(binarization_cli, [*cli_args, *options], catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    assert any(message.startswith('Predicting') for message in messages)
    assert outfile.exists()
    with Image.open(infile) as original_img:
        original_size = original_img.size
    with Image.open(outfile) as binarized_img:
        binarized_size = binarized_img.size
    assert original_size == binarized_size
|
|
||||||
|
|
||||||
def test_run_eynollah_binarization_directory(tmp_path, pytestconfig, caplog):
    """Binarize the whole resources directory and expect two predictions and two output files."""
    indir = testdir / 'resources'
    outdir = tmp_path
    cli_args = ['-m', MODELS_BIN, '-di', str(indir), '-o', str(outdir)]
    # Forward pytest's verbosity to the CLI as debug logging.
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(binarization_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    assert sum(1 for message in messages if message.startswith('Predicting')) == 2
    assert len(list(outdir.iterdir())) == 2
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["-sos"],
    ], ids=str)
def test_run_eynollah_enhancement_filename(tmp_path, pytestconfig, caplog, options):
    """Enhance a single image via the CLI; the size must be kept exactly when '-sos' is given."""
    infile = testdir / 'resources/kant_aufklaerung_1784_0020.tif'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.png'
    # The CLI receives the output directory, not the output file.
    cli_args = ['-m', MODELS_LAYOUT, '-i', str(infile), '-o', str(outfile.parent)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(enhancement_cli, [*cli_args, *options], catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    assert any(message.startswith('Image was enhanced') for message in messages), messages
    assert outfile.exists()
    with Image.open(infile) as original_img:
        original_size = original_img.size
    with Image.open(outfile) as enhanced_img:
        enhanced_size = enhanced_img.size
    size_kept = original_size == enhanced_size
    assert size_kept == ("-sos" in options)
|
|
||||||
|
|
||||||
def test_run_eynollah_enhancement_directory(tmp_path, pytestconfig, caplog):
    """Enhance the whole resources directory and expect two enhancement messages and two outputs."""
    indir = testdir / 'resources'
    outdir = tmp_path
    cli_args = ['-m', MODELS_LAYOUT, '-di', str(indir), '-o', str(outdir)]
    # Forward pytest's verbosity to the CLI as debug logging.
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(enhancement_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    messages = [record.message for record in caplog.records]
    assert sum(1 for message in messages if message.startswith('Image was enhanced')) == 2
    assert len(list(outdir.iterdir())) == 2
|
|
||||||
|
|
||||||
def test_run_eynollah_mbreorder_filename(tmp_path, pytestconfig, caplog):
    """Run machine-based reading order on one PAGE-XML file and check the resulting region order."""
    infile = testdir / 'resources/kant_aufklaerung_1784_0020.xml'
    outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
    # The CLI receives the output directory, not the output file.
    cli_args = ['-m', MODELS_LAYOUT, '-i', str(infile), '-o', str(outfile.parent)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(mbreorder_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # FIXME: mbreorder has no logging, so there is no message to assert on yet.
    messages = [record.message for record in caplog.records]
    assert outfile.exists()
    page_tree = page_from_file(str(outfile)).etree
    region_refs = page_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
    assert region_refs == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
|
|
||||||
|
|
||||||
def test_run_eynollah_mbreorder_directory(tmp_path, pytestconfig, caplog):
    """Run machine-based reading order on the resources directory and expect two output files."""
    indir = testdir / 'resources'
    outdir = tmp_path
    cli_args = ['-m', MODELS_LAYOUT, '-di', str(indir), '-o', str(outdir)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(mbreorder_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # FIXME: mbreorder has no logging, so there is no message to assert on yet.
    messages = [record.message for record in caplog.records]
    assert len(list(outdir.iterdir())) == 2
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "options",
    [
        [],  # defaults
        ["-doit"],  # the render directory is inserted inside the test
        ["-trocr"],
    ], ids=str)
def test_run_eynollah_ocr_filename(tmp_path, pytestconfig, caplog, options):
    """Run OCR on a single image via the CLI and check that a plausible amount of text is written.

    With '-doit' the test additionally passes a render directory and expects
    a rendered PNG to be produced there.
    """
    infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
    outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
    outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png')
    outrenderfile.parent.mkdir()
    args = [
        '-m', MODELS_OCR,
        '-i', str(infile),
        '-dx', str(infile.parent),
        '-o', str(outfile.parent),
    ]
    if pytestconfig.getoption('verbose') > 0:
        args.extend(['-l', 'DEBUG'])
    caplog.set_level(logging.DEBUG)
    runner = CliRunner()
    # pytest hands parametrized values to the test without copying them, so
    # inserting into `options` in place would leak into any later run of the
    # same parameter set. Work on a private copy instead.
    options = list(options)
    if "-doit" in options:
        options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
    with caplog.filtering(only_eynollah):
        result = runner.invoke(ocr_cli, args + options, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # FIXME: ocr has no logging, so there is no message to assert on yet.
    assert outfile.exists()
    if "-doit" in options:
        assert outrenderfile.exists()
    out_tree = page_from_file(str(outfile)).etree
    out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
    assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
    assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
|
|
||||||
|
|
||||||
def test_run_eynollah_ocr_directory(tmp_path, pytestconfig, caplog):
    """Run OCR on the resources directory (images and XML) and expect two output files."""
    indir = testdir / 'resources'
    outdir = tmp_path
    # The same directory serves as image input (-di) and XML input (-dx).
    cli_args = ['-m', MODELS_OCR, '-di', str(indir), '-dx', str(indir), '-o', str(outdir)]
    if pytestconfig.getoption('verbose') > 0:
        cli_args += ['-l', 'DEBUG']
    caplog.set_level(logging.INFO)
    cli = CliRunner()
    with caplog.filtering(only_eynollah):
        result = cli.invoke(ocr_cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0, result.stdout
    # FIXME: ocr has no logging, so there is no message to assert on yet.
    messages = [record.message for record in caplog.records]
    assert len(list(outdir.iterdir())) == 2
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue