Merge remote-tracking branch 'bertsky/mbro_dead_code-plus-fixes-plus-tests' into prepare-release-v0.5.0

kba 2025-09-25 20:05:03 +02:00
commit 5e15c4f248
11 changed files with 4190 additions and 273 deletions


@@ -82,13 +82,21 @@ smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif
eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/models_eynollah
fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(<F)).xml
fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$(basename $(<F)).xml
# directory mode (skip one, add one):
# layout, directory mode (skip one, add one):
eynollah layout -di $(<D) -o $(TMPDIR) -m $(CURDIR)/models_eynollah
test -s $(TMPDIR)/euler_rechenkunst01_1738_0025.xml
# mbreorder, directory mode (overwrite):
eynollah machine-based-reading-order -di $(<D) -o $(TMPDIR) -m $(CURDIR)/models_eynollah
fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(<F)).xml
fgrep -c -e RegionRefIndexed $(TMPDIR)/$(basename $(<F)).xml
# binarize:
eynollah binarization -m $(CURDIR)/default-2021-03-09 -i $< -o $(TMPDIR)/$(<F)
test -s $(TMPDIR)/$(<F)
@set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))"
# enhance:
eynollah enhancement -m $(CURDIR)/models_eynollah -sos -i $< -o $(TMPDIR) -O
test -s $(TMPDIR)/$(<F)
@set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))"
$(RM) -r $(TMPDIR)
ocrd-test: export OCRD_MISSING_OUTPUT := ABORT


@@ -13,22 +13,23 @@ def main():
@main.command()
@click.option(
"--dir_xml",
"-dx",
help="directory of page-xml files",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--xml_file",
"-xml",
help="xml filename",
"--input",
"-i",
help="PAGE-XML input filename",
type=click.Path(exists=True, dir_okay=False),
)
@click.option(
"--dir_out",
"-do",
"--dir_in",
"-di",
help="directory of PAGE-XML input files (instead of --input)",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--out",
"-o",
help="directory for output images",
type=click.Path(exists=True, file_okay=False),
required=True,
)
@click.option(
"--model",
@@ -37,24 +38,38 @@ def main():
type=click.Path(exists=True, file_okay=False),
required=True,
)
@click.option(
"--log_level",
"-l",
type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']),
help="Override log level globally to this",
)
def machine_based_reading_order(dir_xml, xml_file, dir_out, model):
raedingorder_object = machine_based_reading_order_on_layout(model, dir_out=dir_out, logger=getLogger('enhancement'))
def machine_based_reading_order(input, dir_in, out, model, log_level):
assert bool(input) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
orderer = machine_based_reading_order_on_layout(model, dir_out=out)
if log_level:
orderer.logger.setLevel(getLevelName(log_level))
if dir_xml:
raedingorder_object.run(dir_in=dir_xml)
if dir_in:
orderer.run(dir_in=dir_in)
else:
raedingorder_object.run(xml_filename=xml_file)
orderer.run(xml_filename=input)
@main.command()
@click.option('--patches/--no-patches', default=True, help='by enabling this parameter you let the model to see the image in patches.')
@click.option('--model_dir', '-m', type=click.Path(exists=True, file_okay=False), required=True, help='directory containing models for prediction')
@click.option("--input-image", "-i", help="input image", type=click.Path(exists=True, dir_okay=False))
@click.option(
"--input-image", "--image",
"-i",
help="input image filename",
type=click.Path(exists=True, dir_okay=False)
)
@click.option(
"--dir_in",
"-di",
help="directory of input images",
help="directory of input images (instead of --image)",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
@@ -62,25 +77,34 @@ def machine_based_reading_order(dir_xml, xml_file, dir_out, model):
"-o",
help="output image (if using -i) or output image directory (if using -di)",
type=click.Path(file_okay=True, dir_okay=True),
required=True,
)
def binarization(patches, model_dir, input_image, dir_in, output):
assert (dir_in is None) != (input_image is None), "Specify either -di and or -i not both"
SbbBinarizer(model_dir).run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in)
@click.option(
"--log_level",
"-l",
type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']),
help="Override log level globally to this",
)
def binarization(patches, model_dir, input_image, dir_in, output, log_level):
assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
binarizer = SbbBinarizer(model_dir)
if log_level:
binarizer.log.setLevel(getLevelName(log_level))
binarizer.run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in)
@main.command()
@click.option(
"--image",
"-i",
help="image filename",
help="input image filename",
type=click.Path(exists=True, dir_okay=False),
)
@click.option(
"--out",
"-o",
help="directory to write output xml data",
help="directory for output PAGE-XML files",
type=click.Path(exists=True, file_okay=False),
required=True,
)
@@ -93,7 +117,7 @@ def binarization(patches, model_dir, input_image, dir_in, output):
@click.option(
"--dir_in",
"-di",
help="directory of images",
help="directory of input images (instead of --image)",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
@@ -128,35 +152,34 @@ def binarization(patches, model_dir, input_image, dir_in, output):
)
def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_lower, save_org_scale, log_level):
assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
initLogging()
if log_level:
getLogger('enhancement').setLevel(getLevelName(log_level))
assert image or dir_in, "Either a single image -i or a dir_in -di is required"
enhancer_object = Enhancer(
enhancer = Enhancer(
model,
logger=getLogger('enhancement'),
dir_out=out,
num_col_upper=num_col_upper,
num_col_lower=num_col_lower,
save_org_scale=save_org_scale,
)
if log_level:
enhancer.logger.setLevel(getLevelName(log_level))
if dir_in:
enhancer_object.run(dir_in=dir_in, overwrite=overwrite)
enhancer.run(dir_in=dir_in, overwrite=overwrite)
else:
enhancer_object.run(image_filename=image, overwrite=overwrite)
enhancer.run(image_filename=image, overwrite=overwrite)
@main.command()
@click.option(
"--image",
"-i",
help="image filename",
help="input image filename",
type=click.Path(exists=True, dir_okay=False),
)
@click.option(
"--out",
"-o",
help="directory to write output xml data",
help="directory for output PAGE-XML files",
type=click.Path(exists=True, file_okay=False),
required=True,
)
@@ -169,7 +192,7 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low
@click.option(
"--dir_in",
"-di",
help="directory of images",
help="directory of input images (instead of --image)",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
@@ -360,8 +383,6 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
getLogger('eynollah').setLevel(logging.INFO)
else:
initLogging()
if log_level:
getLogger('eynollah').setLevel(getLevelName(log_level))
assert enable_plotting or not save_layout, "Plotting with -sl also requires -ep"
assert enable_plotting or not save_deskewed, "Plotting with -sd also requires -ep"
assert enable_plotting or not save_all, "Plotting with -sa also requires -ep"
@@ -380,16 +401,10 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
assert not extract_only_images or not tables, "Image extraction -eoi can not be set alongside tables -tab"
assert not extract_only_images or not right2left, "Image extraction -eoi can not be set alongside right2left -r2l"
assert not extract_only_images or not headers_off, "Image extraction -eoi can not be set alongside headers_off -ho"
assert image or dir_in, "Either a single image -i or a dir_in -di is required"
assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
eynollah = Eynollah(
model,
dir_out=out,
dir_of_cropped_images=save_images,
extract_only_images=extract_only_images,
dir_of_layout=save_layout,
dir_of_deskewed=save_deskewed,
dir_of_all=save_all,
dir_save_page=save_page,
enable_plotting=enable_plotting,
allow_enhancement=allow_enhancement,
curved_line=curved_line,
@@ -412,56 +427,64 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
threshold_art_class_textline=threshold_art_class_textline,
threshold_art_class_layout=threshold_art_class_layout,
)
if dir_in:
eynollah.run(dir_in=dir_in, overwrite=overwrite)
else:
eynollah.run(image_filename=image, overwrite=overwrite)
if log_level:
eynollah.logger.setLevel(getLevelName(log_level))
eynollah.run(overwrite=overwrite,
image_filename=image,
dir_in=dir_in,
dir_out=out,
dir_of_cropped_images=save_images,
dir_of_layout=save_layout,
dir_of_deskewed=save_deskewed,
dir_of_all=save_all,
dir_save_page=save_page,
)
@main.command()
@click.option(
"--image",
"-i",
help="image filename",
help="input image filename",
type=click.Path(exists=True, dir_okay=False),
)
@click.option(
"--dir_in",
"-di",
help="directory of input images (instead of --image)",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--dir_in_bin",
"-dib",
help="directory of binarized images (in addition to --dir_in for RGB images; filename stems must match the RGB image files, with '.png' suffix).\nPerform prediction using both RGB and binary images. (This does not necessarily improve results, however it may be beneficial for certain document images.)",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--dir_xmls",
"-dx",
help="directory of input PAGE-XML files (in addition to --dir_in; filename stems must match the image files, with '.xml' suffix).",
type=click.Path(exists=True, file_okay=False),
required=True,
)
@click.option(
"--out",
"-o",
help="directory for output PAGE-XML files",
type=click.Path(exists=True, file_okay=False),
required=True,
)
@click.option(
"--dir_out_image_text",
"-doit",
help="directory for output images, newly rendered with predicted text",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--overwrite",
"-O",
help="overwrite (instead of skipping) if output xml exists",
is_flag=True,
)
@click.option(
"--dir_in",
"-di",
help="directory of images",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--dir_in_bin",
"-dib",
help="directory of binarized images. This should be given if you want to do prediction based on both rgb and bin images. And all bin images are png files",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--out",
"-o",
help="directory to write output xml data",
type=click.Path(exists=True, file_okay=False),
required=True,
)
@click.option(
"--dir_xmls",
"-dx",
help="directory of xmls",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--dir_out_image_text",
"-doit",
help="directory of images with predicted text",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--model",
"-m",
@@ -491,12 +514,6 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
is_flag=True,
help="if this parameter set to true, cropped textline images will not be masked with textline contour.",
)
@click.option(
"--prediction_with_both_of_rgb_and_bin",
"-brb/-nbrb",
is_flag=True,
help="If this parameter is set to True, the prediction will be performed using both RGB and binary images. However, this does not necessarily improve results; it may be beneficial for certain document images.",
)
@click.option(
"--batch_size",
"-bs",
@@ -519,37 +536,36 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
help="Override log level globally to this",
)
def ocr(image, overwrite, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, model_name, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, min_conf_value_of_textline_text, log_level):
def ocr(image, dir_in, dir_in_bin, dir_xmls, out, dir_out_image_text, overwrite, model, model_name, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, batch_size, dataset_abbrevation, min_conf_value_of_textline_text, log_level):
initLogging()
if log_level:
getLogger('eynollah').setLevel(getLevelName(log_level))
assert not model or not model_name, "model directory -m can not be set alongside specific model name --model_name"
assert bool(model) != bool(model_name), "Either -m (model directory) or --model_name (specific model name) must be provided."
assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text -etit can not be set alongside transformer ocr -tr_ocr"
assert not export_textline_images_and_text or not model, "Exporting textline and text -etit can not be set alongside model -m"
assert not export_textline_images_and_text or not batch_size, "Exporting textline and text -etit can not be set alongside batch size -bs"
assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text -etit can not be set alongside directory of bin images -dib"
assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit"
assert not export_textline_images_and_text or not prediction_with_both_of_rgb_and_bin, "Exporting textline and text -etit can not be set alongside prediction with both rgb and bin -brb"
assert (bool(image) ^ bool(dir_in)), "Either -i (single image) or -di (directory) must be provided, but not both."
assert bool(image) != bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both."
eynollah_ocr = Eynollah_ocr(
image_filename=image,
dir_xmls=dir_xmls,
dir_out_image_text=dir_out_image_text,
dir_in=dir_in,
dir_in_bin=dir_in_bin,
dir_out=out,
dir_models=model,
model_name=model_name,
tr_ocr=tr_ocr,
export_textline_images_and_text=export_textline_images_and_text,
do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
prediction_with_both_of_rgb_and_bin=prediction_with_both_of_rgb_and_bin,
batch_size=batch_size,
pref_of_dataset=dataset_abbrevation,
min_conf_value_of_textline_text=min_conf_value_of_textline_text,
)
eynollah_ocr.run(overwrite=overwrite)
if log_level:
eynollah_ocr.logger.setLevel(getLevelName(log_level))
eynollah_ocr.run(overwrite=overwrite,
dir_in=dir_in,
dir_in_bin=dir_in_bin,
image_filename=image,
dir_xmls=dir_xmls,
dir_out_image_text=dir_out_image_text,
dir_out=out,
)
if __name__ == "__main__":
main()
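
Note: the refactored subcommands above all share the same input-selection and logging pattern. The following is a minimal, self-contained sketch of that pattern, not eynollah code; the command name, logger name, and echo messages are illustrative.

import click
from logging import getLevelName, getLogger

@click.command()
@click.option("--input", "-i", help="single input file",
              type=click.Path(exists=True, dir_okay=False))
@click.option("--dir_in", "-di", help="directory of input files (instead of --input)",
              type=click.Path(exists=True, file_okay=False))
@click.option("--out", "-o", help="output directory", required=True,
              type=click.Path(exists=True, file_okay=False))
@click.option("--log_level", "-l", help="Override log level globally to this",
              type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']))
def process(input, dir_in, out, log_level):
    # exactly one of -i / -di must be given (XOR on truthiness), as in the commands above
    assert bool(input) != bool(dir_in), \
        "Either -i (single input) or -di (directory) must be provided, but not both."
    logger = getLogger('example')
    if log_level:
        # the chosen level overrides the processor's logger, mirroring the CLI above
        logger.setLevel(getLevelName(log_level))
    if dir_in:
        click.echo(f"processing all files in {dir_in} into {out}")
    else:
        click.echo(f"processing {input} into {out}")

if __name__ == "__main__":
    process()

Keeping the XOR assertion inside each command function (rather than encoding it in click) is what lets the error message stay identical across subcommands.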


@@ -107,6 +107,7 @@ from .utils.drop_capitals import (
from .utils.marginals import get_marginals
from .utils.resize import resize_image
from .utils import (
is_image_filename,
boosting_headers_by_longshot_region_segmentation,
crop_image_inside_box,
find_num_col,
@@ -190,13 +191,7 @@ class Eynollah:
def __init__(
self,
dir_models : str,
dir_out : Optional[str] = None,
dir_of_cropped_images : Optional[str] = None,
extract_only_images : bool =False,
dir_of_layout : Optional[str] = None,
dir_of_deskewed : Optional[str] = None,
dir_of_all : Optional[str] = None,
dir_save_page : Optional[str] = None,
enable_plotting : bool = False,
allow_enhancement : bool = False,
curved_line : bool = False,
@@ -220,18 +215,12 @@ class Eynollah:
skip_layout_and_reading_order : bool = False,
):
self.logger = getLogger('eynollah')
self.plotter = None
if skip_layout_and_reading_order:
textline_light = True
self.light_version = light_version
self.dir_out = dir_out
self.dir_of_all = dir_of_all
self.dir_save_page = dir_save_page
self.reading_order_machine_based = reading_order_machine_based
self.dir_of_deskewed = dir_of_deskewed
self.dir_of_deskewed = dir_of_deskewed
self.dir_of_cropped_images=dir_of_cropped_images
self.dir_of_layout=dir_of_layout
self.enable_plotting = enable_plotting
self.allow_enhancement = allow_enhancement
self.curved_line = curved_line
@@ -422,21 +411,11 @@ class Eynollah:
if dpi is not None:
self.dpi = dpi
def reset_file_name_dir(self, image_filename):
def reset_file_name_dir(self, image_filename, dir_out):
t_c = time.time()
self.cache_images(image_filename=image_filename)
self.plotter = None if not self.enable_plotting else EynollahPlotter(
dir_out=self.dir_out,
dir_of_all=self.dir_of_all,
dir_save_page=self.dir_save_page,
dir_of_deskewed=self.dir_of_deskewed,
dir_of_cropped_images=self.dir_of_cropped_images,
dir_of_layout=self.dir_of_layout,
image_filename_stem=Path(Path(image_filename).name).stem)
self.writer = EynollahXmlWriter(
dir_out=self.dir_out,
dir_out=dir_out,
image_filename=image_filename,
curved_line=self.curved_line,
textline_light = self.textline_light)
@@ -4602,7 +4581,17 @@ class Eynollah:
return ordered_left_marginals, ordered_right_marginals, ordered_left_marginals_textline, ordered_right_marginals_textline, ordered_left_marginals_bbox, ordered_right_marginals_bbox, ordered_left_slopes_marginals, ordered_right_slopes_marginals
def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False):
def run(self,
overwrite: bool = False,
image_filename: Optional[str] = None,
dir_in: Optional[str] = None,
dir_out: Optional[str] = None,
dir_of_cropped_images: Optional[str] = None,
dir_of_layout: Optional[str] = None,
dir_of_deskewed: Optional[str] = None,
dir_of_all: Optional[str] = None,
dir_save_page: Optional[str] = None,
):
"""
Get image and scales, then extract the page of scanned image
"""
@@ -4623,20 +4612,37 @@ class Eynollah:
enabled_modes.append("Table detection")
if enabled_modes:
self.logger.info("Enabled modes: " + ", ".join(enabled_modes))
if self.enable_plotting:
self.logger.info("Saving debug plots")
if dir_of_cropped_images:
self.logger.info(f"Saving cropped images to: {dir_of_cropped_images}")
if dir_of_layout:
self.logger.info(f"Saving layout plots to: {dir_of_layout}")
if dir_of_deskewed:
self.logger.info(f"Saving deskewed images to: {dir_of_deskewed}")
if dir_in:
self.ls_imgs = os.listdir(dir_in)
self.ls_imgs = [ind_img for ind_img in self.ls_imgs if ind_img.endswith('.jpg') or ind_img.endswith('.jpeg') or ind_img.endswith('.png') or ind_img.endswith('.tif') or ind_img.endswith('.tiff') or ind_img.endswith('.JPG') or ind_img.endswith('.JPEG') or ind_img.endswith('.TIF') or ind_img.endswith('.TIFF') or ind_img.endswith('.PNG')]
ls_imgs = [os.path.join(dir_in, image_filename)
for image_filename in filter(is_image_filename,
os.listdir(dir_in))]
elif image_filename:
self.ls_imgs = [image_filename]
ls_imgs = [image_filename]
else:
raise ValueError("run requires either a single image filename or a directory")
for img_filename in self.ls_imgs:
for img_filename in ls_imgs:
self.logger.info(img_filename)
t0 = time.time()
self.reset_file_name_dir(os.path.join(dir_in or "", img_filename))
self.reset_file_name_dir(img_filename, dir_out)
if self.enable_plotting:
self.plotter = EynollahPlotter(dir_out=dir_out,
dir_of_all=dir_of_all,
dir_save_page=dir_save_page,
dir_of_deskewed=dir_of_deskewed,
dir_of_cropped_images=dir_of_cropped_images,
dir_of_layout=dir_of_layout,
image_filename_stem=Path(image_filename).stem)
#print("text region early -11 in %.1fs", time.time() - t0)
if os.path.exists(self.writer.output_filename):
if overwrite:
@@ -5236,19 +5242,6 @@ class Eynollah:
self.logger.info("Step 5/5: Output Generation")
output_config = []
if self.enable_plotting:
output_config.append("Saving debug plots")
if self.dir_of_cropped_images:
output_config.append(f"Saving cropped images to: {self.dir_of_cropped_images}")
if self.dir_of_layout:
output_config.append(f"Saving layout plots to: {self.dir_of_layout}")
if self.dir_of_deskewed:
output_config.append(f"Saving deskewed images to: {self.dir_of_deskewed}")
if output_config:
self.logger.info("Output configuration:\n * %s", "\n * ".join(output_config))
pcgts = self.writer.build_pagexml_full_layout(
contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot,
all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h,
@@ -5368,21 +5361,8 @@ class Eynollah:
self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s")
self.logger.info("Step 5/5: Output Generation")
self.logger.info("Generating PAGE-XML output")
if self.enable_plotting:
self.logger.info("Saving debug plots")
if self.dir_of_cropped_images:
self.logger.info(f"Saving cropped images to: {self.dir_of_cropped_images}")
if self.dir_of_layout:
self.logger.info(f"Saving layout plots to: {self.dir_of_layout}")
if self.dir_of_deskewed:
self.logger.info(f"Saving deskewed images to: {self.dir_of_deskewed}")
pcgts = self.writer.build_pagexml_no_full_layout(
txt_con_org, page_coord, order_text_new, id_of_texts_tot,
all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right,
@@ -5400,32 +5380,19 @@ class Eynollah_ocr:
dir_models,
model_name=None,
dir_xmls=None,
dir_in=None,
image_filename=None,
dir_in_bin=None,
dir_out=None,
dir_out_image_text=None,
tr_ocr=False,
batch_size=None,
export_textline_images_and_text=False,
do_not_mask_with_textline_contour=False,
prediction_with_both_of_rgb_and_bin=False,
pref_of_dataset=None,
min_conf_value_of_textline_text : Optional[float]=None,
logger=None,
):
self.dir_in = dir_in
self.image_filename = image_filename
self.dir_in_bin = dir_in_bin
self.dir_out = dir_out
self.dir_xmls = dir_xmls
self.dir_models = dir_models
self.model_name = model_name
self.tr_ocr = tr_ocr
self.export_textline_images_and_text = export_textline_images_and_text
self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
self.dir_out_image_text = dir_out_image_text
self.prediction_with_both_of_rgb_and_bin = prediction_with_both_of_rgb_and_bin
self.pref_of_dataset = pref_of_dataset
self.logger = logger if logger else getLogger('eynollah')
@@ -5477,24 +5444,27 @@ class Eynollah_ocr:
)
self.end_character = len(characters) + 2
def run(self, overwrite : bool = False):
if self.dir_in:
ls_imgs = os.listdir(self.dir_in)
ls_imgs = [ind_img for ind_img in ls_imgs if ind_img.endswith('.jpg') or ind_img.endswith('.jpeg') or ind_img.endswith('.png') or ind_img.endswith('.tif') or ind_img.endswith('.tiff') or ind_img.endswith('.JPG') or ind_img.endswith('.JPEG') or ind_img.endswith('.TIF') or ind_img.endswith('.TIFF') or ind_img.endswith('.PNG')]
def run(self, overwrite: bool = False,
dir_in: Optional[str] = None,
dir_in_bin: Optional[str] = None,
image_filename: Optional[str] = None,
dir_xmls: Optional[str] = None,
dir_out_image_text: Optional[str] = None,
dir_out: Optional[str] = None,
):
if dir_in:
ls_imgs = [os.path.join(dir_in, image_filename)
for image_filename in filter(is_image_filename,
os.listdir(dir_in))]
else:
ls_imgs = [self.image_filename]
ls_imgs = [image_filename]
if self.tr_ocr:
tr_ocr_input_height_and_width = 384
for ind_img in ls_imgs:
if self.dir_in:
file_name = Path(ind_img).stem
dir_img = os.path.join(self.dir_in, ind_img)
else:
file_name = Path(self.image_filename).stem
dir_img = self.image_filename
dir_xml = os.path.join(self.dir_xmls, file_name+'.xml')
out_file_ocr = os.path.join(self.dir_out, file_name+'.xml')
for dir_img in ls_imgs:
file_name = Path(dir_img).stem
dir_xml = os.path.join(dir_xmls, file_name+'.xml')
out_file_ocr = os.path.join(dir_out, file_name+'.xml')
if os.path.exists(out_file_ocr):
if overwrite:
@@ -5505,8 +5475,8 @@ class Eynollah_ocr:
img = cv2.imread(dir_img)
if self.dir_out_image_text:
out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png')
if dir_out_image_text:
out_image_with_text = os.path.join(dir_out_image_text, file_name+'.png')
image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white")
draw = ImageDraw.Draw(image_text)
total_bb_coordinates = []
@@ -5544,7 +5514,7 @@ class Eynollah_ocr:
textline_coords = np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] )
x,y,w,h = cv2.boundingRect(textline_coords)
if self.dir_out_image_text:
if dir_out_image_text:
total_bb_coordinates.append([x,y,w,h])
h2w_ratio = h/float(w)
@@ -5557,7 +5527,7 @@ class Eynollah_ocr:
img_crop = img_poly_on_img[y:y+h, x:x+w, :]
img_crop[mask_poly==0] = 255
self.logger.debug("processing %d lines for '%s'", len(cropped_lines), nn.attrib['id'])
if h2w_ratio > 0.1:
cropped_lines.append(resize_image(img_crop, tr_ocr_input_height_and_width, tr_ocr_input_height_and_width) )
cropped_lines_meging_indexing.append(0)
@@ -5666,7 +5636,7 @@ class Eynollah_ocr:
unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
if self.dir_out_image_text:
if dir_out_image_text:
font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists!
font = ImageFont.truetype(font_path, 40)
@@ -5794,18 +5764,10 @@ class Eynollah_ocr:
img_size=(image_width, image_height)
for ind_img in ls_imgs:
if self.dir_in:
file_name = Path(ind_img).stem
dir_img = os.path.join(self.dir_in, ind_img)
else:
file_name = Path(self.image_filename).stem
dir_img = self.image_filename
#file_name = Path(ind_img).stem
#dir_img = os.path.join(self.dir_in, ind_img)
dir_xml = os.path.join(self.dir_xmls, file_name+'.xml')
out_file_ocr = os.path.join(self.dir_out, file_name+'.xml')
for dir_img in ls_imgs:
file_name = Path(dir_img).stem
dir_xml = os.path.join(dir_xmls, file_name+'.xml')
out_file_ocr = os.path.join(dir_out, file_name+'.xml')
if os.path.exists(out_file_ocr):
if overwrite:
@@ -5815,13 +5777,13 @@ class Eynollah_ocr:
continue
img = cv2.imread(dir_img)
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
cropped_lines_bin = []
dir_img_bin = os.path.join(self.dir_in_bin, file_name+'.png')
dir_img_bin = os.path.join(dir_in_bin, file_name+'.png')
img_bin = cv2.imread(dir_img_bin)
if self.dir_out_image_text:
out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png')
if dir_out_image_text:
out_image_with_text = os.path.join(dir_out_image_text, file_name+'.png')
image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white")
draw = ImageDraw.Draw(image_text)
total_bb_coordinates = []
@@ -5865,13 +5827,13 @@ class Eynollah_ocr:
if type_textregion=='drop-capital':
angle_degrees = 0
if self.dir_out_image_text:
if dir_out_image_text:
total_bb_coordinates.append([x,y,w,h])
w_scaled = w * image_height/float(h)
img_poly_on_img = np.copy(img)
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
img_poly_on_img_bin = np.copy(img_bin)
img_crop_bin = img_poly_on_img_bin[y:y+h, x:x+w, :]
@@ -5894,7 +5856,7 @@ class Eynollah_ocr:
img_crop = rotate_image_with_padding(img_crop, better_des_slope )
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope )
mask_poly = rotate_image_with_padding(mask_poly, better_des_slope )
@@ -5909,13 +5871,13 @@ class Eynollah_ocr:
if not self.do_not_mask_with_textline_contour:
img_crop[mask_poly==0] = 255
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :]
if not self.do_not_mask_with_textline_contour:
img_crop_bin[mask_poly==0] = 255
if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90:
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin)
else:
img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly)
@@ -5925,14 +5887,14 @@ class Eynollah_ocr:
better_des_slope = 0
if not self.do_not_mask_with_textline_contour:
img_crop[mask_poly==0] = 255
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
if not self.do_not_mask_with_textline_contour:
img_crop_bin[mask_poly==0] = 255
if type_textregion=='drop-capital':
pass
else:
if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90:
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin)
else:
img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly)
@@ -5947,14 +5909,12 @@ class Eynollah_ocr:
cropped_lines_ver_index.append(0)
cropped_lines_meging_indexing.append(0)
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width)
cropped_lines_bin.append(img_fin)
else:
if self.prediction_with_both_of_rgb_and_bin:
splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, img_crop_bin, prediction_with_both_of_rgb_and_bin=self.prediction_with_both_of_rgb_and_bin)
else:
splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None)
splited_images, splited_images_bin = return_textlines_split_if_needed(
img_crop, img_crop_bin if dir_in_bin is not None else None)
if splited_images:
img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width)
cropped_lines.append(img_fin)
@@ -5975,7 +5935,7 @@ class Eynollah_ocr:
else:
cropped_lines_ver_index.append(0)
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[0], image_height, image_width)
cropped_lines_bin.append(img_fin)
img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[1], image_height, image_width)
@@ -5991,7 +5951,7 @@ class Eynollah_ocr:
else:
cropped_lines_ver_index.append(0)
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width)
cropped_lines_bin.append(img_fin)
@@ -6004,29 +5964,15 @@ class Eynollah_ocr:
if cheild_text.tag.endswith("Unicode"):
textline_text = cheild_text.text
if textline_text:
if self.do_not_mask_with_textline_contour:
if self.pref_of_dataset:
with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.txt'), 'w') as text_file:
text_file.write(textline_text)
cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.png'), img_crop )
else:
with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file:
text_file.write(textline_text)
cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop )
else:
if self.pref_of_dataset:
with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.txt'), 'w') as text_file:
text_file.write(textline_text)
cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.png'), img_crop )
else:
with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.txt'), 'w') as text_file:
text_file.write(textline_text)
cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.png'), img_crop )
base_name = os.path.join(dir_out, file_name + '_line_' + str(indexer_textlines))
if self.pref_of_dataset:
base_name += '_' + self.pref_of_dataset
if not self.do_not_mask_with_textline_contour:
base_name += '_masked'
with open(base_name + '.txt', 'w') as text_file:
text_file.write(textline_text)
cv2.imwrite(base_name + '.png', img_crop)
indexer_textlines+=1
if not self.export_textline_images_and_text:
@@ -6057,7 +6003,7 @@ class Eynollah_ocr:
else:
imgs_ver_flipped = None
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
imgs_bin = cropped_lines_bin[n_start:]
imgs_bin = np.array(imgs_bin)
imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3)
@@ -6087,7 +6033,7 @@ class Eynollah_ocr:
imgs_ver_flipped = None
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
imgs_bin = cropped_lines_bin[n_start:n_end]
imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3)
@@ -6100,6 +6046,7 @@ class Eynollah_ocr:
imgs_bin_ver_flipped = None
self.logger.debug("processing next %d lines", len(imgs))
preds = self.prediction_model.predict(imgs, verbose=0)
if len(indices_ver)>0:
@@ -6126,7 +6073,7 @@ class Eynollah_ocr:
if len(indices_where_flipped_conf_value_is_higher)>0:
indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
preds[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
preds_bin = self.prediction_model.predict(imgs_bin, verbose=0)
if len(indices_ver)>0:
@@ -6173,7 +6120,7 @@ class Eynollah_ocr:
extracted_texts.append("")
extracted_conf_value.append(0)
del cropped_lines
if self.prediction_with_both_of_rgb_and_bin:
if dir_in_bin is not None:
del cropped_lines_bin
gc.collect()
@@ -6186,7 +6133,7 @@ class Eynollah_ocr:
unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
if self.dir_out_image_text:
if dir_out_image_text:
font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists!
font = ImageFont.truetype(font_path, 40)
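
For library (non-CLI) callers, the practical effect of this refactor is that input, output, and plotting directories are now keyword arguments of run() instead of the constructors. A rough sketch of the new call sites follows; the paths are placeholders, the import path is assumed from the package layout, and only the keyword names are taken from this diff.

from eynollah.eynollah import Eynollah, Eynollah_ocr  # assumed import path

layout = Eynollah("models_eynollah", enable_plotting=True)
layout.run(
    overwrite=True,
    dir_in="pages/",                  # or image_filename="page.tif" for a single file
    dir_out="out/",
    dir_of_layout="out/layout/",      # plot targets are only used when plotting is enabled
    dir_of_deskewed="out/deskewed/",
)

ocr = Eynollah_ocr(dir_models="models_ocr")
ocr.run(
    overwrite=True,
    dir_in="pages/",
    dir_xmls="out/",                  # PAGE-XML inputs with matching filename stems
    dir_out="out_ocr/",
    dir_in_bin=None,                  # a directory of binarized twins enables the RGB+bin path
)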


@@ -21,6 +21,7 @@ from tensorflow.keras.models import load_model
from .utils.resize import resize_image
from .utils.pil_cv2 import pil2cv
from .utils import (
is_image_filename,
crop_image_inside_box
)
@@ -701,13 +702,13 @@ class Enhancer:
t0_tot = time.time()
if dir_in:
self.ls_imgs = os.listdir(dir_in)
ls_imgs = list(filter(is_image_filename, os.listdir(dir_in)))
elif image_filename:
self.ls_imgs = [image_filename]
ls_imgs = [image_filename]
else:
raise ValueError("run requires either a single image filename or a directory")
for img_filename in self.ls_imgs:
for img_filename in ls_imgs:
self.logger.info(img_filename)
t0 = time.time()


@@ -25,6 +25,7 @@ from .utils.contour import (
return_contours_of_image,
return_parent_contours,
)
from .utils import is_xml_filename
DPI_THRESHOLD = 298
KERNEL = np.ones((5, 5), np.uint8)
@@ -39,7 +40,7 @@ class machine_based_reading_order_on_layout:
):
self.dir_out = dir_out
self.logger = logger if logger else getLogger('mbro on layout')
self.logger = logger if logger else getLogger('mbreorder')
# for parallelization of CPU-intensive tasks:
self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200)
atexit.register(self.executor.shutdown)
@@ -751,13 +752,13 @@ class machine_based_reading_order_on_layout:
t0_tot = time.time()
if dir_in:
self.ls_xmls = os.listdir(dir_in)
ls_xmls = list(filter(is_xml_filename, os.listdir(dir_in)))
elif xml_filename:
self.ls_xmls = [xml_filename]
ls_xmls = [xml_filename]
else:
raise ValueError("run requires either a single image filename or a directory")
for xml_filename in self.ls_xmls:
for xml_filename in ls_xmls:
self.logger.info(xml_filename)
t0 = time.time()
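
The reading-order processor follows the same pattern when driven directly rather than through the CLI; a minimal sketch, assuming the module path the CLI imports from (paths are placeholders):

from eynollah.mb_ro_on_layout import machine_based_reading_order_on_layout  # assumed module path

orderer = machine_based_reading_order_on_layout("models_eynollah", dir_out="out/")
orderer.run(dir_in="page_xml/")   # or orderer.run(xml_filename="page.xml")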


@@ -16,6 +16,7 @@ import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.python.keras import backend as tensorflow_backend
from .utils import is_image_filename
def resize_image(img_in, input_height, input_width):
return cv2.resize(img_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST)
@@ -347,7 +348,7 @@ class SbbBinarizer:
cv2.imwrite(output, img_last)
return img_last
else:
ls_imgs = os.listdir(dir_in)
ls_imgs = list(filter(is_image_filename, os.listdir(dir_in)))
for image_name in ls_imgs:
image_stem = image_name.split('.')[0]
print(image_name,'image_name')


@@ -2194,3 +2194,14 @@ def return_boxes_of_images_by_order_of_reading_new(
return boxes, peaks_neg_tot_tables_new
else:
return boxes, peaks_neg_tot_tables
def is_image_filename(fname: str) -> bool:
return fname.lower().endswith(('.jpg',
'.jpeg',
'.png',
'.tif',
'.tiff',
))
def is_xml_filename(fname: str) -> bool:
return fname.lower().endswith('.xml')
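
These helpers replace the long endswith() chains previously used when scanning input directories. A short illustration, assuming they are importable from eynollah.utils as the imports in the hunks above suggest; the directory names are placeholders.

import os
from eynollah.utils import is_image_filename, is_xml_filename

# keep only image files from an input directory, with full paths
images = [os.path.join("pages", name)
          for name in os.listdir("pages")
          if is_image_filename(name)]
# keep only PAGE-XML files
xmls = list(filter(is_xml_filename, os.listdir("page_xml")))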


@@ -109,13 +109,13 @@ def fit_text_single_line(draw, text, font_path, max_width, max_height):
return ImageFont.truetype(font_path, 10) # Smallest font fallback
def return_textlines_split_if_needed(textline_image, textline_image_bin, prediction_with_both_of_rgb_and_bin=False):
def return_textlines_split_if_needed(textline_image, textline_image_bin=None):
split_point = return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image)
if split_point:
image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height))
image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height))
if prediction_with_both_of_rgb_and_bin:
if textline_image_bin is not None:
image1_bin = textline_image_bin[:, :split_point,:]# image.crop((0, 0, width2, height))
image2_bin = textline_image_bin[:, split_point:,:]#image.crop((width1, 0, width, height))
return [image1, image2], [image1_bin, image2_bin]

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -2,7 +2,13 @@ from os import environ
from pathlib import Path
import logging
from PIL import Image
from eynollah.cli import layout as layout_cli, binarization as binarization_cli
from eynollah.cli import (
layout as layout_cli,
binarization as binarization_cli,
enhancement as enhancement_cli,
machine_based_reading_order as mbreorder_cli,
ocr as ocr_cli,
)
from click.testing import CliRunner
from ocrd_modelfactory import page_from_file
from ocrd_models.constants import NAMESPACES as NS
@@ -44,8 +50,7 @@ def test_run_eynollah_layout_filename(tmp_path, subtests, pytestconfig, caplog):
options=options):
with caplog.filtering(only_eynollah):
result = runner.invoke(layout_cli, args + options, catch_exceptions=False)
print(result)
assert result.exit_code == 0
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert str(infile) in logmsgs
assert outfile.exists()
@@ -72,9 +77,8 @@ def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog):
return logrec.name == 'eynollah'
runner = CliRunner()
with caplog.filtering(only_eynollah):
result = runner.invoke(layout_cli, args)
print(result)
assert result.exit_code == 0
result = runner.invoke(layout_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2
assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in'))
@@ -88,6 +92,8 @@ def test_run_eynollah_binarization_filename(tmp_path, subtests, pytestconfig, ca
'-i', str(infile),
'-o', str(outfile),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'SbbBinarizer'
@@ -99,9 +105,8 @@ def test_run_eynollah_binarization_filename(tmp_path, subtests, pytestconfig, ca
with subtests.test(#msg="test CLI",
options=options):
with caplog.filtering(only_eynollah):
result = runner.invoke(binarization_cli, args + options)
print(result)
assert result.exit_code == 0
result = runner.invoke(binarization_cli, args + options, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting'))
assert outfile.exists()
@@ -119,14 +124,186 @@ def test_run_eynollah_binarization_directory(tmp_path, subtests, pytestconfig, c
'-di', str(indir),
'-o', str(outdir),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'SbbBinarizer'
runner = CliRunner()
with caplog.filtering(only_eynollah):
result = runner.invoke(binarization_cli, args)
print(result)
assert result.exit_code == 0
result = runner.invoke(binarization_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2
assert len(list(outdir.iterdir())) == 2
def test_run_eynollah_enhancement_filename(tmp_path, subtests, pytestconfig, caplog):
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
args = [
'-m', EYNOLLAH_MODELS,
'-i', str(infile),
'-o', str(outfile.parent),
# subtests write to same location
'--overwrite',
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'enhancement'
runner = CliRunner()
for options in [
[], # defaults
["-sos"],
]:
with subtests.test(#msg="test CLI",
options=options):
with caplog.filtering(only_eynollah):
result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs
assert outfile.exists()
with Image.open(infile) as original_img:
original_size = original_img.size
with Image.open(outfile) as enhanced_img:
enhanced_size = enhanced_img.size
assert (original_size == enhanced_size) == ("-sos" in options)
def test_run_eynollah_enhancement_directory(tmp_path, subtests, pytestconfig, caplog):
indir = testdir.joinpath('resources')
outdir = tmp_path
args = [
'-m', EYNOLLAH_MODELS,
'-di', str(indir),
'-o', str(outdir),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'enhancement'
runner = CliRunner()
with caplog.filtering(only_eynollah):
result = runner.invoke(enhancement_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2
assert len(list(outdir.iterdir())) == 2
def test_run_eynollah_mbreorder_filename(tmp_path, subtests, pytestconfig, caplog):
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml')
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
args = [
'-m', EYNOLLAH_MODELS,
'-i', str(infile),
'-o', str(outfile.parent),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'mbreorder'
runner = CliRunner()
with caplog.filtering(only_eynollah):
result = runner.invoke(mbreorder_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
# FIXME: mbreorder has no logging!
#assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
assert outfile.exists()
#in_tree = page_from_file(str(infile)).etree
#in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
out_tree = page_from_file(str(outfile)).etree
out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
#assert len(out_order) >= 2, "result is inaccurate"
#assert in_order != out_order
assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
def test_run_eynollah_mbreorder_directory(tmp_path, subtests, pytestconfig, caplog):
indir = testdir.joinpath('resources')
outdir = tmp_path
args = [
'-m', EYNOLLAH_MODELS,
'-di', str(indir),
'-o', str(outdir),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'mbreorder'
runner = CliRunner()
with caplog.filtering(only_eynollah):
result = runner.invoke(mbreorder_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
# FIXME: mbreorder has no logging!
#assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2
assert len(list(outdir.iterdir())) == 2
def test_run_eynollah_ocr_filename(tmp_path, subtests, pytestconfig, caplog):
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.xml')
outrenderfile.parent.mkdir()
args = [
'-m', EYNOLLAH_MODELS,
'-i', str(infile),
'-dx', str(infile.parent),
'-o', str(outfile.parent),
# subtests write to same location
'--overwrite',
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.DEBUG)
def only_eynollah(logrec):
return logrec.name == 'eynollah'
runner = CliRunner()
for options in [
[], # defaults
["-doit", str(outrenderfile.parent)],
["-trocr"],
]:
with subtests.test(#msg="test CLI",
options=options):
with caplog.filtering(only_eynollah):
result = runner.invoke(ocr_cli, args + options, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
# FIXME: ocr has no logging!
#assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
assert outfile.exists()
if "-doit" in options:
assert outrenderfile.exists()
#in_tree = page_from_file(str(infile)).etree
#in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
out_tree = page_from_file(str(outfile)).etree
out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
def test_run_eynollah_ocr_directory(tmp_path, subtests, pytestconfig, caplog):
indir = testdir.joinpath('resources')
outdir = tmp_path
args = [
'-m', EYNOLLAH_MODELS,
'-di', str(indir),
'-dx', str(indir),
'-o', str(outdir),
]
if pytestconfig.getoption('verbose') > 0:
args.extend(['-l', 'DEBUG'])
caplog.set_level(logging.INFO)
def only_eynollah(logrec):
return logrec.name == 'eynollah'
runner = CliRunner()
with caplog.filtering(only_eynollah):
result = runner.invoke(ocr_cli, args, catch_exceptions=False)
assert result.exit_code == 0, result.stdout
logmsgs = [logrec.message for logrec in caplog.records]
# FIXME: ocr has no logging!
#assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
assert len(list(outdir.iterdir())) == 2