mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-12-11 05:34:15 +01:00
Restored correct functionality of the extract_only_images mode and cleaned up the argument handling
This commit is contained in:
parent
51abe9617a
commit
d687d862d6
2 changed files with 32 additions and 90 deletions
|
|
@ -33,30 +33,6 @@ import click
|
|||
help="if a directory is given, images in documents will be cropped and saved there",
|
||||
type=click.Path(exists=True, file_okay=False),
|
||||
)
|
||||
@click.option(
|
||||
"--save_layout",
|
||||
"-sl",
|
||||
help="if a directory is given, plot of layout will be saved there",
|
||||
type=click.Path(exists=True, file_okay=False),
|
||||
)
|
||||
@click.option(
|
||||
"--save_deskewed",
|
||||
"-sd",
|
||||
help="if a directory is given, deskewed image will be saved there",
|
||||
type=click.Path(exists=True, file_okay=False),
|
||||
)
|
||||
@click.option(
|
||||
"--save_all",
|
||||
"-sa",
|
||||
help="if a directory is given, all plots needed for documentation will be saved there",
|
||||
type=click.Path(exists=True, file_okay=False),
|
||||
)
|
||||
@click.option(
|
||||
"--save_page",
|
||||
"-sp",
|
||||
help="if a directory is given, page crop of image will be saved there",
|
||||
type=click.Path(exists=True, file_okay=False),
|
||||
)
|
||||
@click.option(
|
||||
"--enable-plotting/--disable-plotting",
|
||||
"-ep/-noep",
|
||||
|
|
@ -75,12 +51,6 @@ import click
|
|||
is_flag=True,
|
||||
help="if this parameter set to true, this tool would ignore page extraction",
|
||||
)
|
||||
@click.option(
|
||||
"--reading_order_machine_based/--heuristic_reading_order",
|
||||
"-romb/-hro",
|
||||
is_flag=True,
|
||||
help="if this parameter set to true, this tool would apply machine based reading order detection",
|
||||
)
|
||||
@click.option(
|
||||
"--num_col_upper",
|
||||
"-ncu",
|
||||
|
|
@ -91,22 +61,6 @@ import click
|
|||
"-ncl",
|
||||
help="upper limit of columns in document image",
|
||||
)
|
||||
@click.option(
|
||||
"--threshold_art_class_layout",
|
||||
"-tharl",
|
||||
help="threshold of artifical class in the case of layout detection. The default value is 0.1",
|
||||
)
|
||||
@click.option(
|
||||
"--threshold_art_class_textline",
|
||||
"-thart",
|
||||
help="threshold of artifical class in the case of textline detection. The default value is 0.1",
|
||||
)
|
||||
@click.option(
|
||||
"--skip_layout_and_reading_order",
|
||||
"-slro/-noslro",
|
||||
is_flag=True,
|
||||
help="if this parameter set to true, this tool will ignore layout detection and reading order. It means that textline detection will be done within printspace and contours of textline will be written in xml output file.",
|
||||
)
|
||||
@click.pass_context
|
||||
def extract_images_cli(
|
||||
ctx,
|
||||
|
|
@ -115,30 +69,17 @@ def extract_images_cli(
|
|||
overwrite,
|
||||
dir_in,
|
||||
save_images,
|
||||
save_layout,
|
||||
save_deskewed,
|
||||
save_all,
|
||||
save_page,
|
||||
enable_plotting,
|
||||
input_binary,
|
||||
reading_order_machine_based,
|
||||
num_col_upper,
|
||||
num_col_lower,
|
||||
threshold_art_class_textline,
|
||||
threshold_art_class_layout,
|
||||
skip_layout_and_reading_order,
|
||||
ignore_page_extraction,
|
||||
):
|
||||
"""
|
||||
Detect Layout (with optional image enhancement and reading order detection)
|
||||
"""
|
||||
assert enable_plotting or not save_layout, "Plotting with -sl also requires -ep"
|
||||
assert enable_plotting or not save_deskewed, "Plotting with -sd also requires -ep"
|
||||
assert enable_plotting or not save_all, "Plotting with -sa also requires -ep"
|
||||
assert enable_plotting or not save_page, "Plotting with -sp also requires -ep"
|
||||
assert enable_plotting or not save_images, "Plotting with -si also requires -ep"
|
||||
assert not enable_plotting or save_layout or save_deskewed or save_all or save_page or save_images, \
|
||||
"Plotting with -ep also requires -sl, -sd, -sa, -sp, -si or -ae"
|
||||
assert not enable_plotting or save_images, "Plotting with -ep also requires -si"
|
||||
assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
|
||||
|
||||
from ..extract_images import EynollahImageExtractor
|
||||
|
|
@ -147,21 +88,13 @@ def extract_images_cli(
|
|||
enable_plotting=enable_plotting,
|
||||
input_binary=input_binary,
|
||||
ignore_page_extraction=ignore_page_extraction,
|
||||
reading_order_machine_based=reading_order_machine_based,
|
||||
num_col_upper=num_col_upper,
|
||||
num_col_lower=num_col_lower,
|
||||
skip_layout_and_reading_order=skip_layout_and_reading_order,
|
||||
threshold_art_class_textline=threshold_art_class_textline,
|
||||
threshold_art_class_layout=threshold_art_class_layout,
|
||||
)
|
||||
extractor.run(overwrite=overwrite,
|
||||
image_filename=image,
|
||||
dir_in=dir_in,
|
||||
dir_out=out,
|
||||
dir_of_cropped_images=save_images,
|
||||
dir_of_layout=save_layout,
|
||||
dir_of_deskewed=save_deskewed,
|
||||
dir_of_all=save_all,
|
||||
dir_save_page=save_page,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -30,23 +30,26 @@ class EynollahImageExtractor(Eynollah):
|
|||
enable_plotting : bool = False,
|
||||
input_binary : bool = False,
|
||||
ignore_page_extraction : bool = False,
|
||||
reading_order_machine_based : bool = False,
|
||||
num_col_upper : Optional[int] = None,
|
||||
num_col_lower : Optional[int] = None,
|
||||
threshold_art_class_layout: Optional[float] = None,
|
||||
threshold_art_class_textline: Optional[float] = None,
|
||||
skip_layout_and_reading_order : bool = False,
|
||||
full_layout : bool = False,
|
||||
tables : bool = False,
|
||||
curved_line : bool = False,
|
||||
allow_enhancement : bool = False,
|
||||
|
||||
):
|
||||
self.logger = logging.getLogger('eynollah.extract_images')
|
||||
self.model_zoo = model_zoo
|
||||
self.plotter = None
|
||||
|
||||
self.reading_order_machine_based = reading_order_machine_based
|
||||
self.tables = tables
|
||||
self.curved_line = curved_line
|
||||
self.allow_enhancement = allow_enhancement
|
||||
|
||||
self.enable_plotting = enable_plotting
|
||||
# --input-binary is sensible if the image is very dark or if layout is not working.
|
||||
self.input_binary = input_binary
|
||||
self.ignore_page_extraction = ignore_page_extraction
|
||||
self.skip_layout_and_reading_order = skip_layout_and_reading_order
|
||||
self.full_layout = full_layout
|
||||
if num_col_upper:
|
||||
self.num_col_upper = int(num_col_upper)
|
||||
else:
|
||||
|
|
@ -58,16 +61,6 @@ class EynollahImageExtractor(Eynollah):
|
|||
|
||||
# for parallelization of CPU-intensive tasks:
|
||||
self.executor = ProcessPoolExecutor(max_workers=cpu_count())
|
||||
|
||||
if threshold_art_class_layout:
|
||||
self.threshold_art_class_layout = float(threshold_art_class_layout)
|
||||
else:
|
||||
self.threshold_art_class_layout = 0.1
|
||||
|
||||
if threshold_art_class_textline:
|
||||
self.threshold_art_class_textline = float(threshold_art_class_textline)
|
||||
else:
|
||||
self.threshold_art_class_textline = 0.1
|
||||
|
||||
t_start = time.time()
|
||||
|
||||
|
|
@ -115,7 +108,7 @@ class EynollahImageExtractor(Eynollah):
|
|||
img_h_new = int(img.shape[0] / float(img.shape[1]) * img_w_new)
|
||||
img_resized = resize_image(img,img_h_new, img_w_new )
|
||||
|
||||
prediction_regions_org, _ = self.do_prediction_new_concept(True, img_resized, self.model_zoo.get("region"))
|
||||
prediction_regions_org, _ = self.do_prediction_new_concept(True, img_resized, self.model_zoo.get("extract_images"))
|
||||
|
||||
prediction_regions_org = resize_image(prediction_regions_org,img_height_h, img_width_h )
|
||||
image_page, page_coord, cont_page = self.extract_page()
|
||||
|
|
@ -183,7 +176,6 @@ class EynollahImageExtractor(Eynollah):
|
|||
"""
|
||||
self.logger.debug("enter run")
|
||||
t0_tot = time.time()
|
||||
|
||||
# Log enabled features directly
|
||||
enabled_modes = []
|
||||
if self.full_layout:
|
||||
|
|
@ -261,10 +253,27 @@ class EynollahImageExtractor(Eynollah):
|
|||
_, _, _, polygons_of_images, \
|
||||
image_page, page_coord, cont_page = \
|
||||
self.get_regions_light_v_extract_only_images(img_res, num_col_classifier)
|
||||
|
||||
pcgts = self.writer.build_pagexml_no_full_layout(
|
||||
[], page_coord, [], [], [], [],
|
||||
polygons_of_images, [], [], [], [], [], [], [], [], [],
|
||||
cont_page, [], [])
|
||||
found_polygons_text_region=[],
|
||||
page_coord=page_coord,
|
||||
order_of_texts=[],
|
||||
all_found_textline_polygons=[],
|
||||
all_box_coord=[],
|
||||
found_polygons_text_region_img=polygons_of_images,
|
||||
found_polygons_marginals_left=[],
|
||||
found_polygons_marginals_right=[],
|
||||
all_found_textline_polygons_marginals_left=[],
|
||||
all_found_textline_polygons_marginals_right=[],
|
||||
all_box_coord_marginals_left=[],
|
||||
all_box_coord_marginals_right=[],
|
||||
slopes=[],
|
||||
slopes_marginals_left=[],
|
||||
slopes_marginals_right=[],
|
||||
cont_page=cont_page,
|
||||
polygons_seplines=[],
|
||||
found_polygons_tables=[],
|
||||
)
|
||||
if self.plotter:
|
||||
self.plotter.write_images_into_directory(polygons_of_images, image_page)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue