mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-06-15 07:09:55 +02:00
factor run_single() out of run(), simplify kwargs
This commit is contained in:
parent
1a0a1cb00b
commit
e17d34fafa
3 changed files with 55 additions and 112 deletions
|
@ -272,10 +272,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
|
||||||
eynollah = Eynollah(
|
eynollah = Eynollah(
|
||||||
model,
|
model,
|
||||||
logger=getLogger('Eynollah'),
|
logger=getLogger('Eynollah'),
|
||||||
image_filename=image,
|
|
||||||
overwrite=overwrite,
|
|
||||||
dir_out=out,
|
dir_out=out,
|
||||||
dir_in=dir_in,
|
|
||||||
dir_of_cropped_images=save_images,
|
dir_of_cropped_images=save_images,
|
||||||
extract_only_images=extract_only_images,
|
extract_only_images=extract_only_images,
|
||||||
dir_of_layout=save_layout,
|
dir_of_layout=save_layout,
|
||||||
|
@ -301,9 +298,9 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
|
||||||
skip_layout_and_reading_order=skip_layout_and_reading_order,
|
skip_layout_and_reading_order=skip_layout_and_reading_order,
|
||||||
)
|
)
|
||||||
if dir_in:
|
if dir_in:
|
||||||
eynollah.run()
|
eynollah.run(dir_in=dir_in, overwrite=overwrite)
|
||||||
else:
|
else:
|
||||||
pcgts = eynollah.run()
|
pcgts = eynollah.run(image_filename=image)
|
||||||
eynollah.writer.write_pagexml(pcgts)
|
eynollah.writer.write_pagexml(pcgts)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -180,12 +180,7 @@ class Eynollah:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
dir_models : str,
|
dir_models : str,
|
||||||
image_filename : Optional[str] = None,
|
|
||||||
image_pil : Optional[Image] = None,
|
|
||||||
image_filename_stem : Optional[str] = None,
|
|
||||||
overwrite : bool = False,
|
|
||||||
dir_out : Optional[str] = None,
|
dir_out : Optional[str] = None,
|
||||||
dir_in : Optional[str] = None,
|
|
||||||
dir_of_cropped_images : Optional[str] = None,
|
dir_of_cropped_images : Optional[str] = None,
|
||||||
extract_only_images : bool =False,
|
extract_only_images : bool =False,
|
||||||
dir_of_layout : Optional[str] = None,
|
dir_of_layout : Optional[str] = None,
|
||||||
|
@ -209,24 +204,12 @@ class Eynollah:
|
||||||
num_col_upper : Optional[int] = None,
|
num_col_upper : Optional[int] = None,
|
||||||
num_col_lower : Optional[int] = None,
|
num_col_lower : Optional[int] = None,
|
||||||
skip_layout_and_reading_order : bool = False,
|
skip_layout_and_reading_order : bool = False,
|
||||||
override_dpi : Optional[int] = None,
|
|
||||||
logger : Logger = None,
|
logger : Logger = None,
|
||||||
pcgts : Optional[OcrdPage] = None,
|
|
||||||
):
|
):
|
||||||
if skip_layout_and_reading_order:
|
if skip_layout_and_reading_order:
|
||||||
textline_light = True
|
textline_light = True
|
||||||
self.light_version = light_version
|
self.light_version = light_version
|
||||||
if not dir_in:
|
|
||||||
if image_pil:
|
|
||||||
self._imgs = self._cache_images(image_pil=image_pil)
|
|
||||||
else:
|
|
||||||
self._imgs = self._cache_images(image_filename=image_filename)
|
|
||||||
if override_dpi:
|
|
||||||
self.dpi = override_dpi
|
|
||||||
self.image_filename = image_filename
|
|
||||||
self.overwrite = overwrite
|
|
||||||
self.dir_out = dir_out
|
self.dir_out = dir_out
|
||||||
self.dir_in = dir_in
|
|
||||||
self.dir_of_all = dir_of_all
|
self.dir_of_all = dir_of_all
|
||||||
self.dir_save_page = dir_save_page
|
self.dir_save_page = dir_save_page
|
||||||
self.reading_order_machine_based = reading_order_machine_based
|
self.reading_order_machine_based = reading_order_machine_based
|
||||||
|
@ -257,21 +240,6 @@ class Eynollah:
|
||||||
self.num_col_lower = int(num_col_lower)
|
self.num_col_lower = int(num_col_lower)
|
||||||
else:
|
else:
|
||||||
self.num_col_lower = num_col_lower
|
self.num_col_lower = num_col_lower
|
||||||
if not dir_in:
|
|
||||||
self.plotter = None if not enable_plotting else EynollahPlotter(
|
|
||||||
dir_out=self.dir_out,
|
|
||||||
dir_of_all=dir_of_all,
|
|
||||||
dir_save_page=dir_save_page,
|
|
||||||
dir_of_deskewed=dir_of_deskewed,
|
|
||||||
dir_of_cropped_images=dir_of_cropped_images,
|
|
||||||
dir_of_layout=dir_of_layout,
|
|
||||||
image_filename_stem=Path(Path(image_filename).name).stem)
|
|
||||||
self.writer = EynollahXmlWriter(
|
|
||||||
dir_out=self.dir_out,
|
|
||||||
image_filename=self.image_filename,
|
|
||||||
curved_line=self.curved_line,
|
|
||||||
textline_light = self.textline_light,
|
|
||||||
pcgts=pcgts)
|
|
||||||
self.logger = logger if logger else getLogger('eynollah')
|
self.logger = logger if logger else getLogger('eynollah')
|
||||||
# for parallelization of CPU-intensive tasks:
|
# for parallelization of CPU-intensive tasks:
|
||||||
self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200)
|
self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200)
|
||||||
|
@ -370,7 +338,7 @@ class Eynollah:
|
||||||
if self.tables:
|
if self.tables:
|
||||||
self.model_table = self.our_load_model(self.model_table_dir)
|
self.model_table = self.our_load_model(self.model_table_dir)
|
||||||
|
|
||||||
def _cache_images(self, image_filename=None, image_pil=None):
|
def cache_images(self, image_filename=None, image_pil=None, dpi=None):
|
||||||
ret = {}
|
ret = {}
|
||||||
t_c0 = time.time()
|
t_c0 = time.time()
|
||||||
if image_filename:
|
if image_filename:
|
||||||
|
@ -388,12 +356,13 @@ class Eynollah:
|
||||||
ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY)
|
ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY)
|
||||||
for prefix in ('', '_grayscale'):
|
for prefix in ('', '_grayscale'):
|
||||||
ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8)
|
ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8)
|
||||||
return ret
|
self._imgs = ret
|
||||||
|
if dpi is not None:
|
||||||
|
self.dpi = dpi
|
||||||
|
|
||||||
def reset_file_name_dir(self, image_filename):
|
def reset_file_name_dir(self, image_filename):
|
||||||
t_c = time.time()
|
t_c = time.time()
|
||||||
self._imgs = self._cache_images(image_filename=image_filename)
|
self.cache_images(image_filename=image_filename)
|
||||||
self.image_filename = image_filename
|
|
||||||
|
|
||||||
self.plotter = None if not self.enable_plotting else EynollahPlotter(
|
self.plotter = None if not self.enable_plotting else EynollahPlotter(
|
||||||
dir_out=self.dir_out,
|
dir_out=self.dir_out,
|
||||||
|
@ -406,7 +375,7 @@ class Eynollah:
|
||||||
|
|
||||||
self.writer = EynollahXmlWriter(
|
self.writer = EynollahXmlWriter(
|
||||||
dir_out=self.dir_out,
|
dir_out=self.dir_out,
|
||||||
image_filename=self.image_filename,
|
image_filename=image_filename,
|
||||||
curved_line=self.curved_line,
|
curved_line=self.curved_line,
|
||||||
textline_light = self.textline_light)
|
textline_light = self.textline_light)
|
||||||
|
|
||||||
|
@ -4224,30 +4193,49 @@ class Eynollah:
|
||||||
return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem,
|
return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem,
|
||||||
contours_only_text_parent_rem, index_by_text_par_con_rem_sort)
|
contours_only_text_parent_rem, index_by_text_par_con_rem_sort)
|
||||||
|
|
||||||
def run(self):
|
def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False):
|
||||||
"""
|
"""
|
||||||
Get image and scales, then extract the page of scanned image
|
Get image and scales, then extract the page of scanned image
|
||||||
"""
|
"""
|
||||||
self.logger.debug("enter run")
|
self.logger.debug("enter run")
|
||||||
|
|
||||||
t0_tot = time.time()
|
t0_tot = time.time()
|
||||||
|
|
||||||
if not self.dir_in:
|
if dir_in:
|
||||||
self.ls_imgs = [self.image_filename]
|
self.ls_imgs = os.listdir(dir_in)
|
||||||
|
elif image_filename:
|
||||||
|
self.ls_imgs = [image_filename]
|
||||||
|
else:
|
||||||
|
raise ValueError("run requires either a single image filename or a directory")
|
||||||
|
|
||||||
for img_name in self.ls_imgs:
|
for img_filename in self.ls_imgs:
|
||||||
self.logger.info(img_name)
|
self.logger.info(img_filename)
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
if self.dir_in:
|
|
||||||
self.reset_file_name_dir(os.path.join(self.dir_in,img_name))
|
self.reset_file_name_dir(os.path.join(dir_in or "", img_filename))
|
||||||
#print("text region early -11 in %.1fs", time.time() - t0)
|
#print("text region early -11 in %.1fs", time.time() - t0)
|
||||||
if os.path.exists(self.writer.output_filename):
|
if os.path.exists(self.writer.output_filename):
|
||||||
if self.overwrite:
|
if overwrite:
|
||||||
self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename)
|
self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename)
|
||||||
else:
|
else:
|
||||||
self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename)
|
self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
pcgts = self.run_single()
|
||||||
|
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||||
|
#print("Job done in %.1fs" % (time.time() - t0))
|
||||||
|
if dir_in:
|
||||||
|
self.writer.write_pagexml(pcgts)
|
||||||
|
else:
|
||||||
|
return pcgts
|
||||||
|
|
||||||
|
if dir_in:
|
||||||
|
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
|
||||||
|
print("all Job done in %.1fs", time.time() - t0_tot)
|
||||||
|
|
||||||
|
def run_single(self):
|
||||||
|
# conditional merely for indentation (= smaller diff)
|
||||||
|
if True:
|
||||||
|
t0 = time.time()
|
||||||
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
|
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
|
||||||
self.logger.info("Enhancing took %.1fs ", time.time() - t0)
|
self.logger.info("Enhancing took %.1fs ", time.time() - t0)
|
||||||
if self.extract_only_images:
|
if self.extract_only_images:
|
||||||
|
@ -4260,11 +4248,6 @@ class Eynollah:
|
||||||
cont_page, [], [], ocr_all_textlines)
|
cont_page, [], [], ocr_all_textlines)
|
||||||
if self.plotter:
|
if self.plotter:
|
||||||
self.plotter.write_images_into_directory(polygons_of_images, image_page)
|
self.plotter.write_images_into_directory(polygons_of_images, image_page)
|
||||||
|
|
||||||
if self.dir_in:
|
|
||||||
self.writer.write_pagexml(pcgts)
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
return pcgts
|
return pcgts
|
||||||
|
|
||||||
if self.skip_layout_and_reading_order:
|
if self.skip_layout_and_reading_order:
|
||||||
|
@ -4307,10 +4290,6 @@ class Eynollah:
|
||||||
all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals,
|
all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals,
|
||||||
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
|
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
|
||||||
cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
|
cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
|
||||||
if self.dir_in:
|
|
||||||
self.writer.write_pagexml(pcgts)
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
return pcgts
|
return pcgts
|
||||||
|
|
||||||
#print("text region early -1 in %.1fs", time.time() - t0)
|
#print("text region early -1 in %.1fs", time.time() - t0)
|
||||||
|
@ -4363,11 +4342,6 @@ class Eynollah:
|
||||||
pcgts = self.writer.build_pagexml_no_full_layout(
|
pcgts = self.writer.build_pagexml_no_full_layout(
|
||||||
[], page_coord, [], [], [], [], [], [], [], [], [], [],
|
[], page_coord, [], [], [], [], [], [], [], [], [], [],
|
||||||
cont_page, [], [], ocr_all_textlines)
|
cont_page, [], [], ocr_all_textlines)
|
||||||
self.logger.info("Job done in %.1fs", time.time() - t1)
|
|
||||||
if self.dir_in:
|
|
||||||
self.writer.write_pagexml(pcgts)
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
return pcgts
|
return pcgts
|
||||||
|
|
||||||
#print("text region early in %.1fs", time.time() - t0)
|
#print("text region early in %.1fs", time.time() - t0)
|
||||||
|
@ -4553,11 +4527,6 @@ class Eynollah:
|
||||||
polygons_of_images,
|
polygons_of_images,
|
||||||
polygons_of_marginals, empty_marginals, empty_marginals, [], [],
|
polygons_of_marginals, empty_marginals, empty_marginals, [], [],
|
||||||
cont_page, polygons_lines_xml, contours_tables, [])
|
cont_page, polygons_lines_xml, contours_tables, [])
|
||||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
|
||||||
if self.dir_in:
|
|
||||||
self.writer.write_pagexml(pcgts)
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
return pcgts
|
return pcgts
|
||||||
|
|
||||||
#print("text region early 3 in %.1fs", time.time() - t0)
|
#print("text region early 3 in %.1fs", time.time() - t0)
|
||||||
|
@ -4748,12 +4717,6 @@ class Eynollah:
|
||||||
polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals,
|
polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals,
|
||||||
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals,
|
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals,
|
||||||
cont_page, polygons_lines_xml, ocr_all_textlines)
|
cont_page, polygons_lines_xml, ocr_all_textlines)
|
||||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
|
||||||
#print("Job done in %.1fs", time.time() - t0)
|
|
||||||
if self.dir_in:
|
|
||||||
self.writer.write_pagexml(pcgts)
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
return pcgts
|
return pcgts
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
@ -4834,20 +4797,7 @@ class Eynollah:
|
||||||
all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals,
|
all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals,
|
||||||
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
|
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
|
||||||
cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
|
cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
|
||||||
#print("Job done in %.1fs" % (time.time() - t0))
|
|
||||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
|
||||||
if not self.dir_in:
|
|
||||||
return pcgts
|
return pcgts
|
||||||
#print("text region early 7 in %.1fs", time.time() - t0)
|
|
||||||
|
|
||||||
if self.dir_in:
|
|
||||||
self.writer.write_pagexml(pcgts)
|
|
||||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
|
||||||
#print("Job done in %.1fs" % (time.time() - t0))
|
|
||||||
|
|
||||||
if self.dir_in:
|
|
||||||
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
|
|
||||||
print("all Job done in %.1fs", time.time() - t0_tot)
|
|
||||||
|
|
||||||
|
|
||||||
class Eynollah_ocr:
|
class Eynollah_ocr:
|
||||||
|
|
|
@ -30,11 +30,7 @@ class EynollahProcessor(Processor):
|
||||||
allow_scaling=self.parameter['allow_scaling'],
|
allow_scaling=self.parameter['allow_scaling'],
|
||||||
headers_off=self.parameter['headers_off'],
|
headers_off=self.parameter['headers_off'],
|
||||||
tables=self.parameter['tables'],
|
tables=self.parameter['tables'],
|
||||||
override_dpi=self.parameter['dpi'],
|
|
||||||
# trick Eynollah to do init independent of an image
|
|
||||||
dir_in="."
|
|
||||||
)
|
)
|
||||||
self.eynollah.dir_in = None
|
|
||||||
self.eynollah.plotter = None
|
self.eynollah.plotter = None
|
||||||
|
|
||||||
def shutdown(self):
|
def shutdown(self):
|
||||||
|
@ -81,9 +77,9 @@ class EynollahProcessor(Processor):
|
||||||
image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
|
image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
|
||||||
result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
|
result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
|
||||||
# FIXME: mask out already existing regions (incremental segmentation)
|
# FIXME: mask out already existing regions (incremental segmentation)
|
||||||
self.eynollah.image_filename = image_filename
|
self.eynollah.cache_images(
|
||||||
self.eynollah._imgs = self.eynollah._cache_images(
|
image_pil=page_image,
|
||||||
image_pil=page_image
|
dpi=self.parameter['dpi'],
|
||||||
)
|
)
|
||||||
self.eynollah.writer = EynollahXmlWriter(
|
self.eynollah.writer = EynollahXmlWriter(
|
||||||
dir_out=None,
|
dir_out=None,
|
||||||
|
@ -91,5 +87,5 @@ class EynollahProcessor(Processor):
|
||||||
curved_line=self.eynollah.curved_line,
|
curved_line=self.eynollah.curved_line,
|
||||||
textline_light=self.eynollah.textline_light,
|
textline_light=self.eynollah.textline_light,
|
||||||
pcgts=pcgts)
|
pcgts=pcgts)
|
||||||
self.eynollah.run()
|
self.eynollah.run_single()
|
||||||
return result
|
return result
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue