mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-06-14 14:49:54 +02:00
factor run_single() out of run(), simplify kwargs
This commit is contained in:
parent
1a0a1cb00b
commit
e17d34fafa
3 changed files with 55 additions and 112 deletions
|
@ -272,10 +272,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
|
|||
eynollah = Eynollah(
|
||||
model,
|
||||
logger=getLogger('Eynollah'),
|
||||
image_filename=image,
|
||||
overwrite=overwrite,
|
||||
dir_out=out,
|
||||
dir_in=dir_in,
|
||||
dir_of_cropped_images=save_images,
|
||||
extract_only_images=extract_only_images,
|
||||
dir_of_layout=save_layout,
|
||||
|
@ -301,9 +298,9 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
|
|||
skip_layout_and_reading_order=skip_layout_and_reading_order,
|
||||
)
|
||||
if dir_in:
|
||||
eynollah.run()
|
||||
eynollah.run(dir_in=dir_in, overwrite=overwrite)
|
||||
else:
|
||||
pcgts = eynollah.run()
|
||||
pcgts = eynollah.run(image_filename=image)
|
||||
eynollah.writer.write_pagexml(pcgts)
|
||||
|
||||
|
||||
|
|
|
@ -180,12 +180,7 @@ class Eynollah:
|
|||
def __init__(
|
||||
self,
|
||||
dir_models : str,
|
||||
image_filename : Optional[str] = None,
|
||||
image_pil : Optional[Image] = None,
|
||||
image_filename_stem : Optional[str] = None,
|
||||
overwrite : bool = False,
|
||||
dir_out : Optional[str] = None,
|
||||
dir_in : Optional[str] = None,
|
||||
dir_of_cropped_images : Optional[str] = None,
|
||||
extract_only_images : bool =False,
|
||||
dir_of_layout : Optional[str] = None,
|
||||
|
@ -209,24 +204,12 @@ class Eynollah:
|
|||
num_col_upper : Optional[int] = None,
|
||||
num_col_lower : Optional[int] = None,
|
||||
skip_layout_and_reading_order : bool = False,
|
||||
override_dpi : Optional[int] = None,
|
||||
logger : Logger = None,
|
||||
pcgts : Optional[OcrdPage] = None,
|
||||
):
|
||||
if skip_layout_and_reading_order:
|
||||
textline_light = True
|
||||
self.light_version = light_version
|
||||
if not dir_in:
|
||||
if image_pil:
|
||||
self._imgs = self._cache_images(image_pil=image_pil)
|
||||
else:
|
||||
self._imgs = self._cache_images(image_filename=image_filename)
|
||||
if override_dpi:
|
||||
self.dpi = override_dpi
|
||||
self.image_filename = image_filename
|
||||
self.overwrite = overwrite
|
||||
self.dir_out = dir_out
|
||||
self.dir_in = dir_in
|
||||
self.dir_of_all = dir_of_all
|
||||
self.dir_save_page = dir_save_page
|
||||
self.reading_order_machine_based = reading_order_machine_based
|
||||
|
@ -257,21 +240,6 @@ class Eynollah:
|
|||
self.num_col_lower = int(num_col_lower)
|
||||
else:
|
||||
self.num_col_lower = num_col_lower
|
||||
if not dir_in:
|
||||
self.plotter = None if not enable_plotting else EynollahPlotter(
|
||||
dir_out=self.dir_out,
|
||||
dir_of_all=dir_of_all,
|
||||
dir_save_page=dir_save_page,
|
||||
dir_of_deskewed=dir_of_deskewed,
|
||||
dir_of_cropped_images=dir_of_cropped_images,
|
||||
dir_of_layout=dir_of_layout,
|
||||
image_filename_stem=Path(Path(image_filename).name).stem)
|
||||
self.writer = EynollahXmlWriter(
|
||||
dir_out=self.dir_out,
|
||||
image_filename=self.image_filename,
|
||||
curved_line=self.curved_line,
|
||||
textline_light = self.textline_light,
|
||||
pcgts=pcgts)
|
||||
self.logger = logger if logger else getLogger('eynollah')
|
||||
# for parallelization of CPU-intensive tasks:
|
||||
self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200)
|
||||
|
@ -370,7 +338,7 @@ class Eynollah:
|
|||
if self.tables:
|
||||
self.model_table = self.our_load_model(self.model_table_dir)
|
||||
|
||||
def _cache_images(self, image_filename=None, image_pil=None):
|
||||
def cache_images(self, image_filename=None, image_pil=None, dpi=None):
|
||||
ret = {}
|
||||
t_c0 = time.time()
|
||||
if image_filename:
|
||||
|
@ -388,13 +356,14 @@ class Eynollah:
|
|||
ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY)
|
||||
for prefix in ('', '_grayscale'):
|
||||
ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8)
|
||||
return ret
|
||||
self._imgs = ret
|
||||
if dpi is not None:
|
||||
self.dpi = dpi
|
||||
|
||||
def reset_file_name_dir(self, image_filename):
|
||||
t_c = time.time()
|
||||
self._imgs = self._cache_images(image_filename=image_filename)
|
||||
self.image_filename = image_filename
|
||||
|
||||
self.cache_images(image_filename=image_filename)
|
||||
|
||||
self.plotter = None if not self.enable_plotting else EynollahPlotter(
|
||||
dir_out=self.dir_out,
|
||||
dir_of_all=self.dir_of_all,
|
||||
|
@ -403,10 +372,10 @@ class Eynollah:
|
|||
dir_of_cropped_images=self.dir_of_cropped_images,
|
||||
dir_of_layout=self.dir_of_layout,
|
||||
image_filename_stem=Path(Path(image_filename).name).stem)
|
||||
|
||||
|
||||
self.writer = EynollahXmlWriter(
|
||||
dir_out=self.dir_out,
|
||||
image_filename=self.image_filename,
|
||||
image_filename=image_filename,
|
||||
curved_line=self.curved_line,
|
||||
textline_light = self.textline_light)
|
||||
|
||||
|
@ -4224,30 +4193,49 @@ class Eynollah:
|
|||
return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem,
|
||||
contours_only_text_parent_rem, index_by_text_par_con_rem_sort)
|
||||
|
||||
def run(self):
|
||||
def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False):
|
||||
"""
|
||||
Get image and scales, then extract the page of scanned image
|
||||
"""
|
||||
self.logger.debug("enter run")
|
||||
|
||||
t0_tot = time.time()
|
||||
|
||||
if not self.dir_in:
|
||||
self.ls_imgs = [self.image_filename]
|
||||
if dir_in:
|
||||
self.ls_imgs = os.listdir(dir_in)
|
||||
elif image_filename:
|
||||
self.ls_imgs = [image_filename]
|
||||
else:
|
||||
raise ValueError("run requires either a single image filename or a directory")
|
||||
|
||||
for img_name in self.ls_imgs:
|
||||
self.logger.info(img_name)
|
||||
for img_filename in self.ls_imgs:
|
||||
self.logger.info(img_filename)
|
||||
t0 = time.time()
|
||||
if self.dir_in:
|
||||
self.reset_file_name_dir(os.path.join(self.dir_in,img_name))
|
||||
#print("text region early -11 in %.1fs", time.time() - t0)
|
||||
if os.path.exists(self.writer.output_filename):
|
||||
if self.overwrite:
|
||||
self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename)
|
||||
else:
|
||||
self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename)
|
||||
continue
|
||||
|
||||
self.reset_file_name_dir(os.path.join(dir_in or "", img_filename))
|
||||
#print("text region early -11 in %.1fs", time.time() - t0)
|
||||
if os.path.exists(self.writer.output_filename):
|
||||
if overwrite:
|
||||
self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename)
|
||||
else:
|
||||
self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename)
|
||||
continue
|
||||
|
||||
pcgts = self.run_single()
|
||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||
#print("Job done in %.1fs" % (time.time() - t0))
|
||||
if dir_in:
|
||||
self.writer.write_pagexml(pcgts)
|
||||
else:
|
||||
return pcgts
|
||||
|
||||
if dir_in:
|
||||
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
|
||||
print("all Job done in %.1fs", time.time() - t0_tot)
|
||||
|
||||
def run_single(self):
|
||||
# conditional merely for indentation (= smaller diff)
|
||||
if True:
|
||||
t0 = time.time()
|
||||
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
|
||||
self.logger.info("Enhancing took %.1fs ", time.time() - t0)
|
||||
if self.extract_only_images:
|
||||
|
@ -4260,12 +4248,7 @@ class Eynollah:
|
|||
cont_page, [], [], ocr_all_textlines)
|
||||
if self.plotter:
|
||||
self.plotter.write_images_into_directory(polygons_of_images, image_page)
|
||||
|
||||
if self.dir_in:
|
||||
self.writer.write_pagexml(pcgts)
|
||||
continue
|
||||
else:
|
||||
return pcgts
|
||||
return pcgts
|
||||
|
||||
if self.skip_layout_and_reading_order:
|
||||
_ ,_, _, textline_mask_tot_ea, img_bin_light = \
|
||||
|
@ -4307,11 +4290,7 @@ class Eynollah:
|
|||
all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals,
|
||||
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
|
||||
cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
|
||||
if self.dir_in:
|
||||
self.writer.write_pagexml(pcgts)
|
||||
continue
|
||||
else:
|
||||
return pcgts
|
||||
return pcgts
|
||||
|
||||
#print("text region early -1 in %.1fs", time.time() - t0)
|
||||
t1 = time.time()
|
||||
|
@ -4363,12 +4342,7 @@ class Eynollah:
|
|||
pcgts = self.writer.build_pagexml_no_full_layout(
|
||||
[], page_coord, [], [], [], [], [], [], [], [], [], [],
|
||||
cont_page, [], [], ocr_all_textlines)
|
||||
self.logger.info("Job done in %.1fs", time.time() - t1)
|
||||
if self.dir_in:
|
||||
self.writer.write_pagexml(pcgts)
|
||||
continue
|
||||
else:
|
||||
return pcgts
|
||||
return pcgts
|
||||
|
||||
#print("text region early in %.1fs", time.time() - t0)
|
||||
t1 = time.time()
|
||||
|
@ -4553,12 +4527,7 @@ class Eynollah:
|
|||
polygons_of_images,
|
||||
polygons_of_marginals, empty_marginals, empty_marginals, [], [],
|
||||
cont_page, polygons_lines_xml, contours_tables, [])
|
||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||
if self.dir_in:
|
||||
self.writer.write_pagexml(pcgts)
|
||||
continue
|
||||
else:
|
||||
return pcgts
|
||||
return pcgts
|
||||
|
||||
#print("text region early 3 in %.1fs", time.time() - t0)
|
||||
if self.light_version:
|
||||
|
@ -4748,13 +4717,7 @@ class Eynollah:
|
|||
polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals,
|
||||
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals,
|
||||
cont_page, polygons_lines_xml, ocr_all_textlines)
|
||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||
#print("Job done in %.1fs", time.time() - t0)
|
||||
if self.dir_in:
|
||||
self.writer.write_pagexml(pcgts)
|
||||
continue
|
||||
else:
|
||||
return pcgts
|
||||
return pcgts
|
||||
|
||||
else:
|
||||
contours_only_text_parent_h = None
|
||||
|
@ -4834,22 +4797,9 @@ class Eynollah:
|
|||
all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals,
|
||||
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
|
||||
cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
|
||||
#print("Job done in %.1fs" % (time.time() - t0))
|
||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||
if not self.dir_in:
|
||||
return pcgts
|
||||
#print("text region early 7 in %.1fs", time.time() - t0)
|
||||
return pcgts
|
||||
|
||||
|
||||
if self.dir_in:
|
||||
self.writer.write_pagexml(pcgts)
|
||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||
#print("Job done in %.1fs" % (time.time() - t0))
|
||||
|
||||
if self.dir_in:
|
||||
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
|
||||
print("all Job done in %.1fs", time.time() - t0_tot)
|
||||
|
||||
|
||||
class Eynollah_ocr:
|
||||
def __init__(
|
||||
self,
|
||||
|
|
|
@ -30,11 +30,7 @@ class EynollahProcessor(Processor):
|
|||
allow_scaling=self.parameter['allow_scaling'],
|
||||
headers_off=self.parameter['headers_off'],
|
||||
tables=self.parameter['tables'],
|
||||
override_dpi=self.parameter['dpi'],
|
||||
# trick Eynollah to do init independent of an image
|
||||
dir_in="."
|
||||
)
|
||||
self.eynollah.dir_in = None
|
||||
self.eynollah.plotter = None
|
||||
|
||||
def shutdown(self):
|
||||
|
@ -81,9 +77,9 @@ class EynollahProcessor(Processor):
|
|||
image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
|
||||
result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
|
||||
# FIXME: mask out already existing regions (incremental segmentation)
|
||||
self.eynollah.image_filename = image_filename
|
||||
self.eynollah._imgs = self.eynollah._cache_images(
|
||||
image_pil=page_image
|
||||
self.eynollah.cache_images(
|
||||
image_pil=page_image,
|
||||
dpi=self.parameter['dpi'],
|
||||
)
|
||||
self.eynollah.writer = EynollahXmlWriter(
|
||||
dir_out=None,
|
||||
|
@ -91,5 +87,5 @@ class EynollahProcessor(Processor):
|
|||
curved_line=self.eynollah.curved_line,
|
||||
textline_light=self.eynollah.textline_light,
|
||||
pcgts=pcgts)
|
||||
self.eynollah.run()
|
||||
self.eynollah.run_single()
|
||||
return result
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue