factor run_single() out of run(), simplify kwargs

pull/148/head^2
Robert Sachunsky 2 weeks ago
parent 1a0a1cb00b
commit e17d34fafa

@ -272,10 +272,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
eynollah = Eynollah(
model,
logger=getLogger('Eynollah'),
image_filename=image,
overwrite=overwrite,
dir_out=out,
dir_in=dir_in,
dir_of_cropped_images=save_images,
extract_only_images=extract_only_images,
dir_of_layout=save_layout,
@ -301,9 +298,9 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
skip_layout_and_reading_order=skip_layout_and_reading_order,
)
if dir_in:
eynollah.run()
eynollah.run(dir_in=dir_in, overwrite=overwrite)
else:
pcgts = eynollah.run()
pcgts = eynollah.run(image_filename=image)
eynollah.writer.write_pagexml(pcgts)

@ -180,12 +180,7 @@ class Eynollah:
def __init__(
self,
dir_models : str,
image_filename : Optional[str] = None,
image_pil : Optional[Image] = None,
image_filename_stem : Optional[str] = None,
overwrite : bool = False,
dir_out : Optional[str] = None,
dir_in : Optional[str] = None,
dir_of_cropped_images : Optional[str] = None,
extract_only_images : bool =False,
dir_of_layout : Optional[str] = None,
@ -209,24 +204,12 @@ class Eynollah:
num_col_upper : Optional[int] = None,
num_col_lower : Optional[int] = None,
skip_layout_and_reading_order : bool = False,
override_dpi : Optional[int] = None,
logger : Logger = None,
pcgts : Optional[OcrdPage] = None,
):
if skip_layout_and_reading_order:
textline_light = True
self.light_version = light_version
if not dir_in:
if image_pil:
self._imgs = self._cache_images(image_pil=image_pil)
else:
self._imgs = self._cache_images(image_filename=image_filename)
if override_dpi:
self.dpi = override_dpi
self.image_filename = image_filename
self.overwrite = overwrite
self.dir_out = dir_out
self.dir_in = dir_in
self.dir_of_all = dir_of_all
self.dir_save_page = dir_save_page
self.reading_order_machine_based = reading_order_machine_based
@ -257,21 +240,6 @@ class Eynollah:
self.num_col_lower = int(num_col_lower)
else:
self.num_col_lower = num_col_lower
if not dir_in:
self.plotter = None if not enable_plotting else EynollahPlotter(
dir_out=self.dir_out,
dir_of_all=dir_of_all,
dir_save_page=dir_save_page,
dir_of_deskewed=dir_of_deskewed,
dir_of_cropped_images=dir_of_cropped_images,
dir_of_layout=dir_of_layout,
image_filename_stem=Path(Path(image_filename).name).stem)
self.writer = EynollahXmlWriter(
dir_out=self.dir_out,
image_filename=self.image_filename,
curved_line=self.curved_line,
textline_light = self.textline_light,
pcgts=pcgts)
self.logger = logger if logger else getLogger('eynollah')
# for parallelization of CPU-intensive tasks:
self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200)
@ -370,7 +338,7 @@ class Eynollah:
if self.tables:
self.model_table = self.our_load_model(self.model_table_dir)
def _cache_images(self, image_filename=None, image_pil=None):
def cache_images(self, image_filename=None, image_pil=None, dpi=None):
ret = {}
t_c0 = time.time()
if image_filename:
@ -388,13 +356,14 @@ class Eynollah:
ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY)
for prefix in ('', '_grayscale'):
ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8)
return ret
self._imgs = ret
if dpi is not None:
self.dpi = dpi
def reset_file_name_dir(self, image_filename):
t_c = time.time()
self._imgs = self._cache_images(image_filename=image_filename)
self.image_filename = image_filename
self.cache_images(image_filename=image_filename)
self.plotter = None if not self.enable_plotting else EynollahPlotter(
dir_out=self.dir_out,
dir_of_all=self.dir_of_all,
@ -403,10 +372,10 @@ class Eynollah:
dir_of_cropped_images=self.dir_of_cropped_images,
dir_of_layout=self.dir_of_layout,
image_filename_stem=Path(Path(image_filename).name).stem)
self.writer = EynollahXmlWriter(
dir_out=self.dir_out,
image_filename=self.image_filename,
image_filename=image_filename,
curved_line=self.curved_line,
textline_light = self.textline_light)
@ -4224,30 +4193,49 @@ class Eynollah:
return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem,
contours_only_text_parent_rem, index_by_text_par_con_rem_sort)
def run(self):
def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False):
"""
Get image and scales, then extract the page of scanned image
"""
self.logger.debug("enter run")
t0_tot = time.time()
if not self.dir_in:
self.ls_imgs = [self.image_filename]
if dir_in:
self.ls_imgs = os.listdir(dir_in)
elif image_filename:
self.ls_imgs = [image_filename]
else:
raise ValueError("run requires either a single image filename or a directory")
for img_name in self.ls_imgs:
self.logger.info(img_name)
for img_filename in self.ls_imgs:
self.logger.info(img_filename)
t0 = time.time()
if self.dir_in:
self.reset_file_name_dir(os.path.join(self.dir_in,img_name))
#print("text region early -11 in %.1fs", time.time() - t0)
if os.path.exists(self.writer.output_filename):
if self.overwrite:
self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename)
else:
self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename)
continue
self.reset_file_name_dir(os.path.join(dir_in or "", img_filename))
#print("text region early -11 in %.1fs", time.time() - t0)
if os.path.exists(self.writer.output_filename):
if overwrite:
self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename)
else:
self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename)
continue
pcgts = self.run_single()
self.logger.info("Job done in %.1fs", time.time() - t0)
#print("Job done in %.1fs" % (time.time() - t0))
if dir_in:
self.writer.write_pagexml(pcgts)
else:
return pcgts
if dir_in:
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
print("all Job done in %.1fs", time.time() - t0_tot)
def run_single(self):
# conditional merely for indentation (= smaller diff)
if True:
t0 = time.time()
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
self.logger.info("Enhancing took %.1fs ", time.time() - t0)
if self.extract_only_images:
@ -4260,12 +4248,7 @@ class Eynollah:
cont_page, [], [], ocr_all_textlines)
if self.plotter:
self.plotter.write_images_into_directory(polygons_of_images, image_page)
if self.dir_in:
self.writer.write_pagexml(pcgts)
continue
else:
return pcgts
return pcgts
if self.skip_layout_and_reading_order:
_ ,_, _, textline_mask_tot_ea, img_bin_light = \
@ -4307,11 +4290,7 @@ class Eynollah:
all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals,
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
if self.dir_in:
self.writer.write_pagexml(pcgts)
continue
else:
return pcgts
return pcgts
#print("text region early -1 in %.1fs", time.time() - t0)
t1 = time.time()
@ -4363,12 +4342,7 @@ class Eynollah:
pcgts = self.writer.build_pagexml_no_full_layout(
[], page_coord, [], [], [], [], [], [], [], [], [], [],
cont_page, [], [], ocr_all_textlines)
self.logger.info("Job done in %.1fs", time.time() - t1)
if self.dir_in:
self.writer.write_pagexml(pcgts)
continue
else:
return pcgts
return pcgts
#print("text region early in %.1fs", time.time() - t0)
t1 = time.time()
@ -4553,12 +4527,7 @@ class Eynollah:
polygons_of_images,
polygons_of_marginals, empty_marginals, empty_marginals, [], [],
cont_page, polygons_lines_xml, contours_tables, [])
self.logger.info("Job done in %.1fs", time.time() - t0)
if self.dir_in:
self.writer.write_pagexml(pcgts)
continue
else:
return pcgts
return pcgts
#print("text region early 3 in %.1fs", time.time() - t0)
if self.light_version:
@ -4748,13 +4717,7 @@ class Eynollah:
polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals,
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals,
cont_page, polygons_lines_xml, ocr_all_textlines)
self.logger.info("Job done in %.1fs", time.time() - t0)
#print("Job done in %.1fs", time.time() - t0)
if self.dir_in:
self.writer.write_pagexml(pcgts)
continue
else:
return pcgts
return pcgts
else:
contours_only_text_parent_h = None
@ -4834,22 +4797,9 @@ class Eynollah:
all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals,
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines)
#print("Job done in %.1fs" % (time.time() - t0))
self.logger.info("Job done in %.1fs", time.time() - t0)
if not self.dir_in:
return pcgts
#print("text region early 7 in %.1fs", time.time() - t0)
return pcgts
if self.dir_in:
self.writer.write_pagexml(pcgts)
self.logger.info("Job done in %.1fs", time.time() - t0)
#print("Job done in %.1fs" % (time.time() - t0))
if self.dir_in:
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
print("all Job done in %.1fs", time.time() - t0_tot)
class Eynollah_ocr:
def __init__(
self,

@ -30,11 +30,7 @@ class EynollahProcessor(Processor):
allow_scaling=self.parameter['allow_scaling'],
headers_off=self.parameter['headers_off'],
tables=self.parameter['tables'],
override_dpi=self.parameter['dpi'],
# trick Eynollah to do init independent of an image
dir_in="."
)
self.eynollah.dir_in = None
self.eynollah.plotter = None
def shutdown(self):
@ -81,9 +77,9 @@ class EynollahProcessor(Processor):
image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
# FIXME: mask out already existing regions (incremental segmentation)
self.eynollah.image_filename = image_filename
self.eynollah._imgs = self.eynollah._cache_images(
image_pil=page_image
self.eynollah.cache_images(
image_pil=page_image,
dpi=self.parameter['dpi'],
)
self.eynollah.writer = EynollahXmlWriter(
dir_out=None,
@ -91,5 +87,5 @@ class EynollahProcessor(Processor):
curved_line=self.eynollah.curved_line,
textline_light=self.eynollah.textline_light,
pcgts=pcgts)
self.eynollah.run()
self.eynollah.run_single()
return result

Loading…
Cancel
Save