From e17d34fafaccf3047024bf6d38aafe18967ef0df Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 1 Apr 2025 22:12:24 +0200 Subject: [PATCH] factor run_single() out of run(), simplify kwargs --- src/eynollah/cli.py | 7 +- src/eynollah/eynollah.py | 148 +++++++++++++------------------------- src/eynollah/processor.py | 12 ++-- 3 files changed, 55 insertions(+), 112 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 7dab4c7..fab0667 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -272,10 +272,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ eynollah = Eynollah( model, logger=getLogger('Eynollah'), - image_filename=image, - overwrite=overwrite, dir_out=out, - dir_in=dir_in, dir_of_cropped_images=save_images, extract_only_images=extract_only_images, dir_of_layout=save_layout, @@ -301,9 +298,9 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ skip_layout_and_reading_order=skip_layout_and_reading_order, ) if dir_in: - eynollah.run() + eynollah.run(dir_in=dir_in, overwrite=overwrite) else: - pcgts = eynollah.run() + pcgts = eynollah.run(image_filename=image) eynollah.writer.write_pagexml(pcgts) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 35f7898..18ae868 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -180,12 +180,7 @@ class Eynollah: def __init__( self, dir_models : str, - image_filename : Optional[str] = None, - image_pil : Optional[Image] = None, - image_filename_stem : Optional[str] = None, - overwrite : bool = False, dir_out : Optional[str] = None, - dir_in : Optional[str] = None, dir_of_cropped_images : Optional[str] = None, extract_only_images : bool =False, dir_of_layout : Optional[str] = None, @@ -209,24 +204,12 @@ class Eynollah: num_col_upper : Optional[int] = None, num_col_lower : Optional[int] = None, skip_layout_and_reading_order : bool = False, - override_dpi : Optional[int] = None, logger : Logger = None, - pcgts : Optional[OcrdPage] = None, ): if skip_layout_and_reading_order: textline_light = True self.light_version = light_version - if not dir_in: - if image_pil: - self._imgs = self._cache_images(image_pil=image_pil) - else: - self._imgs = self._cache_images(image_filename=image_filename) - if override_dpi: - self.dpi = override_dpi - self.image_filename = image_filename - self.overwrite = overwrite self.dir_out = dir_out - self.dir_in = dir_in self.dir_of_all = dir_of_all self.dir_save_page = dir_save_page self.reading_order_machine_based = reading_order_machine_based @@ -257,21 +240,6 @@ class Eynollah: self.num_col_lower = int(num_col_lower) else: self.num_col_lower = num_col_lower - if not dir_in: - self.plotter = None if not enable_plotting else EynollahPlotter( - dir_out=self.dir_out, - dir_of_all=dir_of_all, - dir_save_page=dir_save_page, - dir_of_deskewed=dir_of_deskewed, - dir_of_cropped_images=dir_of_cropped_images, - dir_of_layout=dir_of_layout, - image_filename_stem=Path(Path(image_filename).name).stem) - self.writer = EynollahXmlWriter( - dir_out=self.dir_out, - image_filename=self.image_filename, - curved_line=self.curved_line, - textline_light = self.textline_light, - pcgts=pcgts) self.logger = logger if logger else getLogger('eynollah') # for parallelization of CPU-intensive tasks: self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) @@ -370,7 +338,7 @@ class Eynollah: if self.tables: self.model_table = self.our_load_model(self.model_table_dir) - def _cache_images(self, image_filename=None, image_pil=None): + def cache_images(self, image_filename=None, image_pil=None, dpi=None): ret = {} t_c0 = time.time() if image_filename: @@ -388,13 +356,14 @@ class Eynollah: ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY) for prefix in ('', '_grayscale'): ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8) - return ret + self._imgs = ret + if dpi is not None: + self.dpi = dpi def reset_file_name_dir(self, image_filename): t_c = time.time() - self._imgs = self._cache_images(image_filename=image_filename) - self.image_filename = image_filename - + self.cache_images(image_filename=image_filename) + self.plotter = None if not self.enable_plotting else EynollahPlotter( dir_out=self.dir_out, dir_of_all=self.dir_of_all, @@ -403,10 +372,10 @@ class Eynollah: dir_of_cropped_images=self.dir_of_cropped_images, dir_of_layout=self.dir_of_layout, image_filename_stem=Path(Path(image_filename).name).stem) - + self.writer = EynollahXmlWriter( dir_out=self.dir_out, - image_filename=self.image_filename, + image_filename=image_filename, curved_line=self.curved_line, textline_light = self.textline_light) @@ -4224,30 +4193,49 @@ class Eynollah: return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem, contours_only_text_parent_rem, index_by_text_par_con_rem_sort) - def run(self): + def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False): """ Get image and scales, then extract the page of scanned image """ self.logger.debug("enter run") - t0_tot = time.time() - if not self.dir_in: - self.ls_imgs = [self.image_filename] + if dir_in: + self.ls_imgs = os.listdir(dir_in) + elif image_filename: + self.ls_imgs = [image_filename] + else: + raise ValueError("run requires either a single image filename or a directory") - for img_name in self.ls_imgs: - self.logger.info(img_name) + for img_filename in self.ls_imgs: + self.logger.info(img_filename) t0 = time.time() - if self.dir_in: - self.reset_file_name_dir(os.path.join(self.dir_in,img_name)) - #print("text region early -11 in %.1fs", time.time() - t0) - if os.path.exists(self.writer.output_filename): - if self.overwrite: - self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename) - else: - self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename) - continue + self.reset_file_name_dir(os.path.join(dir_in or "", img_filename)) + #print("text region early -11 in %.1fs", time.time() - t0) + if os.path.exists(self.writer.output_filename): + if overwrite: + self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename) + else: + self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename) + continue + + pcgts = self.run_single() + self.logger.info("Job done in %.1fs", time.time() - t0) + #print("Job done in %.1fs" % (time.time() - t0)) + if dir_in: + self.writer.write_pagexml(pcgts) + else: + return pcgts + + if dir_in: + self.logger.info("All jobs done in %.1fs", time.time() - t0_tot) + print("all Job done in %.1fs", time.time() - t0_tot) + + def run_single(self): + # conditional merely for indentation (= smaller diff) + if True: + t0 = time.time() img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) self.logger.info("Enhancing took %.1fs ", time.time() - t0) if self.extract_only_images: @@ -4260,12 +4248,7 @@ class Eynollah: cont_page, [], [], ocr_all_textlines) if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) - - if self.dir_in: - self.writer.write_pagexml(pcgts) - continue - else: - return pcgts + return pcgts if self.skip_layout_and_reading_order: _ ,_, _, textline_mask_tot_ea, img_bin_light = \ @@ -4307,11 +4290,7 @@ class Eynollah: all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines) - if self.dir_in: - self.writer.write_pagexml(pcgts) - continue - else: - return pcgts + return pcgts #print("text region early -1 in %.1fs", time.time() - t0) t1 = time.time() @@ -4363,12 +4342,7 @@ class Eynollah: pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], [], [], [], [], [], [], cont_page, [], [], ocr_all_textlines) - self.logger.info("Job done in %.1fs", time.time() - t1) - if self.dir_in: - self.writer.write_pagexml(pcgts) - continue - else: - return pcgts + return pcgts #print("text region early in %.1fs", time.time() - t0) t1 = time.time() @@ -4553,12 +4527,7 @@ class Eynollah: polygons_of_images, polygons_of_marginals, empty_marginals, empty_marginals, [], [], cont_page, polygons_lines_xml, contours_tables, []) - self.logger.info("Job done in %.1fs", time.time() - t0) - if self.dir_in: - self.writer.write_pagexml(pcgts) - continue - else: - return pcgts + return pcgts #print("text region early 3 in %.1fs", time.time() - t0) if self.light_version: @@ -4748,13 +4717,7 @@ class Eynollah: polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml, ocr_all_textlines) - self.logger.info("Job done in %.1fs", time.time() - t0) - #print("Job done in %.1fs", time.time() - t0) - if self.dir_in: - self.writer.write_pagexml(pcgts) - continue - else: - return pcgts + return pcgts else: contours_only_text_parent_h = None @@ -4834,22 +4797,9 @@ class Eynollah: all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines) - #print("Job done in %.1fs" % (time.time() - t0)) - self.logger.info("Job done in %.1fs", time.time() - t0) - if not self.dir_in: - return pcgts - #print("text region early 7 in %.1fs", time.time() - t0) + return pcgts + - if self.dir_in: - self.writer.write_pagexml(pcgts) - self.logger.info("Job done in %.1fs", time.time() - t0) - #print("Job done in %.1fs" % (time.time() - t0)) - - if self.dir_in: - self.logger.info("All jobs done in %.1fs", time.time() - t0_tot) - print("all Job done in %.1fs", time.time() - t0_tot) - - class Eynollah_ocr: def __init__( self, diff --git a/src/eynollah/processor.py b/src/eynollah/processor.py index ed409f4..8f99489 100644 --- a/src/eynollah/processor.py +++ b/src/eynollah/processor.py @@ -30,11 +30,7 @@ class EynollahProcessor(Processor): allow_scaling=self.parameter['allow_scaling'], headers_off=self.parameter['headers_off'], tables=self.parameter['tables'], - override_dpi=self.parameter['dpi'], - # trick Eynollah to do init independent of an image - dir_in="." ) - self.eynollah.dir_in = None self.eynollah.plotter = None def shutdown(self): @@ -81,9 +77,9 @@ class EynollahProcessor(Processor): image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original # FIXME: mask out already existing regions (incremental segmentation) - self.eynollah.image_filename = image_filename - self.eynollah._imgs = self.eynollah._cache_images( - image_pil=page_image + self.eynollah.cache_images( + image_pil=page_image, + dpi=self.parameter['dpi'], ) self.eynollah.writer = EynollahXmlWriter( dir_out=None, @@ -91,5 +87,5 @@ class EynollahProcessor(Processor): curved_line=self.eynollah.curved_line, textline_light=self.eynollah.textline_light, pcgts=pcgts) - self.eynollah.run() + self.eynollah.run_single() return result