From 6d55f297a55cf73bfb48bfe991e957638ef32eba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Mar 2026 10:15:51 +0100 Subject: [PATCH] =?UTF-8?q?run:=20use=20ProcessPoolExecutor=20for=20parall?= =?UTF-8?q?el=20`run=5Fsingle`=20across=20pages=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - reintroduce ProcessPoolExecutor (previously for parallel deskewing within pages) - wrap Eynollah instance into global, so (with forking) serialization can be avoided – same pattern as in core ocrd.Processor - move timing/logging into `run_single()`, respectively --- src/eynollah/eynollah.py | 48 +++++++++++++++++++++++++++------------ src/eynollah/processor.py | 3 ++- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6d86d29..fcc92f0 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -26,6 +26,8 @@ import time from typing import Optional from functools import partial from pathlib import Path +import multiprocessing as mp +from concurrent.futures import ProcessPoolExecutor, as_completed import gc import cv2 @@ -96,6 +98,13 @@ MAX_SLOPE = 999 KERNEL = np.ones((5, 5), np.uint8) +_instance = None +def _set_instance(instance): + global _instance + _instance = instance +def _run_single(*args, **kwargs): + return _instance.run_single(*args, **kwargs) + class Eynollah: def __init__( self, @@ -2261,35 +2270,41 @@ class Eynollah: ls_imgs = [os.path.join(dir_in, image_filename) for image_filename in filter(is_image_filename, os.listdir(dir_in))] + with ProcessPoolExecutor(mp_context=mp.get_context('fork'), + initializer=_set_instance, + initargs=(self,) + ) as exe: + jobs = {exe.submit(_run_single, img_filename, + dir_out=dir_out, + overwrite=overwrite): img_filename + for img_filename in ls_imgs} + for job in as_completed(jobs): + img_filename = jobs[job] + try: + job.result() + except: + self.logger.exception("Job %s failed", img_filename) + self.logger.info("All jobs done in %.1fs", time.time() - t0_tot) elif image_filename: - ls_imgs = [image_filename] - else: - raise ValueError("run requires either a single image filename or a directory") - - for img_filename in ls_imgs: - self.logger.info(img_filename) - t0 = time.time() - try: - self.run_single(img_filename, dir_out, overwrite=overwrite) - self.logger.info("Job done in %.1fs", time.time() - t0) + self.run_single(image_filename, dir_out=dir_out, overwrite=overwrite) except: self.logger.exception("Job failed") - - if dir_in: - self.logger.info("All jobs done in %.1fs", time.time() - t0_tot) + else: + raise ValueError("run requires either a single image filename or a directory") if self.enable_plotting: del self.plotter def run_single(self, img_filename: str, - dir_out: Optional[str], + dir_out: Optional[str] = None, overwrite: bool = False, img_pil=None, pcgts=None, ) -> None: t0 = time.time() + self.logger.info(img_filename) image = self.cache_images(image_filename=img_filename, image_pil=img_pil) writer = EynollahXmlWriter( @@ -2377,6 +2392,7 @@ class Eynollah: ) self.logger.info("Basic processing complete") writer.write_pagexml(pcgts) + self.logger.info("Job done in %.1fs", time.time() - t0) return #print("text region early -1 in %.1fs", time.time() - t0) @@ -2441,6 +2457,7 @@ class Eynollah: found_polygons_tables=[], ) writer.write_pagexml(pcgts) + self.logger.info("Job done in %.1fs", time.time() - t0) return #print("text region early in %.1fs", time.time() - t0) @@ -2724,9 +2741,9 @@ class Eynollah: found_polygons_tables=contours_tables ) writer.write_pagexml(pcgts) + self.logger.info("Job done in %.1fs", time.time() - t0) return - #print("text region early 3 in %.1fs", time.time() - t0) contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) contours_only_text_parent , contours_only_text_parent_d_ordered = self.filter_contours_inside_a_bigger_one( @@ -2931,4 +2948,5 @@ class Eynollah: ) writer.write_pagexml(pcgts) + self.logger.info("Job done in %.1fs", time.time() - t0) return diff --git a/src/eynollah/processor.py b/src/eynollah/processor.py index ba7f789..47fa770 100644 --- a/src/eynollah/processor.py +++ b/src/eynollah/processor.py @@ -76,7 +76,8 @@ class EynollahProcessor(Processor): image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original # FIXME: mask out already existing regions (incremental segmentation) - self.eynollah.run_single(image_filename, None, img_pil=page_image, pcgts=pcgts, + self.eynollah.run_single(image_filename, + img_pil=page_image, pcgts=pcgts, # ocrd.Processor will handle OCRD_EXISTING_OUTPUT more flexibly overwrite=True) return result