run: use ProcessPoolExecutor for parallel run_single across pages…

- reintroduce ProcessPoolExecutor
  (previously for parallel deskewing within pages)
- keep the Eynollah instance in a module-level global, so that (with forking)
  it does not need to be serialized – same pattern as in core ocrd.Processor
- move per-job timing/logging from `run()` into `run_single()`
This commit is contained in:
Robert Sachunsky 2026-03-13 10:15:51 +01:00
parent 96cfddf92d
commit 6d55f297a5
2 changed files with 35 additions and 16 deletions

View file

@ -26,6 +26,8 @@ import time
from typing import Optional from typing import Optional
from functools import partial from functools import partial
from pathlib import Path from pathlib import Path
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, as_completed
import gc import gc
import cv2 import cv2
@ -96,6 +98,13 @@ MAX_SLOPE = 999
KERNEL = np.ones((5, 5), np.uint8) KERNEL = np.ones((5, 5), np.uint8)
# Per-process holder for the object whose ``run_single`` is executed by pool
# workers. It stays ``None`` until ``_set_instance`` is called in the process.
_instance = None


def _set_instance(instance):
    """Install ``instance`` as this process's global worker instance.

    Meant to be used as the ``initializer`` of a process pool, so that
    submitted jobs only carry the arguments for ``run_single`` rather than
    the instance itself.

    Args:
        instance: object providing a ``run_single`` method.
    """
    global _instance
    _instance = instance


def _run_single(*args, **kwargs):
    """Forward a job to ``run_single`` of the installed worker instance.

    All positional and keyword arguments are passed through unchanged, and
    the result of ``run_single`` is returned as is.

    Raises:
        RuntimeError: if no instance has been installed in this process yet
            (i.e. ``_set_instance`` was never called here).
    """
    # Fail with a clear message instead of an opaque AttributeError on None.
    if _instance is None:
        raise RuntimeError(
            "_run_single() called before _set_instance(): "
            "no worker instance is installed in this process"
        )
    return _instance.run_single(*args, **kwargs)
class Eynollah: class Eynollah:
def __init__( def __init__(
self, self,
@ -2261,35 +2270,41 @@ class Eynollah:
ls_imgs = [os.path.join(dir_in, image_filename) ls_imgs = [os.path.join(dir_in, image_filename)
for image_filename in filter(is_image_filename, for image_filename in filter(is_image_filename,
os.listdir(dir_in))] os.listdir(dir_in))]
elif image_filename: with ProcessPoolExecutor(mp_context=mp.get_context('fork'),
ls_imgs = [image_filename] initializer=_set_instance,
else: initargs=(self,)
raise ValueError("run requires either a single image filename or a directory") ) as exe:
jobs = {exe.submit(_run_single, img_filename,
for img_filename in ls_imgs: dir_out=dir_out,
self.logger.info(img_filename) overwrite=overwrite): img_filename
t0 = time.time() for img_filename in ls_imgs}
for job in as_completed(jobs):
img_filename = jobs[job]
try: try:
self.run_single(img_filename, dir_out, overwrite=overwrite) job.result()
self.logger.info("Job done in %.1fs", time.time() - t0) except:
self.logger.exception("Job %s failed", img_filename)
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
elif image_filename:
try:
self.run_single(image_filename, dir_out=dir_out, overwrite=overwrite)
except: except:
self.logger.exception("Job failed") self.logger.exception("Job failed")
else:
if dir_in: raise ValueError("run requires either a single image filename or a directory")
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
if self.enable_plotting: if self.enable_plotting:
del self.plotter del self.plotter
def run_single(self, def run_single(self,
img_filename: str, img_filename: str,
dir_out: Optional[str], dir_out: Optional[str] = None,
overwrite: bool = False, overwrite: bool = False,
img_pil=None, img_pil=None,
pcgts=None, pcgts=None,
) -> None: ) -> None:
t0 = time.time() t0 = time.time()
self.logger.info(img_filename)
image = self.cache_images(image_filename=img_filename, image_pil=img_pil) image = self.cache_images(image_filename=img_filename, image_pil=img_pil)
writer = EynollahXmlWriter( writer = EynollahXmlWriter(
@ -2377,6 +2392,7 @@ class Eynollah:
) )
self.logger.info("Basic processing complete") self.logger.info("Basic processing complete")
writer.write_pagexml(pcgts) writer.write_pagexml(pcgts)
self.logger.info("Job done in %.1fs", time.time() - t0)
return return
#print("text region early -1 in %.1fs", time.time() - t0) #print("text region early -1 in %.1fs", time.time() - t0)
@ -2441,6 +2457,7 @@ class Eynollah:
found_polygons_tables=[], found_polygons_tables=[],
) )
writer.write_pagexml(pcgts) writer.write_pagexml(pcgts)
self.logger.info("Job done in %.1fs", time.time() - t0)
return return
#print("text region early in %.1fs", time.time() - t0) #print("text region early in %.1fs", time.time() - t0)
@ -2724,9 +2741,9 @@ class Eynollah:
found_polygons_tables=contours_tables found_polygons_tables=contours_tables
) )
writer.write_pagexml(pcgts) writer.write_pagexml(pcgts)
self.logger.info("Job done in %.1fs", time.time() - t0)
return return
#print("text region early 3 in %.1fs", time.time() - t0) #print("text region early 3 in %.1fs", time.time() - t0)
contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent)
contours_only_text_parent , contours_only_text_parent_d_ordered = self.filter_contours_inside_a_bigger_one( contours_only_text_parent , contours_only_text_parent_d_ordered = self.filter_contours_inside_a_bigger_one(
@ -2931,4 +2948,5 @@ class Eynollah:
) )
writer.write_pagexml(pcgts) writer.write_pagexml(pcgts)
self.logger.info("Job done in %.1fs", time.time() - t0)
return return

View file

@ -76,7 +76,8 @@ class EynollahProcessor(Processor):
image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
# FIXME: mask out already existing regions (incremental segmentation) # FIXME: mask out already existing regions (incremental segmentation)
self.eynollah.run_single(image_filename, None, img_pil=page_image, pcgts=pcgts, self.eynollah.run_single(image_filename,
img_pil=page_image, pcgts=pcgts,
# ocrd.Processor will handle OCRD_EXISTING_OUTPUT more flexibly # ocrd.Processor will handle OCRD_EXISTING_OUTPUT more flexibly
overwrite=True) overwrite=True)
return result return result