mirror of
https://github.com/qurator-spk/eynollah.git
synced 2026-03-24 08:02:45 +01:00
run: use ProcessPoolExecutor for parallel run_single across pages…
- reintroduce ProcessPoolExecutor (previously used for parallel deskewing within pages)
- wrap the Eynollah instance in a module-level global, so that (with forking) serialization can be avoided – the same pattern as in core ocrd.Processor
- move per-job timing/logging from `run()` into `run_single()`
This commit is contained in:
parent
96cfddf92d
commit
6d55f297a5
2 changed files with 35 additions and 16 deletions
|
|
@ -26,6 +26,8 @@ import time
|
|||
from typing import Optional
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
import multiprocessing as mp
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
import gc
|
||||
|
||||
import cv2
|
||||
|
|
@ -96,6 +98,13 @@ MAX_SLOPE = 999
|
|||
KERNEL = np.ones((5, 5), np.uint8)
|
||||
|
||||
|
||||
# Per-process slot for the object whose run_single() pool workers call;
# None until _set_instance() has stored an instance in this process.
_instance = None
|
||||
def _set_instance(instance):
    """Store ``instance`` in the module-level ``_instance`` slot.

    Used as the ``initializer`` of the ``ProcessPoolExecutor`` so that each
    worker process has the instance available as a global, and submitted
    jobs only need to carry their own arguments.

    Args:
        instance: Object whose ``run_single`` method the workers will call.
    """
    global _instance
    _instance = instance
|
||||
def _run_single(*args, **kwargs):
    """Run one job on the instance registered for this process.

    This is a plain module-level function so that it can be submitted to a
    ``ProcessPoolExecutor`` by reference; the instance itself is taken from
    the ``_instance`` global rather than being passed with every job.

    Args:
        *args: Positional arguments forwarded unchanged to
            ``_instance.run_single``.
        **kwargs: Keyword arguments forwarded unchanged to
            ``_instance.run_single``.

    Returns:
        Whatever ``_instance.run_single`` returns.

    Raises:
        RuntimeError: If no instance has been registered in this process
            (i.e. ``_set_instance`` was never called here).
    """
    if _instance is None:
        # Fail with a clear message instead of an opaque
        # "'NoneType' object has no attribute 'run_single'".
        raise RuntimeError(
            "_run_single called before _set_instance: "
            "no instance registered in this process")
    return _instance.run_single(*args, **kwargs)
|
||||
|
||||
class Eynollah:
|
||||
def __init__(
|
||||
self,
|
||||
|
|
@ -2261,35 +2270,41 @@ class Eynollah:
|
|||
ls_imgs = [os.path.join(dir_in, image_filename)
|
||||
for image_filename in filter(is_image_filename,
|
||||
os.listdir(dir_in))]
|
||||
elif image_filename:
|
||||
ls_imgs = [image_filename]
|
||||
else:
|
||||
raise ValueError("run requires either a single image filename or a directory")
|
||||
|
||||
for img_filename in ls_imgs:
|
||||
self.logger.info(img_filename)
|
||||
t0 = time.time()
|
||||
|
||||
with ProcessPoolExecutor(mp_context=mp.get_context('fork'),
|
||||
initializer=_set_instance,
|
||||
initargs=(self,)
|
||||
) as exe:
|
||||
jobs = {exe.submit(_run_single, img_filename,
|
||||
dir_out=dir_out,
|
||||
overwrite=overwrite): img_filename
|
||||
for img_filename in ls_imgs}
|
||||
for job in as_completed(jobs):
|
||||
img_filename = jobs[job]
|
||||
try:
|
||||
self.run_single(img_filename, dir_out, overwrite=overwrite)
|
||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||
job.result()
|
||||
except:
|
||||
self.logger.exception("Job %s failed", img_filename)
|
||||
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
|
||||
elif image_filename:
|
||||
try:
|
||||
self.run_single(image_filename, dir_out=dir_out, overwrite=overwrite)
|
||||
except:
|
||||
self.logger.exception("Job failed")
|
||||
|
||||
if dir_in:
|
||||
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
|
||||
else:
|
||||
raise ValueError("run requires either a single image filename or a directory")
|
||||
|
||||
if self.enable_plotting:
|
||||
del self.plotter
|
||||
|
||||
def run_single(self,
|
||||
img_filename: str,
|
||||
dir_out: Optional[str],
|
||||
dir_out: Optional[str] = None,
|
||||
overwrite: bool = False,
|
||||
img_pil=None,
|
||||
pcgts=None,
|
||||
) -> None:
|
||||
t0 = time.time()
|
||||
self.logger.info(img_filename)
|
||||
|
||||
image = self.cache_images(image_filename=img_filename, image_pil=img_pil)
|
||||
writer = EynollahXmlWriter(
|
||||
|
|
@ -2377,6 +2392,7 @@ class Eynollah:
|
|||
)
|
||||
self.logger.info("Basic processing complete")
|
||||
writer.write_pagexml(pcgts)
|
||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||
return
|
||||
|
||||
#print("text region early -1 in %.1fs", time.time() - t0)
|
||||
|
|
@ -2441,6 +2457,7 @@ class Eynollah:
|
|||
found_polygons_tables=[],
|
||||
)
|
||||
writer.write_pagexml(pcgts)
|
||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||
return
|
||||
|
||||
#print("text region early in %.1fs", time.time() - t0)
|
||||
|
|
@ -2724,9 +2741,9 @@ class Eynollah:
|
|||
found_polygons_tables=contours_tables
|
||||
)
|
||||
writer.write_pagexml(pcgts)
|
||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||
return
|
||||
|
||||
|
||||
#print("text region early 3 in %.1fs", time.time() - t0)
|
||||
contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent)
|
||||
contours_only_text_parent , contours_only_text_parent_d_ordered = self.filter_contours_inside_a_bigger_one(
|
||||
|
|
@ -2931,4 +2948,5 @@ class Eynollah:
|
|||
)
|
||||
|
||||
writer.write_pagexml(pcgts)
|
||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||
return
|
||||
|
|
|
|||
|
|
@ -76,7 +76,8 @@ class EynollahProcessor(Processor):
|
|||
image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
|
||||
result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
|
||||
# FIXME: mask out already existing regions (incremental segmentation)
|
||||
self.eynollah.run_single(image_filename, None, img_pil=page_image, pcgts=pcgts,
|
||||
self.eynollah.run_single(image_filename,
|
||||
img_pil=page_image, pcgts=pcgts,
|
||||
# ocrd.Processor will handle OCRD_EXISTING_OUTPUT more flexibly
|
||||
overwrite=True)
|
||||
return result
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue