mirror of
https://github.com/qurator-spk/eynollah.git
synced 2026-03-24 08:02:45 +01:00
run: use ProcessPoolExecutor for parallel run_single across pages…
- reintroduce ProcessPoolExecutor (previously used for parallel deskewing within pages) - wrap the Eynollah instance in a module-level global so that (with forking) serialization can be avoided – the same pattern as in core ocrd.Processor - move timing/logging into `run_single()`, respectively
This commit is contained in:
parent
96cfddf92d
commit
6d55f297a5
2 changed files with 35 additions and 16 deletions
|
|
@ -26,6 +26,8 @@ import time
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import multiprocessing as mp
|
||||||
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||||
import gc
|
import gc
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
|
|
@ -96,6 +98,13 @@ MAX_SLOPE = 999
|
||||||
KERNEL = np.ones((5, 5), np.uint8)
|
KERNEL = np.ones((5, 5), np.uint8)
|
||||||
|
|
||||||
|
|
||||||
|
_instance = None
|
||||||
|
def _set_instance(instance):
|
||||||
|
global _instance
|
||||||
|
_instance = instance
|
||||||
|
def _run_single(*args, **kwargs):
|
||||||
|
return _instance.run_single(*args, **kwargs)
|
||||||
|
|
||||||
class Eynollah:
|
class Eynollah:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|
@ -2261,35 +2270,41 @@ class Eynollah:
|
||||||
ls_imgs = [os.path.join(dir_in, image_filename)
|
ls_imgs = [os.path.join(dir_in, image_filename)
|
||||||
for image_filename in filter(is_image_filename,
|
for image_filename in filter(is_image_filename,
|
||||||
os.listdir(dir_in))]
|
os.listdir(dir_in))]
|
||||||
|
with ProcessPoolExecutor(mp_context=mp.get_context('fork'),
|
||||||
|
initializer=_set_instance,
|
||||||
|
initargs=(self,)
|
||||||
|
) as exe:
|
||||||
|
jobs = {exe.submit(_run_single, img_filename,
|
||||||
|
dir_out=dir_out,
|
||||||
|
overwrite=overwrite): img_filename
|
||||||
|
for img_filename in ls_imgs}
|
||||||
|
for job in as_completed(jobs):
|
||||||
|
img_filename = jobs[job]
|
||||||
|
try:
|
||||||
|
job.result()
|
||||||
|
except:
|
||||||
|
self.logger.exception("Job %s failed", img_filename)
|
||||||
|
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
|
||||||
elif image_filename:
|
elif image_filename:
|
||||||
ls_imgs = [image_filename]
|
|
||||||
else:
|
|
||||||
raise ValueError("run requires either a single image filename or a directory")
|
|
||||||
|
|
||||||
for img_filename in ls_imgs:
|
|
||||||
self.logger.info(img_filename)
|
|
||||||
t0 = time.time()
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.run_single(img_filename, dir_out, overwrite=overwrite)
|
self.run_single(image_filename, dir_out=dir_out, overwrite=overwrite)
|
||||||
self.logger.info("Job done in %.1fs", time.time() - t0)
|
|
||||||
except:
|
except:
|
||||||
self.logger.exception("Job failed")
|
self.logger.exception("Job failed")
|
||||||
|
else:
|
||||||
if dir_in:
|
raise ValueError("run requires either a single image filename or a directory")
|
||||||
self.logger.info("All jobs done in %.1fs", time.time() - t0_tot)
|
|
||||||
|
|
||||||
if self.enable_plotting:
|
if self.enable_plotting:
|
||||||
del self.plotter
|
del self.plotter
|
||||||
|
|
||||||
def run_single(self,
|
def run_single(self,
|
||||||
img_filename: str,
|
img_filename: str,
|
||||||
dir_out: Optional[str],
|
dir_out: Optional[str] = None,
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
img_pil=None,
|
img_pil=None,
|
||||||
pcgts=None,
|
pcgts=None,
|
||||||
) -> None:
|
) -> None:
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
self.logger.info(img_filename)
|
||||||
|
|
||||||
image = self.cache_images(image_filename=img_filename, image_pil=img_pil)
|
image = self.cache_images(image_filename=img_filename, image_pil=img_pil)
|
||||||
writer = EynollahXmlWriter(
|
writer = EynollahXmlWriter(
|
||||||
|
|
@ -2377,6 +2392,7 @@ class Eynollah:
|
||||||
)
|
)
|
||||||
self.logger.info("Basic processing complete")
|
self.logger.info("Basic processing complete")
|
||||||
writer.write_pagexml(pcgts)
|
writer.write_pagexml(pcgts)
|
||||||
|
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||||
return
|
return
|
||||||
|
|
||||||
#print("text region early -1 in %.1fs", time.time() - t0)
|
#print("text region early -1 in %.1fs", time.time() - t0)
|
||||||
|
|
@ -2441,6 +2457,7 @@ class Eynollah:
|
||||||
found_polygons_tables=[],
|
found_polygons_tables=[],
|
||||||
)
|
)
|
||||||
writer.write_pagexml(pcgts)
|
writer.write_pagexml(pcgts)
|
||||||
|
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||||
return
|
return
|
||||||
|
|
||||||
#print("text region early in %.1fs", time.time() - t0)
|
#print("text region early in %.1fs", time.time() - t0)
|
||||||
|
|
@ -2724,9 +2741,9 @@ class Eynollah:
|
||||||
found_polygons_tables=contours_tables
|
found_polygons_tables=contours_tables
|
||||||
)
|
)
|
||||||
writer.write_pagexml(pcgts)
|
writer.write_pagexml(pcgts)
|
||||||
|
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
#print("text region early 3 in %.1fs", time.time() - t0)
|
#print("text region early 3 in %.1fs", time.time() - t0)
|
||||||
contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent)
|
contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent)
|
||||||
contours_only_text_parent , contours_only_text_parent_d_ordered = self.filter_contours_inside_a_bigger_one(
|
contours_only_text_parent , contours_only_text_parent_d_ordered = self.filter_contours_inside_a_bigger_one(
|
||||||
|
|
@ -2931,4 +2948,5 @@ class Eynollah:
|
||||||
)
|
)
|
||||||
|
|
||||||
writer.write_pagexml(pcgts)
|
writer.write_pagexml(pcgts)
|
||||||
|
self.logger.info("Job done in %.1fs", time.time() - t0)
|
||||||
return
|
return
|
||||||
|
|
|
||||||
|
|
@ -76,7 +76,8 @@ class EynollahProcessor(Processor):
|
||||||
image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
|
image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
|
||||||
result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
|
result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
|
||||||
# FIXME: mask out already existing regions (incremental segmentation)
|
# FIXME: mask out already existing regions (incremental segmentation)
|
||||||
self.eynollah.run_single(image_filename, None, img_pil=page_image, pcgts=pcgts,
|
self.eynollah.run_single(image_filename,
|
||||||
|
img_pil=page_image, pcgts=pcgts,
|
||||||
# ocrd.Processor will handle OCRD_EXISTING_OUTPUT more flexibly
|
# ocrd.Processor will handle OCRD_EXISTING_OUTPUT more flexibly
|
||||||
overwrite=True)
|
overwrite=True)
|
||||||
return result
|
return result
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue