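"""OCR-D processor interface for Eynollah layout analysis.

The processor is registered as the ``ocrd-eynollah-segment`` command-line tool
(cf. ``executable`` below). An illustrative workflow invocation, assuming the
standard OCR-D CLI conventions, with placeholder fileGrp names and a
placeholder value for the ``models`` parameter, would be::

    ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models /path/to/models
"""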
from functools import cached_property
from typing import Optional
from ocrd_models import OcrdPage
from ocrd import OcrdPageResultImage, Processor, OcrdPageResult

from .eynollah import Eynollah, EynollahXmlWriter

class EynollahProcessor(Processor):
    # Eynollah already employs background CPU multiprocessing per page,
    # and already employs the GPU (without a singleton process at the moment),
    # so keep OCR-D's own page-level parallelism at a single worker:
    max_workers = 1

    @cached_property
    def executable(self) -> str:
        return 'ocrd-eynollah-segment'

    def setup(self) -> None:
        assert self.parameter
        if self.parameter['textline_light'] != self.parameter['light_version']:
            raise ValueError("Error: You must set or unset both parameter 'textline_light' (to enable light textline detection), "
                             "and parameter 'light_version' (faster+simpler method for main region detection and deskewing)")
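        # instantiate the Eynollah engine once here; process_page_pcgts below reuses it for every page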
        self.eynollah = Eynollah(
            self.resolve_resource(self.parameter['models']),
            allow_enhancement=self.parameter['allow_enhancement'],
            curved_line=self.parameter['curved_line'],
            right2left=self.parameter['right_to_left'],
            reading_order_machine_based=self.parameter['reading_order_machine_based'],
            ignore_page_extraction=self.parameter['ignore_page_extraction'],
            light_version=self.parameter['light_version'],
            textline_light=self.parameter['textline_light'],
            full_layout=self.parameter['full_layout'],
            allow_scaling=self.parameter['allow_scaling'],
            headers_off=self.parameter['headers_off'],
            tables=self.parameter['tables'],
            logger=self.logger
        )
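        # disable the optional plotter (no debug/visualization output in the processor context)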
        self.eynollah.plotter = None

    def shutdown(self):
        if hasattr(self, 'eynollah'):
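            # drop the Eynollah instance and, with it, the loaded models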
            del self.eynollah

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        """
        Performs cropping, region and line segmentation with Eynollah.

        For each page, open and deserialize the PAGE input file (from an
        existing PAGE file in the input fileGrp, or generated from an image
        file). Retrieve its page-level image (ignoring annotation that
        already added `binarized`, `cropped` or `deskewed` features).

        Set up Eynollah to detect regions and lines, and add each of them
        to the page.

        \b
        - If ``tables``, try to detect table blocks and add them as TableRegion.
        - If ``full_layout``, then in addition to paragraphs and marginals, also
          try to detect drop capitals and headings.
        - If ``ignore_page_extraction``, then attempt no cropping of the page.
        - If ``curved_line``, then compute contour polygons for text lines
          instead of simple bounding boxes.
        - If ``reading_order_machine_based``, then detect reading order via a
          data-driven model instead of geometric heuristics.

        Produce a new output file by serialising the resulting hierarchy.
        """
        assert input_pcgts
        assert input_pcgts[0]
        assert self.parameter
        pcgts = input_pcgts[0]
        result = OcrdPageResult(pcgts)
        page = pcgts.get_Page()
        page_image, _, _ = self.workspace.image_from_page(
            page, page_id,
            # avoid any features that would change the coordinate system: cropped,deskewed
            # (the PAGE builder merely adds regions, so afterwards we would not know which to transform)
            # also avoid binarization as models usually fare better on grayscale/RGB
            feature_filter='cropped,deskewed,binarized')
        if hasattr(page_image, 'filename'):
            image_filename = page_image.filename
        else:
            image_filename = "dummy"  # will be replaced by ocrd.Processor.process_page_file
        result.images.append(OcrdPageResultImage(page_image, '.IMG', page))  # mark as new original
        # FIXME: mask out already existing regions (incremental segmentation)
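        # hand the page image and the 'dpi' parameter over to Eynollah's image cache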
        self.eynollah.cache_images(
            image_pil=page_image,
            dpi=self.parameter['dpi'],
        )
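        # write results directly into the existing PAGE-XML tree (pcgts)
        # rather than to a separate output directory (hence dir_out=None)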
        self.eynollah.writer = EynollahXmlWriter(
            dir_out=None,
            image_filename=image_filename,
            curved_line=self.eynollah.curved_line,
            textline_light=self.eynollah.textline_light,
            pcgts=pcgts)
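        # run the full segmentation pipeline on the cached page image;
        # detected regions and lines end up in pcgts via the writer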
        self.eynollah.run_single()
        return result