mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-12-01 08:44:13 +01:00
WIP: reorganize OCR-D and start on ocrd-eynollah-ocr
This commit is contained in:
parent
0f410c2e7c
commit
acb91efe48
7 changed files with 44 additions and 31 deletions
|
|
@ -41,8 +41,9 @@ classifiers = [
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
eynollah = "eynollah.cli:main"
|
eynollah = "eynollah.cli:main"
|
||||||
eynollah-training = "eynollah.training.cli:main"
|
eynollah-training = "eynollah.training.cli:main"
|
||||||
ocrd-eynollah-segment = "eynollah.ocrd_cli:main"
|
ocrd-eynollah-segment = "eynollah.ocrd.ocrd_cli_layout:main"
|
||||||
ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:main"
|
ocrd-sbb-binarize = "eynollah.ocrd.ocrd_cli_binarize:main"
|
||||||
|
ocrd-eynollah-ocr = "eynollah.ocrd.ocrd_cli_ocr:main"
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
Homepage = "https://github.com/qurator-spk/eynollah"
|
Homepage = "https://github.com/qurator-spk/eynollah"
|
||||||
|
|
|
||||||
0
src/eynollah/ocrd/__init__.py
Normal file
0
src/eynollah/ocrd/__init__.py
Normal file
9
src/eynollah/ocrd/ocrd_cli_binarize.py
Normal file
9
src/eynollah/ocrd/ocrd_cli_binarize.py
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
from click import command
|
||||||
|
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
||||||
|
|
||||||
|
from .processor_binarize import SbbBinarizeProcessor
|
||||||
|
|
||||||
|
@command()
|
||||||
|
@ocrd_cli_options
|
||||||
|
def main(*args, **kwargs):
|
||||||
|
return ocrd_cli_wrap_processor(SbbBinarizeProcessor, *args, **kwargs)
|
||||||
|
|
@ -1,11 +1,9 @@
|
||||||
from .processor import EynollahProcessor
|
|
||||||
from click import command
|
from click import command
|
||||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
||||||
|
|
||||||
|
from .processor_layout import EynollahProcessor
|
||||||
|
|
||||||
@command()
|
@command()
|
||||||
@ocrd_cli_options
|
@ocrd_cli_options
|
||||||
def main(*args, **kwargs):
|
def main(*args, **kwargs):
|
||||||
return ocrd_cli_wrap_processor(EynollahProcessor, *args, **kwargs)
|
return ocrd_cli_wrap_processor(EynollahProcessor, *args, **kwargs)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
|
|
@ -1,29 +1,15 @@
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from PIL import Image
|
|
||||||
from frozendict import frozendict
|
from frozendict import frozendict
|
||||||
import numpy as np
|
|
||||||
import cv2
|
|
||||||
from click import command
|
|
||||||
|
|
||||||
from ocrd import Processor, OcrdPageResult, OcrdPageResultImage
|
from ocrd import Processor, OcrdPageResult, OcrdPageResultImage
|
||||||
from ocrd_models.ocrd_page import OcrdPage, AlternativeImageType
|
from ocrd_models.ocrd_page import OcrdPage, AlternativeImageType
|
||||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
|
||||||
|
|
||||||
from eynollah.model_zoo.model_zoo import EynollahModelZoo
|
from eynollah.model_zoo.model_zoo import EynollahModelZoo
|
||||||
|
|
||||||
from .sbb_binarize import SbbBinarizer
|
from ..sbb_binarize import SbbBinarizer
|
||||||
|
from ..utils.pil_cv2 import cv2pil, pil2cv
|
||||||
|
|
||||||
def cv2pil(img):
|
|
||||||
return Image.fromarray(img.astype('uint8'))
|
|
||||||
|
|
||||||
def pil2cv(img):
|
|
||||||
# from ocrd/workspace.py
|
|
||||||
color_conversion = cv2.COLOR_GRAY2BGR if img.mode in ('1', 'L') else cv2.COLOR_RGB2BGR
|
|
||||||
pil_as_np_array = np.array(img).astype('uint8') if img.mode == '1' else np.array(img)
|
|
||||||
return cv2.cvtColor(pil_as_np_array, color_conversion)
|
|
||||||
|
|
||||||
class SbbBinarizeProcessor(Processor):
|
class SbbBinarizeProcessor(Processor):
|
||||||
# already employs GPU (without singleton process atm)
|
# already employs GPU (without singleton process atm)
|
||||||
|
|
@ -103,12 +89,7 @@ class SbbBinarizeProcessor(Processor):
|
||||||
line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image), use_patches=True))
|
line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image), use_patches=True))
|
||||||
# update PAGE (reference the image file):
|
# update PAGE (reference the image file):
|
||||||
line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized')
|
line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized')
|
||||||
line.add_AlternativeImage(region_image_ref)
|
line.add_AlternativeImage(line_image_ref)
|
||||||
result.images.append(OcrdPageResultImage(line_image_bin, line.id + '.IMG-BIN', line_image_ref))
|
result.images.append(OcrdPageResultImage(line_image_bin, line.id + '.IMG-BIN', line_image_ref))
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@command()
|
|
||||||
@ocrd_cli_options
|
|
||||||
def main(*args, **kwargs):
|
|
||||||
return ocrd_cli_wrap_processor(SbbBinarizeProcessor, *args, **kwargs)
|
|
||||||
|
|
@ -3,9 +3,9 @@ from typing import Optional
|
||||||
from ocrd_models import OcrdPage
|
from ocrd_models import OcrdPage
|
||||||
from ocrd import OcrdPageResultImage, Processor, OcrdPageResult
|
from ocrd import OcrdPageResultImage, Processor, OcrdPageResult
|
||||||
|
|
||||||
from eynollah.model_zoo.model_zoo import EynollahModelZoo
|
from ..model_zoo.model_zoo import EynollahModelZoo
|
||||||
|
from ..eynollah import Eynollah
|
||||||
from .eynollah import Eynollah, EynollahXmlWriter
|
from ..writer import EynollahXmlWriter
|
||||||
|
|
||||||
class EynollahProcessor(Processor):
|
class EynollahProcessor(Processor):
|
||||||
# already employs background CPU multiprocessing per page
|
# already employs background CPU multiprocessing per page
|
||||||
24
src/eynollah/ocrd/processor_ocr.py
Normal file
24
src/eynollah/ocrd/processor_ocr.py
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
|
||||||
|
from functools import cached_property
|
||||||
|
from frozendict import frozendict
|
||||||
|
from ocrd import Processor
|
||||||
|
|
||||||
|
from ..model_zoo.model_zoo import EynollahModelZoo
|
||||||
|
|
||||||
|
|
||||||
|
class EynollahOcrProcessor(Processor):
|
||||||
|
# already employs GPU (without singleton process atm)
|
||||||
|
max_workers = 1
|
||||||
|
|
||||||
|
@cached_property
|
||||||
|
def executable(self):
|
||||||
|
return 'ocrd-eynollah-ocr'
|
||||||
|
|
||||||
|
def setup(self):
|
||||||
|
"""
|
||||||
|
Set up the model prior to processing.
|
||||||
|
"""
|
||||||
|
# resolve relative path via OCR-D ResourceManager
|
||||||
|
assert isinstance(self.parameter, frozendict)
|
||||||
|
model_zoo = EynollahModelZoo(basedir=self.parameter['model'])
|
||||||
|
raise NotImplementedError()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue