commit
e0d38517d3
@ -0,0 +1,6 @@
|
||||
tests
|
||||
dist
|
||||
build
|
||||
env*
|
||||
*.egg-info
|
||||
models_eynollah*
|
@ -0,0 +1,44 @@
|
||||
name: CD
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ "master" ]
|
||||
workflow_dispatch: # run manually
|
||||
|
||||
jobs:
|
||||
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
# we need tags for docker version tagging
|
||||
fetch-tags: true
|
||||
fetch-depth: 0
|
||||
- # Activate cache export feature to reduce build time of images
|
||||
name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Login to GitHub Container Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERIO_USERNAME }}
|
||||
password: ${{ secrets.DOCKERIO_PASSWORD }}
|
||||
- name: Build the Docker image
|
||||
# build both tags at the same time
|
||||
run: make docker DOCKER_TAG="docker.io/ocrd/eynollah -t ghcr.io/qurator-spk/eynollah"
|
||||
- name: Test the Docker image
|
||||
run: docker run --rm ocrd/eynollah ocrd-eynollah-segment -h
|
||||
- name: Push to Dockerhub
|
||||
run: docker push docker.io/ocrd/eynollah
|
||||
- name: Push to Github Container Registry
|
||||
run: docker push ghcr.io/qurator-spk/eynollah
|
File diff suppressed because it is too large
Load Diff
@ -1,47 +0,0 @@
|
||||
{
|
||||
"version": "0.1.0",
|
||||
"git_url": "https://github.com/qurator-spk/sbb_binarization",
|
||||
"tools": {
|
||||
"ocrd-sbb-binarize": {
|
||||
"executable": "ocrd-sbb-binarize",
|
||||
"description": "Pixelwise binarization with selectional auto-encoders in Keras",
|
||||
"categories": ["Image preprocessing"],
|
||||
"steps": ["preprocessing/optimization/binarization"],
|
||||
"input_file_grp": [],
|
||||
"output_file_grp": [],
|
||||
"parameters": {
|
||||
"operation_level": {
|
||||
"type": "string",
|
||||
"enum": ["page", "region"],
|
||||
"default": "page",
|
||||
"description": "PAGE XML hierarchy level to operate on"
|
||||
},
|
||||
"model": {
|
||||
"description": "Directory containing HDF5 or SavedModel/ProtoBuf models. Can be an absolute path or a path relative to the OCR-D resource location, the current working directory or the $SBB_BINARIZE_DATA environment variable (if set)",
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"content-type": "text/directory",
|
||||
"required": true
|
||||
}
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip",
|
||||
"name": "default",
|
||||
"type": "archive",
|
||||
"path_in_archive": "saved_model_2020_01_16",
|
||||
"size": 563147331,
|
||||
"description": "default models provided by github.com/qurator-spk (SavedModel format)"
|
||||
},
|
||||
{
|
||||
"url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip",
|
||||
"name": "default-2021-03-09",
|
||||
"type": "archive",
|
||||
"path_in_archive": ".",
|
||||
"size": 133230419,
|
||||
"description": "updated default models provided by github.com/qurator-spk (SavedModel format)"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
@ -1,71 +1,91 @@
|
||||
from json import loads
|
||||
from pkg_resources import resource_string
|
||||
from tempfile import NamedTemporaryFile
|
||||
from pathlib import Path
|
||||
from os.path import join
|
||||
from typing import Optional
|
||||
from ocrd_models import OcrdPage
|
||||
from ocrd import Processor, OcrdPageResult
|
||||
|
||||
from PIL import Image
|
||||
from .eynollah import Eynollah, EynollahXmlWriter
|
||||
|
||||
from ocrd import Processor
|
||||
from ocrd_modelfactory import page_from_file, exif_from_filename
|
||||
from ocrd_models import OcrdFile, OcrdExif
|
||||
from ocrd_models.ocrd_page import to_xml
|
||||
from ocrd_utils import (
|
||||
getLogger,
|
||||
MIMETYPE_PAGE,
|
||||
assert_file_grp_cardinality,
|
||||
make_file_id
|
||||
)
|
||||
class EynollahProcessor(Processor):
|
||||
# already employs background CPU multiprocessing per page
|
||||
# already employs GPU (without singleton process atm)
|
||||
max_workers = 1
|
||||
|
||||
from .eynollah import Eynollah
|
||||
from .utils.pil_cv2 import pil2cv
|
||||
@property
|
||||
def executable(self):
|
||||
return 'ocrd-eynollah-segment'
|
||||
|
||||
OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
|
||||
def setup(self) -> None:
|
||||
if self.parameter['textline_light'] and not self.parameter['light_version']:
|
||||
raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection, "
|
||||
"but parameter 'light_version' is not enabled")
|
||||
self.eynollah = Eynollah(
|
||||
self.resolve_resource(self.parameter['models']),
|
||||
logger=self.logger,
|
||||
allow_enhancement=self.parameter['allow_enhancement'],
|
||||
curved_line=self.parameter['curved_line'],
|
||||
right2left=self.parameter['right_to_left'],
|
||||
ignore_page_extraction=self.parameter['ignore_page_extraction'],
|
||||
light_version=self.parameter['light_version'],
|
||||
textline_light=self.parameter['textline_light'],
|
||||
full_layout=self.parameter['full_layout'],
|
||||
allow_scaling=self.parameter['allow_scaling'],
|
||||
headers_off=self.parameter['headers_off'],
|
||||
tables=self.parameter['tables'],
|
||||
)
|
||||
self.eynollah.plotter = None
|
||||
|
||||
class EynollahProcessor(Processor):
|
||||
def shutdown(self):
|
||||
if hasattr(self, 'eynollah'):
|
||||
del self.eynollah
|
||||
|
||||
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
|
||||
"""
|
||||
Performs cropping, region and line segmentation with Eynollah.
|
||||
|
||||
For each page, open and deserialize PAGE input file (from existing
|
||||
PAGE file in the input fileGrp, or generated from image file).
|
||||
Retrieve its respective page-level image (ignoring annotation that
|
||||
already added `binarized`, `cropped` or `deskewed` features).
|
||||
|
||||
Set up Eynollah to detect regions and lines, and add each one to the
|
||||
page, respectively.
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment']
|
||||
kwargs['version'] = OCRD_TOOL['version']
|
||||
super().__init__(*args, **kwargs)
|
||||
\b
|
||||
- If ``tables``, try to detect table blocks and add them as TableRegion.
|
||||
- If ``full_layout``, then in addition to paragraphs and marginals, also
|
||||
try to detect drop capitals and headings.
|
||||
- If ``ignore_page_extraction``, then attempt no cropping of the page.
|
||||
- If ``curved_line``, then compute contour polygons for text lines
|
||||
instead of simple bounding boxes.
|
||||
|
||||
def process(self):
|
||||
LOG = getLogger('eynollah')
|
||||
assert_file_grp_cardinality(self.input_file_grp, 1)
|
||||
assert_file_grp_cardinality(self.output_file_grp, 1)
|
||||
for n, input_file in enumerate(self.input_files):
|
||||
page_id = input_file.pageId or input_file.ID
|
||||
LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files))
|
||||
pcgts = page_from_file(self.workspace.download_file(input_file))
|
||||
LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight)
|
||||
self.add_metadata(pcgts)
|
||||
page = pcgts.get_Page()
|
||||
# XXX loses DPI information
|
||||
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
|
||||
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename
|
||||
eynollah_kwargs = {
|
||||
'dir_models': self.resolve_resource(self.parameter['models']),
|
||||
'dir_out': self.output_file_grp,
|
||||
'allow_enhancement': False,
|
||||
'curved_line': self.parameter['curved_line'],
|
||||
'full_layout': self.parameter['full_layout'],
|
||||
'allow_scaling': self.parameter['allow_scaling'],
|
||||
'light_version': self.parameter['light_version'],
|
||||
'textline_light': self.parameter['textline_light'],
|
||||
'headers_off': self.parameter['headers_off'],
|
||||
'tables': self.parameter['tables'],
|
||||
'override_dpi': self.parameter['dpi'],
|
||||
'logger': LOG,
|
||||
'pcgts': pcgts,
|
||||
'image_filename': image_filename
|
||||
}
|
||||
Eynollah(**eynollah_kwargs).run()
|
||||
file_id = make_file_id(input_file, self.output_file_grp)
|
||||
pcgts.set_pcGtsId(file_id)
|
||||
self.workspace.add_file(
|
||||
ID=file_id,
|
||||
file_grp=self.output_file_grp,
|
||||
pageId=page_id,
|
||||
mimetype=MIMETYPE_PAGE,
|
||||
local_filename=join(self.output_file_grp, file_id) + '.xml',
|
||||
content=to_xml(pcgts))
|
||||
Produce a new output file by serialising the resulting hierarchy.
|
||||
"""
|
||||
assert input_pcgts
|
||||
assert input_pcgts[0]
|
||||
assert self.parameter
|
||||
pcgts = input_pcgts[0]
|
||||
result = OcrdPageResult(pcgts)
|
||||
page = pcgts.get_Page()
|
||||
page_image, _, _ = self.workspace.image_from_page(
|
||||
page, page_id,
|
||||
# avoid any features that would change the coordinate system: cropped,deskewed
|
||||
# (the PAGE builder merely adds regions, so afterwards we would not know which to transform)
|
||||
# also avoid binarization as models usually fare better on grayscale/RGB
|
||||
feature_filter='cropped,deskewed,binarized')
|
||||
if hasattr(page_image, 'filename'):
|
||||
image_filename = page_image.filename
|
||||
else:
|
||||
image_filename = "dummy" # will be replaced by ocrd.Processor.process_page_file
|
||||
result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) # mark as new original
|
||||
# FIXME: mask out already existing regions (incremental segmentation)
|
||||
self.eynollah.cache_images(
|
||||
image_pil=page_image,
|
||||
dpi=self.parameter['dpi'],
|
||||
)
|
||||
self.eynollah.writer = EynollahXmlWriter(
|
||||
dir_out=None,
|
||||
image_filename=image_filename,
|
||||
curved_line=self.eynollah.curved_line,
|
||||
textline_light=self.eynollah.textline_light,
|
||||
pcgts=pcgts)
|
||||
self.eynollah.run_single()
|
||||
return result
|
||||
|
Binary file not shown.
Loading…
Reference in New Issue