mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-09-17 21:29:56 +02:00
new page extraction model integration
This commit is contained in:
commit
52cb0d9fac
4 changed files with 19 additions and 12 deletions
|
@ -36,6 +36,8 @@ COPY . .
|
|||
COPY ocrd-tool.json .
|
||||
# prepackage ocrd-tool.json as ocrd-all-tool.json
|
||||
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
|
||||
# prepackage ocrd-all-module-dir.json
|
||||
RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json
|
||||
# install everything and reduce image size
|
||||
RUN make install EXTRAS=OCR && rm -rf /build/eynollah
|
||||
# smoke test
|
||||
|
|
7
Makefile
7
Makefile
|
@ -3,8 +3,9 @@ PIP ?= pip3
|
|||
EXTRAS ?=
|
||||
|
||||
# DOCKER_BASE_IMAGE = artefakt.dev.sbb.berlin:5000/sbb/ocrd_core:v2.68.0
|
||||
DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda-tf2:v3.3.0
|
||||
DOCKER_TAG = ocrd/eynollah
|
||||
DOCKER_BASE_IMAGE ?= docker.io/ocrd/core-cuda-tf2:latest
|
||||
DOCKER_TAG ?= ocrd/eynollah
|
||||
DOCKER ?= docker
|
||||
|
||||
#SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz
|
||||
#SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz
|
||||
|
@ -117,7 +118,7 @@ coverage:
|
|||
|
||||
# Build docker image
|
||||
docker:
|
||||
docker build \
|
||||
$(DOCKER) build \
|
||||
--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
|
||||
--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
|
||||
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
"textline_light": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Light version need textline light"
|
||||
"description": "Light version needs textline light. If this parameter is set to true, this tool will try to return contours of textlines instead of rectangular bounding boxes of textlines, using a faster method."
|
||||
},
|
||||
"tables": {
|
||||
"type": "boolean",
|
||||
|
@ -65,11 +65,6 @@
|
|||
"default": false,
|
||||
"description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not."
|
||||
},
|
||||
"textline_light": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method."
|
||||
},
|
||||
"right_to_left": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
|
@ -79,6 +74,11 @@
|
|||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "ignore the special role of headings during reading order detection"
|
||||
},
|
||||
"reading_order_machine_based": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "use data-driven (rather than rule-based) reading order detection"
|
||||
}
|
||||
},
|
||||
"resources": [
|
||||
|
|
|
@ -14,15 +14,17 @@ class EynollahProcessor(Processor):
|
|||
return 'ocrd-eynollah-segment'
|
||||
|
||||
def setup(self) -> None:
|
||||
if self.parameter['textline_light'] and not self.parameter['light_version']:
|
||||
raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection, "
|
||||
"but parameter 'light_version' is not enabled")
|
||||
assert self.parameter
|
||||
if self.parameter['textline_light'] != self.parameter['light_version']:
|
||||
raise ValueError("Error: You must set or unset both parameter 'textline_light' (to enable light textline detection), "
|
||||
"and parameter 'light_version' (faster+simpler method for main region detection and deskewing)")
|
||||
self.eynollah = Eynollah(
|
||||
self.resolve_resource(self.parameter['models']),
|
||||
logger=self.logger,
|
||||
allow_enhancement=self.parameter['allow_enhancement'],
|
||||
curved_line=self.parameter['curved_line'],
|
||||
right2left=self.parameter['right_to_left'],
|
||||
reading_order_machine_based=self.parameter['reading_order_machine_based'],
|
||||
ignore_page_extraction=self.parameter['ignore_page_extraction'],
|
||||
light_version=self.parameter['light_version'],
|
||||
textline_light=self.parameter['textline_light'],
|
||||
|
@ -56,6 +58,8 @@ class EynollahProcessor(Processor):
|
|||
- If ``ignore_page_extraction``, then attempt no cropping of the page.
|
||||
- If ``curved_line``, then compute contour polygons for text lines
|
||||
instead of simple bounding boxes.
|
||||
- If ``reading_order_machine_based``, then detect reading order via
|
||||
data-driven model instead of geometrical heuristics.
|
||||
|
||||
Produce a new output file by serialising the resulting hierarchy.
|
||||
"""
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue