new page extraction model integration

vahidrezanezhad 2025-09-15 13:38:23 +02:00
commit 52cb0d9fac
4 changed files with 19 additions and 12 deletions

Dockerfile

@@ -36,6 +36,8 @@ COPY . .
 COPY ocrd-tool.json .
 # prepackage ocrd-tool.json as ocrd-all-tool.json
 RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
+# prepackage ocrd-all-module-dir.json
+RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json
 # install everything and reduce image size
 RUN make install EXTRAS=OCR && rm -rf /build/eynollah
 # smoke test
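A quick way to confirm the prepackaged file ends up next to the bashlib is to print it from inside the built image. This is only a sketch: it assumes the image was tagged ocrd/eynollah as in the Makefile below, and uses --entrypoint to bypass any default entrypoint the image may define.

    # print the prepackaged module-dir mapping from inside the image
    docker run --rm --entrypoint sh ocrd/eynollah \
        -c 'cat $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json'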

Makefile

@@ -3,8 +3,9 @@ PIP ?= pip3
 EXTRAS ?=
 
 # DOCKER_BASE_IMAGE = artefakt.dev.sbb.berlin:5000/sbb/ocrd_core:v2.68.0
-DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda-tf2:v3.3.0
-DOCKER_TAG = ocrd/eynollah
+DOCKER_BASE_IMAGE ?= docker.io/ocrd/core-cuda-tf2:latest
+DOCKER_TAG ?= ocrd/eynollah
+DOCKER ?= docker
 
 #SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz
 #SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz
@@ -117,7 +118,7 @@ coverage:
 
 # Build docker image
 docker:
-	docker build \
+	$(DOCKER) build \
 	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
 	--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
 	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
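Because these variables are now assigned with ?=, they can be overridden from the environment or the make command line without editing the Makefile, for example to build with podman under a custom tag. The values below are examples only.

    # override the new ?= defaults at build time
    make docker DOCKER=podman DOCKER_TAG=ocrd/eynollah:dev \
        DOCKER_BASE_IMAGE=docker.io/ocrd/core-cuda-tf2:latest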

ocrd-tool.json

@@ -38,7 +38,7 @@
       "textline_light": {
         "type": "boolean",
         "default": true,
-        "description": "Light version need textline light"
+        "description": "Light version need textline light. If this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method."
       },
       "tables": {
         "type": "boolean",
@@ -65,11 +65,6 @@
         "default": false,
         "description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not."
       },
-      "textline_light": {
-        "type": "boolean",
-        "default": false,
-        "description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method."
-      },
       "right_to_left": {
         "type": "boolean",
         "default": false,
@@ -79,6 +74,11 @@
         "type": "boolean",
         "default": false,
         "description": "ignore the special role of headings during reading order detection"
+      },
+      "reading_order_machine_based": {
+        "type": "boolean",
+        "default": false,
+        "description": "use data-driven (rather than rule-based) reading order detection"
       }
     },
     "resources": [

processor.py

@@ -14,15 +14,17 @@ class EynollahProcessor(Processor):
         return 'ocrd-eynollah-segment'
 
     def setup(self) -> None:
-        if self.parameter['textline_light'] and not self.parameter['light_version']:
-            raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection, "
-                             "but parameter 'light_version' is not enabled")
+        assert self.parameter
+        if self.parameter['textline_light'] != self.parameter['light_version']:
+            raise ValueError("Error: You must set or unset both parameter 'textline_light' (to enable light textline detection), "
+                             "and parameter 'light_version' (faster+simpler method for main region detection and deskewing)")
         self.eynollah = Eynollah(
             self.resolve_resource(self.parameter['models']),
             logger=self.logger,
             allow_enhancement=self.parameter['allow_enhancement'],
             curved_line=self.parameter['curved_line'],
             right2left=self.parameter['right_to_left'],
+            reading_order_machine_based=self.parameter['reading_order_machine_based'],
             ignore_page_extraction=self.parameter['ignore_page_extraction'],
             light_version=self.parameter['light_version'],
             textline_light=self.parameter['textline_light'],
@@ -56,6 +58,8 @@ class EynollahProcessor(Processor):
         - If ``ignore_page_extraction``, then attempt no cropping of the page.
         - If ``curved_line``, then compute contour polygons for text lines
           instead of simple bounding boxes.
+        - If ``reading_order_machine_based``, then detect reading order via
+          data-driven model instead of geometrical heuristics.
 
         Produce a new output file by serialising the resulting hierarchy.
         """