diff --git a/Dockerfile b/Dockerfile index 4785fc1..4ba498b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,6 +36,8 @@ COPY . . COPY ocrd-tool.json . # prepackage ocrd-tool.json as ocrd-all-tool.json RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +# prepackage ocrd-all-module-dir.json +RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json # install everything and reduce image size RUN make install EXTRAS=OCR && rm -rf /build/eynollah # smoke test diff --git a/Makefile b/Makefile index 5f2bf34..73d4d34 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,9 @@ PIP ?= pip3 EXTRAS ?= # DOCKER_BASE_IMAGE = artefakt.dev.sbb.berlin:5000/sbb/ocrd_core:v2.68.0 -DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda-tf2:v3.3.0 -DOCKER_TAG = ocrd/eynollah +DOCKER_BASE_IMAGE ?= docker.io/ocrd/core-cuda-tf2:latest +DOCKER_TAG ?= ocrd/eynollah +DOCKER ?= docker #SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz #SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz @@ -117,7 +118,7 @@ coverage: # Build docker image docker: - docker build \ + $(DOCKER) build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index e972ec8..af5e03f 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -38,7 +38,7 @@ "textline_light": { "type": "boolean", "default": true, - "description": "Light version need textline light" + "description": "Light version needs textline light. If this parameter is set to true, this tool will try to return contours of textlines instead of rectangular bounding boxes of textlines, using a faster method." 
}, "tables": { "type": "boolean", @@ -65,11 +65,6 @@ "default": false, "description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not." }, - "textline_light": { - "type": "boolean", - "default": false, - "description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method." - }, "right_to_left": { "type": "boolean", "default": false, @@ -79,6 +74,11 @@ "type": "boolean", "default": false, "description": "ignore the special role of headings during reading order detection" + }, + "reading_order_machine_based": { + "type": "boolean", + "default": false, + "description": "use data-driven (rather than rule-based) reading order detection" } }, "resources": [ diff --git a/src/eynollah/processor.py b/src/eynollah/processor.py index 8f99489..c2922c1 100644 --- a/src/eynollah/processor.py +++ b/src/eynollah/processor.py @@ -14,15 +14,17 @@ class EynollahProcessor(Processor): return 'ocrd-eynollah-segment' def setup(self) -> None: - if self.parameter['textline_light'] and not self.parameter['light_version']: - raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection, " - "but parameter 'light_version' is not enabled") + assert self.parameter + if self.parameter['textline_light'] != self.parameter['light_version']: + raise ValueError("Error: You must set or unset both parameter 'textline_light' (to enable light textline detection), " + "and parameter 'light_version' (faster+simpler method for main region detection and deskewing)") self.eynollah = Eynollah( self.resolve_resource(self.parameter['models']), logger=self.logger, allow_enhancement=self.parameter['allow_enhancement'], curved_line=self.parameter['curved_line'], right2left=self.parameter['right_to_left'], + reading_order_machine_based=self.parameter['reading_order_machine_based'], 
ignore_page_extraction=self.parameter['ignore_page_extraction'], light_version=self.parameter['light_version'], textline_light=self.parameter['textline_light'], @@ -56,6 +58,8 @@ class EynollahProcessor(Processor): - If ``ignore_page_extraction``, then attempt no cropping of the page. - If ``curved_line``, then compute contour polygons for text lines instead of simple bounding boxes. + - If ``reading_order_machine_based``, then detect reading order via + data-driven model instead of geometrical heuristics. Produce a new output file by serialising the resulting hierarchy. """