mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-09-17 21:29:56 +02:00
new page extraction model integration
This commit is contained in:
commit
52cb0d9fac
4 changed files with 19 additions and 12 deletions
|
@ -36,6 +36,8 @@ COPY . .
|
||||||
COPY ocrd-tool.json .
|
COPY ocrd-tool.json .
|
||||||
# prepackage ocrd-tool.json as ocrd-all-tool.json
|
# prepackage ocrd-tool.json as ocrd-all-tool.json
|
||||||
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
|
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
|
||||||
|
# prepackage ocrd-all-module-dir.json
|
||||||
|
RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json
|
||||||
# install everything and reduce image size
|
# install everything and reduce image size
|
||||||
RUN make install EXTRAS=OCR && rm -rf /build/eynollah
|
RUN make install EXTRAS=OCR && rm -rf /build/eynollah
|
||||||
# smoke test
|
# smoke test
|
||||||
|
|
7
Makefile
7
Makefile
|
@ -3,8 +3,9 @@ PIP ?= pip3
|
||||||
EXTRAS ?=
|
EXTRAS ?=
|
||||||
|
|
||||||
# DOCKER_BASE_IMAGE = artefakt.dev.sbb.berlin:5000/sbb/ocrd_core:v2.68.0
|
# DOCKER_BASE_IMAGE = artefakt.dev.sbb.berlin:5000/sbb/ocrd_core:v2.68.0
|
||||||
DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda-tf2:v3.3.0
|
DOCKER_BASE_IMAGE ?= docker.io/ocrd/core-cuda-tf2:latest
|
||||||
DOCKER_TAG = ocrd/eynollah
|
DOCKER_TAG ?= ocrd/eynollah
|
||||||
|
DOCKER ?= docker
|
||||||
|
|
||||||
#SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz
|
#SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz
|
||||||
#SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz
|
#SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz
|
||||||
|
@ -117,7 +118,7 @@ coverage:
|
||||||
|
|
||||||
# Build docker image
|
# Build docker image
|
||||||
docker:
|
docker:
|
||||||
docker build \
|
$(DOCKER) build \
|
||||||
--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
|
--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
|
||||||
--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
|
--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
|
||||||
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
|
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
|
||||||
|
|
|
@ -38,7 +38,7 @@
|
||||||
"textline_light": {
|
"textline_light": {
|
||||||
"type": "boolean",
|
"type": "boolean",
|
||||||
"default": true,
|
"default": true,
|
||||||
"description": "Light version need textline light"
|
"description": "Light version need textline light. If this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method."
|
||||||
},
|
},
|
||||||
"tables": {
|
"tables": {
|
||||||
"type": "boolean",
|
"type": "boolean",
|
||||||
|
@ -65,11 +65,6 @@
|
||||||
"default": false,
|
"default": false,
|
||||||
"description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not."
|
"description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not."
|
||||||
},
|
},
|
||||||
"textline_light": {
|
|
||||||
"type": "boolean",
|
|
||||||
"default": false,
|
|
||||||
"description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method."
|
|
||||||
},
|
|
||||||
"right_to_left": {
|
"right_to_left": {
|
||||||
"type": "boolean",
|
"type": "boolean",
|
||||||
"default": false,
|
"default": false,
|
||||||
|
@ -79,6 +74,11 @@
|
||||||
"type": "boolean",
|
"type": "boolean",
|
||||||
"default": false,
|
"default": false,
|
||||||
"description": "ignore the special role of headings during reading order detection"
|
"description": "ignore the special role of headings during reading order detection"
|
||||||
|
},
|
||||||
|
"reading_order_machine_based": {
|
||||||
|
"type": "boolean",
|
||||||
|
"default": false,
|
||||||
|
"description": "use data-driven (rather than rule-based) reading order detection"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"resources": [
|
"resources": [
|
||||||
|
|
|
@ -14,15 +14,17 @@ class EynollahProcessor(Processor):
|
||||||
return 'ocrd-eynollah-segment'
|
return 'ocrd-eynollah-segment'
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
if self.parameter['textline_light'] and not self.parameter['light_version']:
|
assert self.parameter
|
||||||
raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection, "
|
if self.parameter['textline_light'] != self.parameter['light_version']:
|
||||||
"but parameter 'light_version' is not enabled")
|
raise ValueError("Error: You must set or unset both parameter 'textline_light' (to enable light textline detection), "
|
||||||
|
"and parameter 'light_version' (faster+simpler method for main region detection and deskewing)")
|
||||||
self.eynollah = Eynollah(
|
self.eynollah = Eynollah(
|
||||||
self.resolve_resource(self.parameter['models']),
|
self.resolve_resource(self.parameter['models']),
|
||||||
logger=self.logger,
|
logger=self.logger,
|
||||||
allow_enhancement=self.parameter['allow_enhancement'],
|
allow_enhancement=self.parameter['allow_enhancement'],
|
||||||
curved_line=self.parameter['curved_line'],
|
curved_line=self.parameter['curved_line'],
|
||||||
right2left=self.parameter['right_to_left'],
|
right2left=self.parameter['right_to_left'],
|
||||||
|
reading_order_machine_based=self.parameter['reading_order_machine_based'],
|
||||||
ignore_page_extraction=self.parameter['ignore_page_extraction'],
|
ignore_page_extraction=self.parameter['ignore_page_extraction'],
|
||||||
light_version=self.parameter['light_version'],
|
light_version=self.parameter['light_version'],
|
||||||
textline_light=self.parameter['textline_light'],
|
textline_light=self.parameter['textline_light'],
|
||||||
|
@ -56,6 +58,8 @@ class EynollahProcessor(Processor):
|
||||||
- If ``ignore_page_extraction``, then attempt no cropping of the page.
|
- If ``ignore_page_extraction``, then attempt no cropping of the page.
|
||||||
- If ``curved_line``, then compute contour polygons for text lines
|
- If ``curved_line``, then compute contour polygons for text lines
|
||||||
instead of simple bounding boxes.
|
instead of simple bounding boxes.
|
||||||
|
- If ``reading_order_machine_based``, then detect reading order via
|
||||||
|
a data-driven model instead of geometrical heuristics.
|
||||||
|
|
||||||
Produce a new output file by serialising the resulting hierarchy.
|
Produce a new output file by serialising the resulting hierarchy.
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue