From b8129c64252489a7ac8c0b359c2602082792fc60 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 11 Dec 2019 17:53:26 +0100 Subject: [PATCH 01/93] =?UTF-8?q?=F0=9F=A7=B9=20Do=20not=20advertise=20and?= =?UTF-8?q?=20support=20untested=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 11 ++--------- README.md | 7 ------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 0508505..75cc1c8 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,6 @@ help: @echo "" @echo " install Install ocrd_calamari" @echo " calamari Clone calamari repo" - @echo " calamari_models Clone calamari_models repo" @echo " gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)" @echo " calamari/build pip install calamari" @echo " deps-test Install testing python deps via pip" @@ -37,12 +36,6 @@ install: calamari: $(GIT_CLONE) https://github.com/chwick/calamari -# Clone calamari_models repo -calamari_models: - $(GIT_CLONE) -n https://github.com/chwick/calamari_models - # Checkout latest version that works with calamari-ocr==0.3.5: - cd calamari_models && git checkout f76b1d3ec - gt4histocr-calamari: mkdir gt4histocr-calamari cd gt4histocr-calamari && \ @@ -53,7 +46,7 @@ gt4histocr-calamari: # pip install calamari -calamari/build: calamari calamari_models +calamari/build: calamari cd calamari && $(PIP_INSTALL) . @@ -87,7 +80,7 @@ test: test/assets gt4histocr-calamari $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) # Run unit tests and determine test coverage -coverage: test/assets calamari_models +coverage: test/assets gt4histocr-calamari coverage erase make test PYTHON="coverage run" coverage report diff --git a/README.md b/README.md index 4d7dc96..aea4d41 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,6 @@ pip install . 
## Install models -Download standard models: - -``` -wget https://github.com/Calamari-OCR/calamari_models/archive/master.zip -unzip master.zip -``` - Download models trained on GT4HistOCR data: ``` From dc38f0ee511613a53519a2f021a46580fd6b07cf Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 19 Dec 2019 16:23:16 +0100 Subject: [PATCH 02/93] =?UTF-8?q?=F0=9F=8E=A8=20Use=20TOOL=20constant=20co?= =?UTF-8?q?nvention=20from=20the=20other=20OCR-D=20processors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 92aa5a4..d633d23 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -15,13 +15,14 @@ from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL +TOOL = 'ocrd-calamari-recognize' log = getLogger('processor.CalamariRecognize') class CalamariRecognize(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(CalamariRecognize, self).__init__(*args, **kwargs) From 95281f3d29cbc868461695b79654d1cb0f9b457c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 19 Dec 2019 16:24:34 +0100 Subject: [PATCH 03/93] =?UTF-8?q?=E2=9C=A8=20Add=20metadata=20about=20the?= =?UTF-8?q?=20recognition=20operation=20w/=20parameter=20info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index d633d23..c5a6b12 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -9,8 +9,12 @@ from 
calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.proto import VoterParams from ocrd import Processor from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import to_xml -from ocrd_models.ocrd_page_generateds import TextEquivType +from ocrd_models.ocrd_page import ( + LabelType, LabelsType, + MetadataItemType, + TextEquivType, + to_xml +) from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL @@ -88,6 +92,20 @@ class CalamariRecognize(Processor): _page_update_higher_textequiv_levels('line', pcgts) + + # Add metadata about this operation and its runtime parameters: + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, value=self.parameter[name]) + for name in self.parameter.keys()])])) + + file_id = self._make_file_id(input_file, n) self.workspace.add_file( ID=file_id, From 49b6dfe7353beff1f659a081201716bf183bfcbe Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 19 Dec 2019 16:30:11 +0100 Subject: [PATCH 04/93] =?UTF-8?q?=F0=9F=A7=B9=20Clean=20up=20trailing=20wh?= =?UTF-8?q?itespace?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index c5a6b12..fc21a12 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -85,7 +85,7 @@ class CalamariRecognize(Processor): if line.get_TextEquiv(): log.warning("Line '%s' already contained text results", line.id) line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) - + if line.get_Word(): log.warning("Line '%s' already contained word segmentation", line.id) 
line.set_Word([]) From 357a2a970abe642f9fee33c468622c30f5335e6a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Jan 2020 15:05:33 +0100 Subject: [PATCH 05/93] =?UTF-8?q?=E2=AC=86=20Update=20model=20download=20U?= =?UTF-8?q?RL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 75cc1c8..0e0c298 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ calamari: gt4histocr-calamari: mkdir gt4histocr-calamari cd gt4histocr-calamari && \ - wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz && \ + wget https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz && \ tar xfv model.tar.xz && \ rm model.tar.xz From d2ca24bf1eddf72ed8b71d7976bfa96c259df521 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Jan 2020 15:54:09 +0100 Subject: [PATCH 06/93] =?UTF-8?q?=E2=9C=85=20CircleCI:=20Try=20to=20fix=20?= =?UTF-8?q?encoding=20problem?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .circleci/config.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a97d20b..bd115c8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -12,7 +12,12 @@ jobs: - checkout - run: make install PIP_INSTALL="pip3 install" - run: pip3 install -r requirements-test.txt - - run: make coverage + - run: + command: | + if [[ $(python -c "import sys; print(sys.stdin.encoding)" |grep None) ]]; then + export PYTHONIOENCODING=utf-8 + fi + make coverage - codecov/upload workflows: From 7bdd15648f7f85558e5b4acaa960e00320339c0a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Jan 2020 15:59:30 +0100 Subject: [PATCH 07/93] =?UTF-8?q?=E2=9C=85=20CircleCI:=20Try=20to=20fix=20?= =?UTF-8?q?encoding=20problem?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bd115c8..aa16ae1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,7 +14,7 @@ jobs: - run: pip3 install -r requirements-test.txt - run: command: | - if [[ $(python -c "import sys; print(sys.stdin.encoding)" |grep None) ]]; then + if [[ $(python3 -c "import sys; print(sys.stdin.encoding)" |grep None) ]]; then export PYTHONIOENCODING=utf-8 fi make coverage From e8f60f9bf41664767eee71e1e483019cb8a82ae1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Jan 2020 16:08:11 +0100 Subject: [PATCH 08/93] =?UTF-8?q?=E2=9C=85=20CircleCI:=20Try=20to=20fix=20?= =?UTF-8?q?encoding=20problem?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .circleci/config.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index aa16ae1..8cdc067 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,9 +14,7 @@ jobs: - run: pip3 install -r requirements-test.txt - run: command: | - if [[ $(python3 -c "import sys; print(sys.stdin.encoding)" |grep None) ]]; then - export PYTHONIOENCODING=utf-8 - fi + export PYTHONIOENCODING=utf-8 make coverage - codecov/upload From 2797b0e806fe377df64781754fa183c5a73c26e6 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Jan 2020 16:15:25 +0100 Subject: [PATCH 09/93] =?UTF-8?q?=E2=9C=85=20CircleCI:=20Try=20to=20fix=20?= =?UTF-8?q?encoding=20problem?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .circleci/config.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8cdc067..bcf7e4b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,15 +7,14 @@ jobs: build-python36: docker: - image: ubuntu:18.04 + 
environment: + - PYTHONIOENCODING: utf-8 steps: - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick - checkout - run: make install PIP_INSTALL="pip3 install" - run: pip3 install -r requirements-test.txt - - run: - command: | - export PYTHONIOENCODING=utf-8 - make coverage + - run: make coverage - codecov/upload workflows: From 1c3626559948720e063d21c875b66ae5a530695f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Jan 2020 17:39:01 +0100 Subject: [PATCH 10/93] =?UTF-8?q?=E2=AC=86=20Update=20ocrd?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 17de3dc..52bffb3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ tensorflow-gpu == 1.14.0 calamari-ocr == 0.3.5 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? click -ocrd >= 1.0.0b11 +ocrd >= 2.2.1 From 60aa1583414412081e6f01b174dbbdcf5a9e4af5 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 15 Jan 2020 18:07:39 +0100 Subject: [PATCH 11/93] circle: set locale to a UTF-8 variant so python doesn't fall back to ascii --- .circleci/config.yml | 5 +++-- Makefile | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bcf7e4b..de8bbbe 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,11 +10,12 @@ jobs: environment: - PYTHONIOENCODING: utf-8 steps: - - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick + - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick locales + - run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8" - checkout - run: make install PIP_INSTALL="pip3 install" - run: pip3 install -r requirements-test.txt - - run: make coverage + - run: make 
coverage LC_ALL=en_US.utf8 - codecov/upload workflows: diff --git a/Makefile b/Makefile index 0e0c298..5a37869 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,4 @@ +export # export variables to subshells PIP_INSTALL = pip3 install GIT_CLONE = git clone PYTHON = python3 From c09fe169f2320844a1a35f854350eb484a31aa4a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 Jan 2020 22:17:13 +0000 Subject: [PATCH 12/93] Bump tensorflow-gpu from 1.14.0 to 1.15.2 Bumps [tensorflow-gpu](https://github.com/tensorflow/tensorflow) from 1.14.0 to 1.15.2. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v1.14.0...v1.15.2) Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 52bffb3..94a797e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy -tensorflow-gpu == 1.14.0 +tensorflow-gpu == 1.15.2 calamari-ocr == 0.3.5 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? click From 739f43e9da5eea36a237d5bb0c362d9661de5085 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 30 Jan 2020 19:04:20 +0100 Subject: [PATCH 13/93] =?UTF-8?q?=F0=9F=90=9B=20Use=20the=20documented=20p?= =?UTF-8?q?ackage=20name=20for=20TensorFlow=201.15.x?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 94a797e..fc2ed16 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy -tensorflow-gpu == 1.15.2 +tensorflow == 1.15.* calamari-ocr == 0.3.5 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? 
click From 7d02c8dff058d2eafe69209592e0a90a4bfe6ab3 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 30 Jan 2020 19:06:42 +0100 Subject: [PATCH 14/93] =?UTF-8?q?=F0=9F=93=9D=20README-DEV:=20Document=20i?= =?UTF-8?q?nstalling=20test=20requirements?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-DEV.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README-DEV.md b/README-DEV.md index da2025a..366f436 100644 --- a/README-DEV.md +++ b/README-DEV.md @@ -4,6 +4,7 @@ In a Python 3 virtualenv: ~~~ pip install -e . +pip install -r requirements-test.txt make test ~~~ From 5dfd809fbc980ff73f31f9434dfa2eb88d15138c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 31 Jan 2020 10:46:50 +0100 Subject: [PATCH 15/93] =?UTF-8?q?=F0=9F=90=9B=20CircleCI:=20Try=20upgradin?= =?UTF-8?q?g=20pip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index de8bbbe..11a4289 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -13,6 +13,7 @@ jobs: - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget imagemagick locales - run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8" - checkout + - run: pip3 install --upgrade pip - run: make install PIP_INSTALL="pip3 install" - run: pip3 install -r requirements-test.txt - run: make coverage LC_ALL=en_US.utf8 From c416e0c2534e53185e6ae58290750bba0ec7f0e1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 31 Jan 2020 10:56:16 +0100 Subject: [PATCH 16/93] =?UTF-8?q?Revert=20"=F0=9F=90=9B=20Use=20the=20docu?= =?UTF-8?q?mented=20package=20name=20for=20TensorFlow=201.15.x"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 739f43e9da5eea36a237d5bb0c362d9661de5085. 
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fc2ed16..94a797e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy -tensorflow == 1.15.* +tensorflow-gpu == 1.15.2 calamari-ocr == 0.3.5 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? click From 17dbeb248082f0b7c6207273ee91f8f62a05a6bc Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 31 Jan 2020 10:57:30 +0100 Subject: [PATCH 17/93] =?UTF-8?q?=F0=9F=94=A7=20Loosen=20tensorflow-gpu=20?= =?UTF-8?q?dependency=20a=20bit=20to=201.15.*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 94a797e..0a426e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy -tensorflow-gpu == 1.15.2 +tensorflow-gpu == 1.15.* calamari-ocr == 0.3.5 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? 
click From 24532f693acd9bc5028640c444cb2b2b33ab9470 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 31 Jan 2020 17:45:00 +0100 Subject: [PATCH 18/93] =?UTF-8?q?=F0=9F=9A=A7=20Use=20character=20position?= =?UTF-8?q?s=20as=20word=20segmentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index fc21a12..3dfff38 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -13,9 +13,10 @@ from ocrd_models.ocrd_page import ( LabelType, LabelsType, MetadataItemType, TextEquivType, + WordType, CoordsType, to_xml ) -from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE +from ocrd_utils import getLogger, concat_padded, coordinates_for_segment, points_from_polygon, MIMETYPE_PAGE from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL @@ -69,7 +70,7 @@ class CalamariRecognize(Processor): for (line_no, line) in enumerate(textlines): log.debug("Recognizing line '%s' in region '%s'", line_no, region.id) - line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh) + line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) line_image_np = np.array(line_image, dtype=np.uint8) raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0] @@ -82,14 +83,41 @@ class CalamariRecognize(Processor): line_text = prediction.sentence line_conf = prediction.avg_char_probability + # Delete existing results if line.get_TextEquiv(): log.warning("Line '%s' already contained text results", line.id) - line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) - + line.set_TextEquiv([]) if line.get_Word(): log.warning("Line '%s' already contained word segmentation", line.id) line.set_Word([]) + # Save line results + 
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) + + # Save word results + # XXX For early development just put every char = glyph into its own word + for word_no, p in enumerate(prediction.positions): + start = p.global_start + end = p.global_end + + + # XXX Maybe use version in ocrd_tesserocr + h = line_image.height + polygon = [(start, 0), (end, 0), (end, h), (start, h)] + points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) + + word = WordType( + id='%s_word%04d' % (line.id, word_no), + Coords=CoordsType(points)) + + chars = sorted(p.chars, key=lambda k: k.probability, reverse=True) + for index, char in enumerate(chars): + if char.char: + word.add_TextEquiv(TextEquivType(Unicode=char.char, index=index, conf=char.probability)) + # XXX Note that omission probabilities are not normalized?! + + line.add_Word(word) + _page_update_higher_textequiv_levels('line', pcgts) From 507bc1ce5ebd91bb651d933148c18b8e184aae69 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 12:22:01 +0100 Subject: [PATCH 19/93] =?UTF-8?q?=E2=9C=A8=20Include=20proper=20word=20+?= =?UTF-8?q?=20glyph=20segmentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 62 +++++++++++++++++++++++++++----------- requirements.txt | 1 + 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 3dfff38..c3dd474 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -4,6 +4,7 @@ import os from glob import glob import numpy as np +import uniseg.wordbreak from calamari_ocr.ocr import MultiPredictor from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.proto import VoterParams @@ -13,7 +14,7 @@ from ocrd_models.ocrd_page import ( LabelType, LabelsType, MetadataItemType, TextEquivType, - WordType, CoordsType, + WordType, GlyphType, CoordsType, to_xml ) from 
ocrd_utils import getLogger, concat_padded, coordinates_for_segment, points_from_polygon, MIMETYPE_PAGE @@ -95,28 +96,53 @@ class CalamariRecognize(Processor): line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) # Save word results - # XXX For early development just put every char = glyph into its own word - for word_no, p in enumerate(prediction.positions): - start = p.global_start - end = p.global_end + def unwanted(c): + return c == " " + word_no = 0 + i = 0 + for word_text in uniseg.wordbreak.words(prediction.sentence): + print(word_text) + word_length = len(word_text) + do_not_include = all(unwanted(c) for c in word_text) - # XXX Maybe use version in ocrd_tesserocr - h = line_image.height - polygon = [(start, 0), (end, 0), (end, h), (start, h)] - points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) + if not do_not_include: + word_positions = prediction.positions[i:i+word_length] + word_start = word_positions[0].global_start + word_end = word_positions[-1].global_end - word = WordType( - id='%s_word%04d' % (line.id, word_no), - Coords=CoordsType(points)) + # XXX Maybe use version in ocrd_tesserocr + h = line_image.height + polygon = [(word_start, 0), (word_end, 0), (word_end, h), (word_start, h)] + points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) - chars = sorted(p.chars, key=lambda k: k.probability, reverse=True) - for index, char in enumerate(chars): - if char.char: - word.add_TextEquiv(TextEquivType(Unicode=char.char, index=index, conf=char.probability)) - # XXX Note that omission probabilities are not normalized?! 
+ word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points)) + word.add_TextEquiv(TextEquivType(Unicode=word_text)) + + for glyph_no, p in enumerate(word_positions): + glyph_start = p.global_start + glyph_end = p.global_end + + # XXX Maybe use version in ocrd_tesserocr + h = line_image.height + polygon = [(glyph_start, 0), (glyph_end, 0), (glyph_end, h), (glyph_start, h)] + points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) + + glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points)) + + chars = sorted(p.chars, key=lambda k: k.probability, reverse=True) + for index, char in enumerate(chars): + if char.char: + glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=index, conf=char.probability)) + # XXX Note that omission probabilities are not normalized?! + word.add_Glyph(glyph) + + line.add_Word(word) + + + i += word_length + word_no += 1 - line.add_Word(word) _page_update_higher_textequiv_levels('line', pcgts) diff --git a/requirements.txt b/requirements.txt index 0a426e0..1b6d3a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ calamari-ocr == 0.3.5 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? 
click ocrd >= 2.2.1 +uniseg From 2ccfc7b195cc6f599d6ec86db5ac275b4cda0533 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 12:22:22 +0100 Subject: [PATCH 20/93] =?UTF-8?q?=F0=9F=8E=A8=20Set=20vim=20textwidth?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index c3dd474..ca68ebe 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -203,3 +203,5 @@ def _page_update_higher_textequiv_levels(level, pcgts): else u'' for line in lines) region.set_TextEquiv( [TextEquivType(Unicode=region_unicode)]) # remove old + +# vim:tw=120: From decaa7b69f2bbee9c6947d50f55c5906319798ff Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 14:03:01 +0100 Subject: [PATCH 21/93] =?UTF-8?q?=F0=9F=8E=A8=20Use=20polygon=5Ffrom=5Fx0y?= =?UTF-8?q?0x1y1=20to=20build=20word/glyph=20polygon?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index ca68ebe..c5dba59 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -17,7 +17,11 @@ from ocrd_models.ocrd_page import ( WordType, GlyphType, CoordsType, to_xml ) -from ocrd_utils import getLogger, concat_padded, coordinates_for_segment, points_from_polygon, MIMETYPE_PAGE +from ocrd_utils import ( + getLogger, concat_padded, + coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1, + MIMETYPE_PAGE +) from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL @@ -111,9 +115,7 @@ class CalamariRecognize(Processor): word_start = word_positions[0].global_start word_end = word_positions[-1].global_end - # XXX Maybe use version in ocrd_tesserocr - h = line_image.height - polygon 
= [(word_start, 0), (word_end, 0), (word_end, h), (word_start, h)] + polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height]) points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points)) @@ -123,9 +125,7 @@ class CalamariRecognize(Processor): glyph_start = p.global_start glyph_end = p.global_end - # XXX Maybe use version in ocrd_tesserocr - h = line_image.height - polygon = [(glyph_start, 0), (glyph_end, 0), (glyph_end, h), (glyph_start, h)] + polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height]) points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points)) From f75426060e79808c3683ddc3e25fdad952a953ca Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 14:03:28 +0100 Subject: [PATCH 22/93] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20debugging=20print?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index c5dba59..d64933b 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -106,7 +106,6 @@ class CalamariRecognize(Processor): word_no = 0 i = 0 for word_text in uniseg.wordbreak.words(prediction.sentence): - print(word_text) word_length = len(word_text) do_not_include = all(unwanted(c) for c in word_text) From 26501899109a55e8d009b6479d1493d1a4e1db6b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 14:03:54 +0100 Subject: [PATCH 23/93] =?UTF-8?q?=F0=9F=A7=B9=20Add=20whitespace?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_calamari/recognize.py 
b/ocrd_calamari/recognize.py index d64933b..772c680 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -134,6 +134,7 @@ class CalamariRecognize(Processor): if char.char: glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=index, conf=char.probability)) # XXX Note that omission probabilities are not normalized?! + word.add_Glyph(glyph) line.add_Word(word) From 0a572df0ba5ef61e1c162d10a30c9ce3a65f47d5 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 15:31:36 +0100 Subject: [PATCH 24/93] =?UTF-8?q?=F0=9F=93=9D=20README:=20Add=20informatio?= =?UTF-8?q?n=20about=20the=20new=20glyph=20and=20word=20segmentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index aea4d41..18041e8 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,14 @@ This offers a OCR-D compliant workspace processor for some of the functionality This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized image) as its input. +In addition to the line text it also outputs glyph segmentation including +per-glyph confidence values and per-glyph alternative predictions as provided +by the Calamari OCR engine. Note that while Calamari does not provide word +segmentation, this processor produces word segmentation inferred from Unicode +text segmentation and the glyph positions. The provided glyph and word +segmentation can be used for text extraction and highlighting, but is probably +not useful for further image-based processing. 
+ ## Installation ### From PyPI From 91cca1e1b8672018e88a6adae2190f3d780b3e57 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 15:33:11 +0100 Subject: [PATCH 25/93] =?UTF-8?q?=F0=9F=93=9D=20Document=20why=20we=20are?= =?UTF-8?q?=20using=20Unicode=20text=20segmentation=20to=20produce=20word?= =?UTF-8?q?=20results?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 772c680..0624d9b 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -100,6 +100,11 @@ class CalamariRecognize(Processor): line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) # Save word results + # + # Calamari OCR does not provide word positions, so we infer word positions from a. Unicode text + # segmentation and b. the glyph positions. This is necessary because the PAGE XML format enforces + # a strict hierarchy of lines > words > glyphs. + def unwanted(c): return c == " " From 3149e1d9e062ade7d39da916459e9c01e7a965c0 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 15:33:38 +0100 Subject: [PATCH 26/93] =?UTF-8?q?=F0=9F=93=9D=20unwanted()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 0624d9b..222bff3 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -106,6 +106,11 @@ class CalamariRecognize(Processor): # a strict hierarchy of lines > words > glyphs. def unwanted(c): + """ + Define unwanted characters + + Words only containing these e.g. whitespace characters are not considered as words. 
+ """ return c == " " word_no = 0 From 909632493b74463b5b336129d34ae9a285143980 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 17:37:19 +0100 Subject: [PATCH 27/93] =?UTF-8?q?=F0=9F=9A=A7=20Add=20future=20TODOs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 222bff3..bde4218 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -116,6 +116,7 @@ class CalamariRecognize(Processor): word_no = 0 i = 0 for word_text in uniseg.wordbreak.words(prediction.sentence): + # XXX Re-use word segmentation from dinglehopper, i.e. support private use characters word_length = len(word_text) do_not_include = all(unwanted(c) for c in word_text) @@ -126,6 +127,7 @@ class CalamariRecognize(Processor): polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height]) points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) + # XXX Crop to line polygon? 
word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points)) word.add_TextEquiv(TextEquivType(Unicode=word_text)) From 0f9c94e7dc4f4577ec1465a1cb0613d310941728 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 17:40:45 +0100 Subject: [PATCH 28/93] =?UTF-8?q?=F0=9F=90=9B=20Start=20with=20TextEquiv?= =?UTF-8?q?=20index=3D1=20to=20adhere=20to=20OCR-D=20PAGE=20conventions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://ocr-d.github.io/page#multiple-textequivs --- ocrd_calamari/recognize.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index bde4218..4c27e5e 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -142,10 +142,12 @@ class CalamariRecognize(Processor): glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points)) chars = sorted(p.chars, key=lambda k: k.probability, reverse=True) - for index, char in enumerate(chars): + char_index = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs + for char in chars: if char.char: - glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=index, conf=char.probability)) - # XXX Note that omission probabilities are not normalized?! + glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability)) + char_index += 1 + # XXX Note that omission probabilities are not normalized?! 
word.add_Glyph(glyph) From 6f4736f8e4ef6f077b7f24ec8052ebb0ff10b982 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 19:10:16 +0100 Subject: [PATCH 29/93] =?UTF-8?q?=E2=9C=A8=20Do=20word=20segmentation=20as?= =?UTF-8?q?=20expected=20by=20OCR-D=20PAGE=20specs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 12 +++++------ ocrd_calamari/recognize.py | 42 +++++++++++++++++++++----------------- requirements.txt | 1 - 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 18041e8..303efe3 100644 --- a/README.md +++ b/README.md @@ -14,12 +14,12 @@ This processor only operates on the text line level and so needs a line segmenta image) as its input. In addition to the line text it also outputs glyph segmentation including -per-glyph confidence values and per-glyph alternative predictions as provided -by the Calamari OCR engine. Note that while Calamari does not provide word -segmentation, this processor produces word segmentation inferred from Unicode -text segmentation and the glyph positions. The provided glyph and word -segmentation can be used for text extraction and highlighting, but is probably -not useful for further image-based processing. +per-glyph confidence values and per-glyph alternative predictions as provided by +the Calamari OCR engine. Note that while Calamari does not provide word +segmentation, this processor produces word segmentation inferred from text +segmentation and the glyph positions. The provided glyph and word segmentation +can be used for text extraction and highlighting, but is probably not useful for +further image-based processing. 
## Installation diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 4c27e5e..cd2d84e 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -4,7 +4,6 @@ import os from glob import glob import numpy as np -import uniseg.wordbreak from calamari_ocr.ocr import MultiPredictor from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.proto import VoterParams @@ -101,26 +100,32 @@ class CalamariRecognize(Processor): # Save word results # - # Calamari OCR does not provide word positions, so we infer word positions from a. Unicode text - # segmentation and b. the glyph positions. This is necessary because the PAGE XML format enforces - # a strict hierarchy of lines > words > glyphs. - - def unwanted(c): - """ - Define unwanted characters - - Words only containing these e.g. whitespace characters are not considered as words. - """ - return c == " " + # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation + # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict + # hierarchy of lines > words > glyphs. + + def _words(s): + """Split words based on spaces and include spaces as 'words'""" + spaces = None + word = '' + for c in s: + if c == ' ' and spaces is True: + word += c + elif c != ' ' and spaces is False: + word += c + else: + if word: + yield word + word = c + spaces = (c == ' ') + yield word word_no = 0 i = 0 - for word_text in uniseg.wordbreak.words(prediction.sentence): - # XXX Re-use word segmentation from dinglehopper, i.e. 
support private use characters - word_length = len(word_text) - do_not_include = all(unwanted(c) for c in word_text) - if not do_not_include: + for word_text in _words(prediction.sentence): + word_length = len(word_text) + if not all(c == ' ' for c in word_text): word_positions = prediction.positions[i:i+word_length] word_start = word_positions[0].global_start word_end = word_positions[-1].global_end @@ -152,10 +157,9 @@ class CalamariRecognize(Processor): word.add_Glyph(glyph) line.add_Word(word) - + word_no += 1 i += word_length - word_no += 1 _page_update_higher_textequiv_levels('line', pcgts) diff --git a/requirements.txt b/requirements.txt index 1b6d3a6..0a426e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,3 @@ calamari-ocr == 0.3.5 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? click ocrd >= 2.2.1 -uniseg From 9010250911323194a83aaa0e9b372f3c97c0f50a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 4 Feb 2020 13:54:45 +0100 Subject: [PATCH 30/93] =?UTF-8?q?=E2=99=BB=20test:=20Move=20binarization?= =?UTF-8?q?=20into=20the=20workspace=20fixture?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_recognize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_recognize.py b/test/test_recognize.py index 0fca48f..b6b6980 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -30,10 +30,6 @@ def workspace(): "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f, os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)) - return workspace - - -def test_recognize(workspace): # The binarization options I have are: # # a. 
ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf) @@ -46,6 +42,10 @@ def test_recognize(workspace): ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f) subprocess.call(['convert', ff, '-threshold', '50%', ff]) + return workspace + + +def test_recognize(workspace): # XXX Should remove GT text to really test this CalamariRecognize( From 82fe0333f10a9e8796d92baee43e76f06fa08e91 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 4 Feb 2020 18:40:06 +0100 Subject: [PATCH 31/93] =?UTF-8?q?=E2=9C=85=20Test=20word=20segmentation=20?= =?UTF-8?q?(Fixes=20#30)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_recognize.py | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/test/test_recognize.py b/test/test_recognize.py index b6b6980..11b34ad 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -2,6 +2,7 @@ import os import shutil import subprocess import urllib.request +from lxml import etree import pytest from ocrd.resolver import Resolver @@ -11,6 +12,7 @@ from .base import assets METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml') WORKSPACE_DIR = '/tmp/test-ocrd-calamari' +CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json') @pytest.fixture @@ -53,12 +55,44 @@ def test_recognize(workspace): input_file_grp="OCR-D-GT-SEG-LINE", output_file_grp="OCR-D-OCR-CALAMARI", parameter={ - 'checkpoint': os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json') + "checkpoint": CHECKPOINT, } ).process() workspace.save_mets() - page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml') + page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml") assert os.path.exists(page1) - with open(page1, 'r', encoding='utf-8') as f: - assert 'verſchuldeten' in f.read() + with open(page1, "r", 
encoding="utf-8") as f: + assert "verſchuldeten" in f.read() + + +def test_word_segmentation(workspace): + CalamariRecognize( + workspace, + input_file_grp="OCR-D-GT-SEG-LINE", + output_file_grp="OCR-D-OCR-CALAMARI", + parameter={ + "checkpoint": CHECKPOINT, + } + ).process() + workspace.save_mets() + + page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml") + assert os.path.exists(page1) + tree = etree.parse(page1) + + NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } + + # The result should contain a TextLine that contains the text "December" + line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0] + assert line + + # The textline should a. contain multiple words and b. these should concatenate fine to produce the same line text + words = line.xpath(".//pc:Word", namespaces=NSMAP) + assert len(words) >= 2 + words_text = " ".join(word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text for word in words) + line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text + assert words_text == line_text + + +# vim:tw=120: From 0f0bae18ba4c25ed6c609097519bce1bf4d2e941 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 4 Feb 2020 19:29:56 +0100 Subject: [PATCH 32/93] =?UTF-8?q?=E2=9C=85=20Remove=20GT=20text=20to=20not?= =?UTF-8?q?=20accidently=20check=20it=20instead=20of=20OCR=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_recognize.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/test/test_recognize.py b/test/test_recognize.py index 11b34ad..e576ac5 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -3,6 +3,7 @@ import shutil import subprocess import urllib.request from lxml import etree +from glob import glob import pytest from ocrd.resolver import Resolver @@ -10,10 +11,15 @@ from ocrd.resolver 
import Resolver from ocrd_calamari import CalamariRecognize from .base import assets + METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml') WORKSPACE_DIR = '/tmp/test-ocrd-calamari' CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json') +# Because XML namespace versions are so much fun, we not only use one, we use TWO! +NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } +NSMAP_GT = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" } + @pytest.fixture def workspace(): @@ -44,12 +50,20 @@ def workspace(): ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f) subprocess.call(['convert', ff, '-threshold', '50%', ff]) + # Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text + for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"): + workspace.download_file(of) + for to_remove in ["//pc:Word", "//pc:TextEquiv"]: + for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")): + tree = etree.parse(ff) + for e in tree.xpath(to_remove, namespaces=NSMAP_GT): + e.getparent().remove(e) + tree.write(ff, xml_declaration=True, encoding="utf-8") + return workspace def test_recognize(workspace): - # XXX Should remove GT text to really test this - CalamariRecognize( workspace, input_file_grp="OCR-D-GT-SEG-LINE", @@ -81,8 +95,6 @@ def test_word_segmentation(workspace): assert os.path.exists(page1) tree = etree.parse(page1) - NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } - # The result should contain a TextLine that contains the text "December" line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0] assert line From ef3fb44fb528e9e52b78fe1e787e38142da7a7d4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 5 Feb 2020 13:02:10 +0100 Subject: [PATCH 33/93] =?UTF-8?q?=E2=9C=A8=20Allow=20controlling=20of=20ou?= 
=?UTF-8?q?tput=20hierarchy=20level,=20e.g.=20only=20line,=20not=20words+g?= =?UTF-8?q?lyphs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 12 ++++--- ocrd_calamari/ocrd-tool.json | 6 ++++ ocrd_calamari/recognize.py | 66 +++++++++++++++++++----------------- test/test_recognize.py | 26 ++++++++++++++ 4 files changed, 73 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 303efe3..2126724 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,11 @@ This offers a OCR-D compliant workspace processor for some of the functionality This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized image) as its input. -In addition to the line text it also outputs glyph segmentation including -per-glyph confidence values and per-glyph alternative predictions as provided by -the Calamari OCR engine. Note that while Calamari does not provide word -segmentation, this processor produces word segmentation inferred from text +In addition to the line text it may also output word and glyph segmentation +including per-glyph confidence values and per-glyph alternative predictions as +provided by the Calamari OCR engine, using a `textequiv_level` of `word` or +`glyph`. Note that while Calamari does not provide word segmentation, this +processor produces word segmentation inferred from text segmentation and the glyph positions. The provided glyph and word segmentation can be used for text extraction and highlighting, but is probably not useful for further image-based processing. 
@@ -53,7 +54,8 @@ ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O With `test-parameters.json`: ~~~ { - "checkpoint": "/path/to/some/trained/models/*.ckpt.json" + "checkpoint": "/path/to/some/trained/models/*.ckpt.json", + "textequiv_level": "line", } ~~~ diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 54d2206..1174243 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -25,6 +25,12 @@ "voter": { "description": "The voting algorithm to use", "type": "string", "default": "confidence_voter_default_ctc" + }, + "textequiv_level": { + "type": "string", + "enum": ["line", "word", "glyph"], + "default": "line", + "description": "Deepest PAGE XML hierarchy level to include TextEquiv results for" } } } diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index cd2d84e..7b2db0d 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -120,46 +120,48 @@ class CalamariRecognize(Processor): spaces = (c == ' ') yield word - word_no = 0 - i = 0 - - for word_text in _words(prediction.sentence): - word_length = len(word_text) - if not all(c == ' ' for c in word_text): - word_positions = prediction.positions[i:i+word_length] - word_start = word_positions[0].global_start - word_end = word_positions[-1].global_end - - polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height]) - points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) - # XXX Crop to line polygon? 
+ if self.parameter['textequiv_level'] in ['word', 'glyph']: + word_no = 0 + i = 0 + + for word_text in _words(prediction.sentence): + word_length = len(word_text) + if not all(c == ' ' for c in word_text): + word_positions = prediction.positions[i:i+word_length] + word_start = word_positions[0].global_start + word_end = word_positions[-1].global_end + + polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height]) + points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) + # XXX Crop to line polygon? - word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points)) - word.add_TextEquiv(TextEquivType(Unicode=word_text)) + word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points)) + word.add_TextEquiv(TextEquivType(Unicode=word_text)) - for glyph_no, p in enumerate(word_positions): - glyph_start = p.global_start - glyph_end = p.global_end + if self.parameter['textequiv_level'] == 'glyph': + for glyph_no, p in enumerate(word_positions): + glyph_start = p.global_start + glyph_end = p.global_end - polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height]) - points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) + polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height]) + points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) - glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points)) + glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points)) - chars = sorted(p.chars, key=lambda k: k.probability, reverse=True) - char_index = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs - for char in chars: - if char.char: - glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability)) - char_index += 1 - # XXX Note that omission probabilities are not normalized?! 
+ chars = sorted(p.chars, key=lambda k: k.probability, reverse=True) + char_index = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs + for char in chars: + if char.char: + glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability)) + char_index += 1 + # XXX Note that omission probabilities are not normalized?! - word.add_Glyph(glyph) + word.add_Glyph(glyph) - line.add_Word(word) - word_no += 1 + line.add_Word(word) + word_no += 1 - i += word_length + i += word_length _page_update_higher_textequiv_levels('line', pcgts) diff --git a/test/test_recognize.py b/test/test_recognize.py index e576ac5..5db48cf 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -87,6 +87,7 @@ def test_word_segmentation(workspace): output_file_grp="OCR-D-OCR-CALAMARI", parameter={ "checkpoint": CHECKPOINT, + "textequiv_level": "word", # Note that we're going down to word level here } ).process() workspace.save_mets() @@ -106,5 +107,30 @@ def test_word_segmentation(workspace): line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text assert words_text == line_text + # For extra measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word" + glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP) + assert len(glyphs) == 0 + + +def test_glyphs(workspace): + CalamariRecognize( + workspace, + input_file_grp="OCR-D-GT-SEG-LINE", + output_file_grp="OCR-D-OCR-CALAMARI", + parameter={ + "checkpoint": CHECKPOINT, + "textequiv_level": "glyph", # Note that we're going down to glyph level here + } + ).process() + workspace.save_mets() + + page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml") + assert os.path.exists(page1) + tree = etree.parse(page1) + + # The result should contain a lot of glyphs + glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP) + assert len(glyphs) >= 100 + # vim:tw=120: From e39a2bce01666a896d665c80fbacea534b27edc5 Mon Sep 17 
00:00:00 2001 From: "Gerber, Mike" Date: Wed, 5 Feb 2020 13:07:56 +0100 Subject: [PATCH 34/93] =?UTF-8?q?=F0=9F=93=9D=20Fix=20example=20parameters?= =?UTF-8?q?=20JSON?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2126724..359f1a3 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ With `test-parameters.json`: ~~~ { "checkpoint": "/path/to/some/trained/models/*.ckpt.json", - "textequiv_level": "line", + "textequiv_level": "line" } ~~~ From b802b4deafb767a52cadc1c491a4bb78d766cbef Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 5 Feb 2020 13:07:56 +0100 Subject: [PATCH 35/93] =?UTF-8?q?=E2=9C=A8=20Allow=20configuring=20a=20cut?= =?UTF-8?q?=20off=20confidence=20value=20for=20glyph=20alternatives?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 3 +++ ocrd_calamari/ocrd-tool.json | 6 ++++++ ocrd_calamari/recognize.py | 14 +++++++++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 359f1a3..8ebbf4f 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,9 @@ With `test-parameters.json`: } ~~~ +You may want to have a look at the [ocrd-tool.json](ocrd-tool.json) descriptions +for additional parameters and default values. + ## Development & Testing For information regarding development and testing, please see [README-DEV.md](README-DEV.md). 
diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 1174243..b954d41 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -31,6 +31,12 @@ "enum": ["line", "word", "glyph"], "default": "line", "description": "Deepest PAGE XML hierarchy level to include TextEquiv results for" + }, + "glyph_conf_cutoff": { + "type": "number", + "format": "float", + "default": 0.001, + "description": "Only include glyph alternatives with confidences above this threshold" } } } diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 7b2db0d..c588a62 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -148,13 +148,17 @@ class CalamariRecognize(Processor): glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points)) - chars = sorted(p.chars, key=lambda k: k.probability, reverse=True) + # Filter predictions + chars = p.chars + chars = [c for c in chars if c.char] # XXX Note that omission probabilities are not normalized?! + chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']] + + # Sort and add predictions (= TextEquivs) + chars = sorted(chars, key=lambda k: k.probability, reverse=True) char_index = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs for char in chars: - if char.char: - glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability)) - char_index += 1 - # XXX Note that omission probabilities are not normalized?! 
+ glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability)) + char_index += 1 word.add_Glyph(glyph) From 0c7cd69526c126c87047148e2d1657f3f30bfa3b Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 5 Feb 2020 13:33:02 +0100 Subject: [PATCH 36/93] =?UTF-8?q?=F0=9F=93=9D=20README:=20Update=20intro?= =?UTF-8?q?=20that=20we're=20mostly=20on=20par=20with=20Calamari's=20funct?= =?UTF-8?q?ionality?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8ebbf4f..6ccd69e 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ## Introduction -This offers a OCR-D compliant workspace processor for some of the functionality of Calamari OCR. +This offers a OCR-D compliant workspace processor for the functionality of Calamari OCR. This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized image) as its input. From 46fe34400fc03f4bf993d1f0d194eef5f693b728 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 5 Feb 2020 13:33:52 +0100 Subject: [PATCH 37/93] =?UTF-8?q?=F0=9F=93=9D=20README:=20Link=20to=20the?= =?UTF-8?q?=20correct=20ocrd-tool.json?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6ccd69e..b41357d 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ With `test-parameters.json`: } ~~~ -You may want to have a look at the [ocrd-tool.json](ocrd-tool.json) descriptions +You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions for additional parameters and default values. 
## Development & Testing From 3e426b2a0a5366fac20e9b857af1c657e2773b98 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 5 Feb 2020 16:18:23 +0100 Subject: [PATCH 38/93] =?UTF-8?q?=F0=9F=93=9D=20README:=20Use=20gt4histocr?= =?UTF-8?q?-calamari=20from=20the=20Makefile=20in=20the=20example?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See #33. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8ebbf4f..f766c14 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O With `test-parameters.json`: ~~~ { - "checkpoint": "/path/to/some/trained/models/*.ckpt.json", + "checkpoint": "/path/to/for/example/gt4histocr-calamari/*.ckpt.json", "textequiv_level": "line" } ~~~ From 3416a155ece8ae45c0ccabc665db8680a86602ea Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 5 Feb 2020 17:39:37 +0100 Subject: [PATCH 39/93] =?UTF-8?q?=F0=9F=93=9D=20README:=20Provide=20a=20co?= =?UTF-8?q?mplete=20example=20using=20real=20data=20and=20other=20processo?= =?UTF-8?q?rs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See #33. 
--- .gitignore | 1 + Makefile | 4 ++++ README.md | 27 ++++++++++++++++----------- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 42c4957..4061f82 100644 --- a/.gitignore +++ b/.gitignore @@ -107,5 +107,6 @@ venv.bak/ /calamari /calamari_models /gt4histocr-calamari +/actevedef_718448162* /repo /test/assets diff --git a/Makefile b/Makefile index 5a37869..c3e85ab 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,10 @@ gt4histocr-calamari: tar xfv model.tar.xz && \ rm model.tar.xz +# Example data +actevedef_718448162: + wget https://qurator-data.de/examples/actevedef_718448162.zip && \ + unzip actevedef_718448162.zip # pip install calamari diff --git a/README.md b/README.md index 6f80434..2f6947b 100644 --- a/README.md +++ b/README.md @@ -46,18 +46,23 @@ ls gt4histocr-calamari ``` ## Example Usage +Before using `ocrd-calamari-recognize` get some example data and model, and +prepare the document for OCR: +``` +# Download model and example data +make gt4histocr-calamari +make actevedef_718448162 + +# Create binarized images and line segmentation using other OCR-D projects +ocrd-olena-binarize -p '{ "impl": "sauvola-ms-split" }' -I OCR-D-IMG -O OCR-D-IMG-BINPAGE,OCR-D-IMG-BIN +ocrd-tesserocr-segment-region -I OCR-D-IMG-BINPAGE -O OCR-D-SEG-REGION +ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE +``` -~~~ -ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI -~~~ - -With `test-parameters.json`: -~~~ -{ - "checkpoint": "/path/to/for/example/gt4histocr-calamari/*.ckpt.json", - "textequiv_level": "line" -} -~~~ +Finally recognize the text using ocrd_calamari and the downloaded model: +``` +ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI +``` You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions for additional parameters and default values. 
From 73beab177049ec0054fa1a1d54292e69e18ec393 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 5 Feb 2020 17:49:31 +0100 Subject: [PATCH 40/93] =?UTF-8?q?=F0=9F=93=9D=20README:=20Add=20a=20missin?= =?UTF-8?q?g=20`cd`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2f6947b..e158b8d 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ make gt4histocr-calamari make actevedef_718448162 # Create binarized images and line segmentation using other OCR-D projects +cd actevedef_718448162 ocrd-olena-binarize -p '{ "impl": "sauvola-ms-split" }' -I OCR-D-IMG -O OCR-D-IMG-BINPAGE,OCR-D-IMG-BIN ocrd-tesserocr-segment-region -I OCR-D-IMG-BINPAGE -O OCR-D-SEG-REGION ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE From 4508e3ec47b1e7152e7899f7282e8eeb1920ce46 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 5 Feb 2020 17:55:51 +0100 Subject: [PATCH 41/93] =?UTF-8?q?=F0=9F=93=A6=20v0.0.4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocrd-tool.json | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index b954d41..5d181ef 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": "https://github.com/kba/ocrd_calamari", - "version": "0.0.3", + "version": "0.0.4", "tools": { "ocrd-calamari-recognize": { "executable": "ocrd-calamari-recognize", diff --git a/setup.py b/setup.py index 323d68a..4bf26b2 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup, find_packages setup( name='ocrd_calamari', - version='0.0.3', + version='0.0.4', description='Calamari bindings', long_description=Path('README.md').read_text(), long_description_content_type='text/markdown', From 
cf7a788854fe506bc41885debdf20d3442d4db34 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Feb 2020 13:02:02 +0100 Subject: [PATCH 42/93] =?UTF-8?q?=F0=9F=93=9D=20README-DEV:=20Mention=20cl?= =?UTF-8?q?eaning=20up=20the=20dict/=20directory?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-DEV.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README-DEV.md b/README-DEV.md index 366f436..b6f899f 100644 --- a/README-DEV.md +++ b/README-DEV.md @@ -17,5 +17,6 @@ Release * git push --tags PyPI: -* python sdist bdist_wheel -* twine upload dist/ocrd_calamari-* +* `rm -rf dist/` or backup if `dist/` exists already +* In the virtualenv: `python setup.py sdist bdist_wheel` +* `twine upload dist/ocrd_calamari-*` From b26194179cd8b9336d46cd9bf19a8abb927dcdeb Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Feb 2020 13:03:06 +0100 Subject: [PATCH 43/93] =?UTF-8?q?=F0=9F=93=9D=20README-DEV:=20Improve=20ma?= =?UTF-8?q?rkdown?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-DEV.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README-DEV.md b/README-DEV.md index b6f899f..9e739a4 100644 --- a/README-DEV.md +++ b/README-DEV.md @@ -10,11 +10,11 @@ make test Release ------- -* Update ocrd-tool.json version -* Update setup.py version -* git commit -m 'v' -* git tag -m 'v' 'v' -* git push --tags +* Update `ocrd-tool.json` version +* Update `setup.py` version +* `git commit -m 'v'` +* `git tag -m 'v' 'v'` +* `git push --tags` PyPI: * `rm -rf dist/` or backup if `dist/` exists already From 71096493ac6efb12f531e6a8e24a33c97e524766 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Feb 2020 13:04:29 +0100 Subject: [PATCH 44/93] =?UTF-8?q?=F0=9F=93=9D=20README-DEV:=20Improve=20in?= =?UTF-8?q?fo=20about=20releasing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
README-DEV.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README-DEV.md b/README-DEV.md index 9e739a4..8a1bbad 100644 --- a/README-DEV.md +++ b/README-DEV.md @@ -8,15 +8,15 @@ pip install -r requirements-test.txt make test ~~~ -Release -------- +Releasing +--------- * Update `ocrd-tool.json` version * Update `setup.py` version * `git commit -m 'v'` * `git tag -m 'v' 'v'` * `git push --tags` -PyPI: +### Uploading to PyPI * `rm -rf dist/` or backup if `dist/` exists already * In the virtualenv: `python setup.py sdist bdist_wheel` * `twine upload dist/ocrd_calamari-*` From 1fda419f2585cbba099263b0545b7e61f8b9793a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Feb 2020 13:43:36 +0100 Subject: [PATCH 45/93] =?UTF-8?q?=F0=9F=90=B3=20Fix=20Docker=20build?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 6bd7f73..07bc945 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,7 @@ COPY Makefile . COPY setup.py . COPY ocrd-tool.json . COPY requirements.txt . +COPY README.md . 
COPY ocrd_calamari ocrd_calamari RUN make calamari/build From 7c18b1d39152e2b50cdcfd3d4bd328ba2317685a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Feb 2020 13:43:59 +0100 Subject: [PATCH 46/93] =?UTF-8?q?=F0=9F=90=B3=20Docker:=20Use=20ocrd/core:?= =?UTF-8?q?master=20instead=20of=20outdated=20:edge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 07bc945..93bf1e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ocrd/core:edge +FROM ocrd/core MAINTAINER OCR-D ENV DEBIAN_FRONTEND noninteractive ENV PYTHONIOENCODING utf8 From 41f5c8a8fa899beaf1231c4cbe7666fba135edfe Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Feb 2020 13:44:43 +0100 Subject: [PATCH 47/93] =?UTF-8?q?=F0=9F=90=B3=20Docker:=20Upgrade=20pip=20?= =?UTF-8?q?to=20silence=20warning=20and=20fix=20potential=20other=20proble?= =?UTF-8?q?ms?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 93bf1e8..e00889c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,8 @@ COPY README.md . COPY ocrd_calamari ocrd_calamari RUN make calamari/build -RUN pip3 install . +RUN pip3 install --upgrade pip && \ + pip3 install . 
ENTRYPOINT ["/usr/local/bin/ocrd-calamari-recognize"] From a2d1d76dbdb0ca2207db31e67d6e7d7ce4c1c844 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Feb 2020 13:52:05 +0100 Subject: [PATCH 48/93] =?UTF-8?q?=F0=9F=90=B3=20Docker:=20Do=20not=20use?= =?UTF-8?q?=20the=20make=20target=20to=20install=20calamari-ocr,=20stick?= =?UTF-8?q?=20to=20pip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 1 - Makefile | 9 --------- 2 files changed, 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index e00889c..d04c790 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,6 @@ COPY requirements.txt . COPY README.md . COPY ocrd_calamari ocrd_calamari -RUN make calamari/build RUN pip3 install --upgrade pip && \ pip3 install . diff --git a/Makefile b/Makefile index c3e85ab..c6752fa 100644 --- a/Makefile +++ b/Makefile @@ -11,9 +11,7 @@ help: @echo " Targets" @echo "" @echo " install Install ocrd_calamari" - @echo " calamari Clone calamari repo" @echo " gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)" - @echo " calamari/build pip install calamari" @echo " deps-test Install testing python deps via pip" @echo " repo/assets Clone OCR-D/assets to ./repo/assets" @echo " test/assets Setup test assets" @@ -33,9 +31,6 @@ help: install: $(PIP_INSTALL) . -# Clone calamari repo -calamari: - $(GIT_CLONE) https://github.com/chwick/calamari gt4histocr-calamari: mkdir gt4histocr-calamari @@ -50,10 +45,6 @@ actevedef_718448162: unzip actevedef_718448162.zip -# pip install calamari -calamari/build: calamari - cd calamari && $(PIP_INSTALL) . 
- # # Assets and Tests From 303172b279374f12a48999aa2fa63db7901d0909 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Feb 2020 13:53:55 +0100 Subject: [PATCH 49/93] =?UTF-8?q?=F0=9F=93=9D=20Document=20make=20targets?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c6752fa..e65df22 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,7 @@ help: @echo "" @echo " install Install ocrd_calamari" @echo " gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)" + @echo " actevedef_718448162 Download example data" @echo " deps-test Install testing python deps via pip" @echo " repo/assets Clone OCR-D/assets to ./repo/assets" @echo " test/assets Setup test assets" @@ -32,6 +33,7 @@ install: $(PIP_INSTALL) . +# Get GT4HistOCR Calamari model (from SBB) gt4histocr-calamari: mkdir gt4histocr-calamari cd gt4histocr-calamari && \ @@ -39,7 +41,7 @@ gt4histocr-calamari: tar xfv model.tar.xz && \ rm model.tar.xz -# Example data +# Download example data actevedef_718448162: wget https://qurator-data.de/examples/actevedef_718448162.zip && \ unzip actevedef_718448162.zip From 30f7e1b2469121361eb94c886978aef0ad60631b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Feb 2020 14:01:36 +0100 Subject: [PATCH 50/93] =?UTF-8?q?=F0=9F=90=B3=20Docker:=20Run=20pip3=20che?= =?UTF-8?q?ck=20for=20good=20measure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d04c790..6d63150 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,8 @@ COPY README.md . COPY ocrd_calamari ocrd_calamari RUN pip3 install --upgrade pip && \ - pip3 install . + pip3 install . 
&& \ + pip3 check ENTRYPOINT ["/usr/local/bin/ocrd-calamari-recognize"] From 5b6d8b3f41db2d383eeb1c10acaf7511617b6165 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 12 Feb 2020 12:25:25 +0100 Subject: [PATCH 51/93] =?UTF-8?q?=F0=9F=90=9B=20Build=20line=20text=20on?= =?UTF-8?q?=20our=20own?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calamari does whitespace post-processing on prediction.sentence, while it does not do the same on prediction.positions. Do it on our own to have consistency. Fixes GH-37. --- ocrd_calamari/recognize.py | 44 ++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index c588a62..70c0112 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -1,6 +1,7 @@ from __future__ import absolute_import import os +import itertools from glob import glob import numpy as np @@ -84,8 +85,39 @@ class CalamariRecognize(Processor): prediction = self.voter.vote_prediction_result(raw_results) prediction.id = "voted" - line_text = prediction.sentence - line_conf = prediction.avg_char_probability + # Build line text on our own + # + # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same + # on prediction.positions. Do it on our own to have consistency. 
+ # + # XXX Check Calamari's built-in post-processing on prediction.sentence + + def _drop_leading_spaces(positions): + return list(itertools.dropwhile(lambda p: p.chars[0].char == " ", positions)) + def _drop_trailing_spaces(positions): + return list(reversed(_drop_leading_spaces(reversed(positions)))) + def _drop_double_spaces(positions): + def _drop_double_spaces_generator(positions): + last_was_space = False + for p in positions: + if p.chars[0].char == " ": + if not last_was_space: + yield p + last_was_space = True + else: + yield p + last_was_space = False + return list(_drop_double_spaces_generator(positions)) + positions = prediction.positions + positions = _drop_leading_spaces(positions) + positions = _drop_trailing_spaces(positions) + positions = _drop_double_spaces(positions) + positions = list(positions) + + line_text = ''.join(p.chars[0].char for p in positions) + if line_text != prediction.sentence: + log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'", + line_text, prediction.sentence) # Delete existing results if line.get_TextEquiv(): @@ -96,8 +128,10 @@ class CalamariRecognize(Processor): line.set_Word([]) # Save line results + line_conf = prediction.avg_char_probability line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) + # Save word results # # Calamari OCR does not provide word positions, so we infer word positions from a. 
text segmentation @@ -124,10 +158,12 @@ class CalamariRecognize(Processor): word_no = 0 i = 0 - for word_text in _words(prediction.sentence): + + + for word_text in _words(line_text): word_length = len(word_text) if not all(c == ' ' for c in word_text): - word_positions = prediction.positions[i:i+word_length] + word_positions = positions[i:i+word_length] word_start = word_positions[0].global_start word_end = word_positions[-1].global_end From cd8f6a5fcbf70f0cb9fb403943320d35d8364911 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 12 Feb 2020 13:32:10 +0100 Subject: [PATCH 52/93] =?UTF-8?q?=F0=9F=90=9B=20Use=20line=20id=20for=20de?= =?UTF-8?q?bug=20message?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 70c0112..4812796 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -73,7 +73,7 @@ class CalamariRecognize(Processor): textlines = region.get_TextLine() log.info("About to recognize %i lines of region '%s'", len(textlines), region.id) for (line_no, line) in enumerate(textlines): - log.debug("Recognizing line '%s' in region '%s'", line_no, region.id) + log.debug("Recognizing line '%s' in region '%s'", line.id, region.id) line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) line_image_np = np.array(line_image, dtype=np.uint8) From d2c843aa3f9fe0d2fd104e451a021097257b398a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 12 Feb 2020 13:33:29 +0100 Subject: [PATCH 53/93] =?UTF-8?q?=F0=9F=93=A6=20v0.0.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocrd-tool.json | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 5d181ef..e0efca2 
100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": "https://github.com/kba/ocrd_calamari", - "version": "0.0.4", + "version": "0.0.5", "tools": { "ocrd-calamari-recognize": { "executable": "ocrd-calamari-recognize", diff --git a/setup.py b/setup.py index 4bf26b2..203cebf 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup, find_packages setup( name='ocrd_calamari', - version='0.0.4', + version='0.0.5', description='Calamari bindings', long_description=Path('README.md').read_text(), long_description_content_type='text/markdown', From 0c9e1f13c74a52208831cb2ce11ee340148dd846 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 12 Feb 2020 16:38:45 +0100 Subject: [PATCH 54/93] =?UTF-8?q?=F0=9F=90=9B=20Sort=20predictions=20in=20?= =?UTF-8?q?exactly=20the=20same=20way=20to=20make=20sure=20we=20are=20corr?= =?UTF-8?q?ectly=20removing=20spaces?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 4812796..79cbd3b 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -92,8 +92,16 @@ class CalamariRecognize(Processor): # # XXX Check Calamari's built-in post-processing on prediction.sentence + + def _sort_chars(p): + """Filter and sort chars of prediction p""" + chars = p.chars + chars = [c for c in chars if c.char] # XXX Note that omission probabilities are not normalized?! 
+ chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']] + chars = sorted(chars, key=lambda k: k.probability, reverse=True) + return chars def _drop_leading_spaces(positions): - return list(itertools.dropwhile(lambda p: p.chars[0].char == " ", positions)) + return list(itertools.dropwhile(lambda p: _sort_chars(p)[0].char == " ", positions)) def _drop_trailing_spaces(positions): return list(reversed(_drop_leading_spaces(reversed(positions)))) def _drop_double_spaces(positions): @@ -184,17 +192,10 @@ class CalamariRecognize(Processor): glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points)) - # Filter predictions - chars = p.chars - chars = [c for c in chars if c.char] # XXX Note that omission probabilities are not normalized?! - chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']] - - # Sort and add predictions (= TextEquivs) - chars = sorted(chars, key=lambda k: k.probability, reverse=True) - char_index = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs - for char in chars: + # Add predictions (= TextEquivs) + char_index_start = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs + for char_index, char in enumerate(_sort_chars(p), start=char_index_start): glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability)) - char_index += 1 word.add_Glyph(glyph) From 0334a358704d1f2f00a8197add96776521fd5608 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 12 Feb 2020 17:18:37 +0100 Subject: [PATCH 55/93] =?UTF-8?q?=F0=9F=90=9B=20Sort=20predictions=20in=20?= =?UTF-8?q?exactly=20the=20same=20way,=20also=20when=20building=20the=20te?= =?UTF-8?q?xt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 
79cbd3b..8ae0a17 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -122,7 +122,7 @@ class CalamariRecognize(Processor): positions = _drop_double_spaces(positions) positions = list(positions) - line_text = ''.join(p.chars[0].char for p in positions) + line_text = ''.join(_sort_chars(p)[0].char for p in positions) if line_text != prediction.sentence: log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'", line_text, prediction.sentence) From 62e5e0c295d9274bc507150cb1070c9cea301c33 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2020 16:00:58 +0100 Subject: [PATCH 56/93] =?UTF-8?q?=F0=9F=90=9B=20ocrd-tool.json:=20Fix=20Gi?= =?UTF-8?q?tHub=20url=20by=20s/kba/OCR-D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index e0efca2..a99e24b 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -1,5 +1,5 @@ { - "git_url": "https://github.com/kba/ocrd_calamari", + "git_url": "https://github.com/OCR-D/ocrd_calamari", "version": "0.0.5", "tools": { "ocrd-calamari-recognize": { From 69df78bce114a624d8d95af0520c29e22f1152ed Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2020 16:02:02 +0100 Subject: [PATCH 57/93] =?UTF-8?q?=F0=9F=90=9B=20setup.py:=20Fix=20GitHub?= =?UTF-8?q?=20url=20by=20s/kba/OCR-D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 203cebf..30f7c75 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( long_description_content_type='text/markdown', author='Konstantin Baierer, Mike Gerber', author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de', - url='https://github.com/kba/ocrd_calamari', + 
url='https://github.com/OCR-D/ocrd_calamari', license='Apache License 2.0', packages=find_packages(exclude=('tests', 'docs')), install_requires=Path('requirements.txt').read_text().split('\n'), From 123ee61a8bdea975fce14127561400e8c6898651 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2020 16:04:17 +0100 Subject: [PATCH 58/93] v0.0.6 --- ocrd_calamari/ocrd-tool.json | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index a99e24b..3507ebf 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": "https://github.com/OCR-D/ocrd_calamari", - "version": "0.0.5", + "version": "0.0.6", "tools": { "ocrd-calamari-recognize": { "executable": "ocrd-calamari-recognize", diff --git a/setup.py b/setup.py index 30f7c75..7879b64 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup, find_packages setup( name='ocrd_calamari', - version='0.0.5', + version='0.0.6', description='Calamari bindings', long_description=Path('README.md').read_text(), long_description_content_type='text/markdown', From fb538845d8e259b64dfb07d18403a649142fb87e Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 13 Feb 2020 16:49:11 +0100 Subject: [PATCH 59/93] =?UTF-8?q?=F0=9F=93=84=20Update=20license=20(Fixes?= =?UTF-8?q?=20#35)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set copyright owner name. Also, going along the lines of "update the year when substantial revision of the work happened", set the copyright years. The latter may not be necessary, because "life of author + 70 years" or something. --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 261eeb9..bc7973a 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. 
- Copyright [yyyy] [name of copyright owner] + Copyright 2018-2020 Konstantin Baierer, Mike Gerber Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From e03ff4064bc1801b4f0b74c9771e71426e2a0142 Mon Sep 17 00:00:00 2001 From: kba Date: Sun, 31 May 2020 20:31:06 +0200 Subject: [PATCH 60/93] setup.py: exclude "test", not "tests", from installation --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7879b64..0e417fb 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setup( author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de', url='https://github.com/OCR-D/ocrd_calamari', license='Apache License 2.0', - packages=find_packages(exclude=('tests', 'docs')), + packages=find_packages(exclude=('test', 'docs')), install_requires=Path('requirements.txt').read_text().split('\n'), package_data={ '': ['*.json', '*.yml', '*.yaml'], From 7dff7784c5533ff569160b278c7ca6462c3603d4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 21 Jul 2020 18:16:52 +0200 Subject: [PATCH 61/93] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Update=20to=20Calama?= =?UTF-8?q?ri=201.0.x?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0a426e0..53a18b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy -tensorflow-gpu == 1.15.* -calamari-ocr == 0.3.5 +tensorflow-gpu == 2.2.* +calamari-ocr == 1.0.* setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? 
click ocrd >= 2.2.1 From 8ab57e44dc45bb3004200a10ed1cf467f44447af Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 21 Jul 2020 18:52:47 +0200 Subject: [PATCH 62/93] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Update=20model=20dow?= =?UTF-8?q?nload=20for=20Calamari=201.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e65df22..61ca7f3 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ install: gt4histocr-calamari: mkdir gt4histocr-calamari cd gt4histocr-calamari && \ - wget https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz && \ + wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \ tar xfv model.tar.xz && \ rm model.tar.xz From 027fcd7d75c621f9130c2b3b96e5d996360ac893 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 21 Jul 2020 20:10:36 +0200 Subject: [PATCH 63/93] =?UTF-8?q?=F0=9F=90=9B=20Fix=20test=20file=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_recognize.py b/test/test_recognize.py index 5db48cf..54faf87 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -12,7 +12,7 @@ from ocrd_calamari import CalamariRecognize from .base import assets -METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml') +METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml') WORKSPACE_DIR = '/tmp/test-ocrd-calamari' CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json') From 9ea50e25d17acbc19426c15860c7b37b194f0b67 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 21 Jul 2020 18:16:52 +0200 Subject: [PATCH 64/93] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Update=20to=20Calama?= =?UTF-8?q?ri=201.0.x?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0a426e0..53a18b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy -tensorflow-gpu == 1.15.* -calamari-ocr == 0.3.5 +tensorflow-gpu == 2.2.* +calamari-ocr == 1.0.* setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? click ocrd >= 2.2.1 From 7584d0135ce6f3988909244a1a6f49fbadf314ad Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 21 Jul 2020 18:52:47 +0200 Subject: [PATCH 65/93] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Update=20model=20dow?= =?UTF-8?q?nload=20for=20Calamari=201.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e65df22..61ca7f3 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ install: gt4histocr-calamari: mkdir gt4histocr-calamari cd gt4histocr-calamari && \ - wget https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz && \ + wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \ tar xfv model.tar.xz && \ rm model.tar.xz From 93190fae3b3d8b5b9a68b37f604c43c34979e5d4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 22 Jul 2020 16:03:10 +0200 Subject: [PATCH 66/93] =?UTF-8?q?=E2=9A=A1=20Recognize=20more=20than=20one?= =?UTF-8?q?=20line=20at=20a=20time=20(Fixes=20gh#20)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 8ae0a17..d040550 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -72,13 +72,16 @@ class CalamariRecognize(Processor): textlines = 
region.get_TextLine() log.info("About to recognize %i lines of region '%s'", len(textlines), region.id) - for (line_no, line) in enumerate(textlines): - log.debug("Recognizing line '%s' in region '%s'", line.id, region.id) + line_images_np = [] + for (line_no, line) in enumerate(textlines): line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) line_image_np = np.array(line_image, dtype=np.uint8) + line_images_np.append(line_image_np) + raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False) + + for line, raw_results in zip(textlines, raw_results_all): - raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0] for i, p in enumerate(raw_results): p.prediction.id = "fold_{}".format(i) From d9afb05cf32e75481ff090e3ffb7d30fd2cb7511 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 23 Jul 2020 12:44:15 +0200 Subject: [PATCH 67/93] =?UTF-8?q?=F0=9F=90=9B=20Use=20TensorFlow=20>=3D=20?= =?UTF-8?q?2.3.0rc2=20to=20fix=20retracing=20warnings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 53a18b0..f7ece9d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy -tensorflow-gpu == 2.2.* +tensorflow >= 2.3.0rc2 calamari-ocr == 1.0.* setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? 
click From 0a9dbd0c25b71b034918db36393e793660554777 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 23 Jul 2020 13:04:11 +0200 Subject: [PATCH 68/93] =?UTF-8?q?=F0=9F=A7=B9=20Do=20not=20install=20numpy?= =?UTF-8?q?,=20let=20the=20TF=20dependency=20do=20it?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f7ece9d..a1de85a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -numpy tensorflow >= 2.3.0rc2 calamari-ocr == 1.0.* setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? From 046e3e8ee31e9ed973ae72ba108322f10fe11d00 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Aug 2020 11:27:59 +0200 Subject: [PATCH 69/93] =?UTF-8?q?=F0=9F=9A=A7=20Tests:=20Add=20some=20TODO?= =?UTF-8?q?s=20re=20data=20+=20namespace=20version=20changes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_recognize.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_recognize.py b/test/test_recognize.py index 54faf87..70667aa 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -51,6 +51,8 @@ def workspace(): subprocess.call(['convert', ff, '-threshold', '50%', ff]) # Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text + # XXX Review data again + # XXX Make this more robust against namespace version changes for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"): workspace.download_file(of) for to_remove in ["//pc:Word", "//pc:TextEquiv"]: From 7da45a0ec1111e2ea28505b4e444e135bdcdf4b2 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Aug 2020 12:31:47 +0200 Subject: [PATCH 70/93] Set pcGtsId Newest OCR-D validation checks PAGE-XML pcGtsId against METS file/@ID. Set the pcGtsId here correctly. Fixes #40. 
--- ocrd_calamari/recognize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 8ae0a17..5555d75 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -222,6 +222,7 @@ class CalamariRecognize(Processor): file_id = self._make_file_id(input_file, n) + pcgts.set_pcGtsId(file_id) self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp, From 86410110bc9f0eaec93599e6f39a12c6a1234457 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Aug 2020 12:39:45 +0200 Subject: [PATCH 71/93] =?UTF-8?q?=F0=9F=93=A6=20v0.0.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocrd-tool.json | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 3507ebf..5941dcd 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": "https://github.com/OCR-D/ocrd_calamari", - "version": "0.0.6", + "version": "0.0.7", "tools": { "ocrd-calamari-recognize": { "executable": "ocrd-calamari-recognize", diff --git a/setup.py b/setup.py index 0e417fb..02114a2 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup, find_packages setup( name='ocrd_calamari', - version='0.0.6', + version='0.0.7', description='Calamari bindings', long_description=Path('README.md').read_text(), long_description_content_type='text/markdown', From f6dfedf837e1527099de325822e7bdc17611b04f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 6 Aug 2020 14:04:17 +0200 Subject: [PATCH 72/93] =?UTF-8?q?=F0=9F=97=92=EF=B8=8F=20README-DEV:=20Als?= =?UTF-8?q?o=20release=20on=20GitHub?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-DEV.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README-DEV.md b/README-DEV.md index 8a1bbad..40a237a 100644 --- 
a/README-DEV.md +++ b/README-DEV.md @@ -15,6 +15,7 @@ Releasing * `git commit -m 'v'` * `git tag -m 'v' 'v'` * `git push --tags` +* Do a release on GitHub ### Uploading to PyPI * `rm -rf dist/` or backup if `dist/` exists already From f746b73fd02436220f92bf46a1dce9d80c65cf72 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 6 Aug 2020 15:23:56 +0200 Subject: [PATCH 73/93] use make_file_id and assert_file_grp_cardinality --- ocrd_calamari/recognize.py | 12 +++++------- requirements.txt | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 5555d75..cf57e62 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -20,6 +20,7 @@ from ocrd_models.ocrd_page import ( from ocrd_utils import ( getLogger, concat_padded, coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1, + make_file_id, assert_file_grp_cardinality, MIMETYPE_PAGE ) @@ -46,17 +47,14 @@ class CalamariRecognize(Processor): voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper()) self.voter = voter_from_proto(voter_params) - def _make_file_id(self, input_file, n): - file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) - if file_id == input_file.ID: - file_id = concat_padded(self.output_file_grp, n) - return file_id - def process(self): """ Performs the recognition. 
""" + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + self._init_calamari() for (n, input_file) in enumerate(self.input_files): @@ -221,7 +219,7 @@ class CalamariRecognize(Processor): for name in self.parameter.keys()])])) - file_id = self._make_file_id(input_file, n) + file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) self.workspace.add_file( ID=file_id, diff --git a/requirements.txt b/requirements.txt index 0a426e0..58a0207 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ tensorflow-gpu == 1.15.* calamari-ocr == 0.3.5 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? click -ocrd >= 2.2.1 +ocrd >= 2.13.0 From c417a0ab7788842515ec6bccd7422ba932e24c03 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 3 Sep 2020 11:31:11 +0200 Subject: [PATCH 74/93] =?UTF-8?q?=F0=9F=93=9D=20README:=20Add=20a=20screen?= =?UTF-8?q?shot=20of=20example=20output?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index e158b8d..29adbf3 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ segmentation and the glyph positions. The provided glyph and word segmentation can be used for text extraction and highlighting, but is probably not useful for further image-based processing. 
+![Example output as viewed in PAGE Viewer](https://github.com/OCR-D/ocrd_calamari/raw/screenshots/output-in-page-viewer.jpg) + ## Installation ### From PyPI From 7705374cfc65566caba9d06a3e607bc314eb123d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 3 Sep 2020 11:44:47 +0200 Subject: [PATCH 75/93] =?UTF-8?q?=F0=9F=90=9B=20CircleCI:=20Ignore=20scree?= =?UTF-8?q?nshots=20branch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .circleci/config.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 11a4289..cc8e333 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,3 +23,6 @@ workflows: build: jobs: - build-python36 + branches: + ignore: + - screenshots From bb9b1ab41ba4436420a44640c19d4974e52cc805 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 3 Sep 2020 11:55:00 +0200 Subject: [PATCH 76/93] =?UTF-8?q?=F0=9F=90=9B=20CircleCI:=20Ignore=20scree?= =?UTF-8?q?nshots=20branch=20(second=20try)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .circleci/config.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index cc8e333..b90ef37 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -22,7 +22,8 @@ jobs: workflows: build: jobs: - - build-python36 - branches: - ignore: - - screenshots + - build-python36: + filters: + branches: + ignore: + - screenshots From e4982aff37b9eda9dd419a50ba1a993fbbd76e0a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 24 Sep 2020 10:25:47 +0200 Subject: [PATCH 77/93] getLogger per method --- ocrd_calamari/recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index cf57e62..9884433 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -27,7 +27,6 @@ from ocrd_utils import ( 
from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL TOOL = 'ocrd-calamari-recognize' -log = getLogger('processor.CalamariRecognize') class CalamariRecognize(Processor): @@ -51,6 +50,7 @@ class CalamariRecognize(Processor): """ Performs the recognition. """ + log = getLogger('processor.CalamariRecognize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) From 3156121ff79c645da6072101699e996b8cb07200 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 1 Oct 2020 13:21:14 +0200 Subject: [PATCH 78/93] =?UTF-8?q?=F0=9F=93=9D=20Let=20intro=20mention=20oc?= =?UTF-8?q?rd=5Fcalamari=20+=20PAGE=20XML,=20link=20to=20OCR-D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 29adbf3..3cf8227 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ## Introduction -This offers a OCR-D compliant workspace processor for the functionality of Calamari OCR. +**ocrd_calamari** offers a [OCR-D](https://ocr-d.de) compliant workspace processor for the functionality of Calamari OCR. It uses [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) documents as input and output. This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized image) as its input. 
From a5d46f0d28c40a5d70004b3042916ec10a6f30f3 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 1 Oct 2020 13:23:44 +0200 Subject: [PATCH 79/93] =?UTF-8?q?=F0=9F=9A=A7=20README:=20Mention=20METS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3cf8227..cb370b3 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ## Introduction -**ocrd_calamari** offers a [OCR-D](https://ocr-d.de) compliant workspace processor for the functionality of Calamari OCR. It uses [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) documents as input and output. +**ocrd_calamari** offers a [OCR-D](https://ocr-d.de) compliant workspace processor for the functionality of Calamari OCR. It uses OCR-D workspaces (METS) with [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) documents as input and output. This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized image) as its input. From af211d2a1b52656557a2a87c271a30a1579eeeea Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 23 Jul 2020 13:04:11 +0200 Subject: [PATCH 80/93] =?UTF-8?q?=F0=9F=A7=B9=20Do=20not=20install=20numpy?= =?UTF-8?q?,=20let=20the=20TF=20dependency=20do=20it?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 58a0207..9ceba65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -numpy tensorflow-gpu == 1.15.* calamari-ocr == 0.3.5 setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? 
From 0e59c2317a579d26ddc111a9d3b49d3eddcaa53e Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 25 Nov 2020 12:09:41 +0100 Subject: [PATCH 81/93] =?UTF-8?q?=F0=9F=91=B7=F0=9F=8F=BE=E2=80=8D?= =?UTF-8?q?=E2=99=82=EF=B8=8F=20Use=20gt4histocr-calamari1/=20as=20directo?= =?UTF-8?q?ry=20name=20for=20the=20Calmari=201=20model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 12 ++++++------ README.md | 8 ++++---- test/test_recognize.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 61ca7f3..00a8f69 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ help: @echo " Targets" @echo "" @echo " install Install ocrd_calamari" - @echo " gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)" + @echo " gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)" @echo " actevedef_718448162 Download example data" @echo " deps-test Install testing python deps via pip" @echo " repo/assets Clone OCR-D/assets to ./repo/assets" @@ -34,9 +34,9 @@ install: # Get GT4HistOCR Calamari model (from SBB) -gt4histocr-calamari: - mkdir gt4histocr-calamari - cd gt4histocr-calamari && \ +gt4histocr-calamari1: + mkdir -p gt4histocr-calamari1 + cd gt4histocr-calamari1 && \ wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \ tar xfv model.tar.xz && \ rm model.tar.xz @@ -73,12 +73,12 @@ assets-clean: rm -rf test/assets # Run unit tests -test: test/assets gt4histocr-calamari +test: test/assets gt4histocr-calamari1 # declare -p HTTP_PROXY $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) # Run unit tests and determine test coverage -coverage: test/assets gt4histocr-calamari +coverage: test/assets gt4histocr-calamari1 coverage erase make test PYTHON="coverage run" coverage report diff --git a/README.md b/README.md index e158b8d..9b111fb 100644 --- a/README.md +++ b/README.md @@ -41,8 +41,8 @@ pip install . 
Download models trained on GT4HistOCR data: ``` -make gt4histocr-calamari -ls gt4histocr-calamari +make gt4histocr-calamari1 +ls gt4histocr-calamari1 ``` ## Example Usage @@ -50,7 +50,7 @@ Before using `ocrd-calamari-recognize` get some example data and model, and prepare the document for OCR: ``` # Download model and example data -make gt4histocr-calamari +make gt4histocr-calamari1 make actevedef_718448162 # Create binarized images and line segmentation using other OCR-D projects @@ -62,7 +62,7 @@ ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE Finally recognize the text using ocrd_calamari and the downloaded model: ``` -ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI +ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari1/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI ``` You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions diff --git a/test/test_recognize.py b/test/test_recognize.py index 54faf87..eee45d9 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -14,7 +14,7 @@ from .base import assets METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml') WORKSPACE_DIR = '/tmp/test-ocrd-calamari' -CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json') +CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari1/*.ckpt.json') # Because XML namespace versions are so much fun, we not only use one, we use TWO! 
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } From 15bcfde180975f8cbf6ffc58bca0a172718f34e1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 25 Nov 2020 12:40:20 +0100 Subject: [PATCH 82/93] =?UTF-8?q?=F0=9F=90=9B=20Pin=20h5py=20to=20<=203=20?= =?UTF-8?q?because=20pip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f5a1a60..20b2ff2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +h5py < 3 # XXX tensorflow 2.4.0rc3 requires h5py~=2.10.0, but you'll have h5py 3.1.0 which is incompatible. tensorflow >= 2.3.0rc2 calamari-ocr == 1.0.* setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? From 448a5b0dbcb67065cce02e145d5466ff4ffdd9f2 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 25 Nov 2020 12:56:50 +0100 Subject: [PATCH 83/93] =?UTF-8?q?=F0=9F=93=A6=20v1.0.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocrd-tool.json | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 5941dcd..4494679 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": "https://github.com/OCR-D/ocrd_calamari", - "version": "0.0.7", + "version": "1.0.0", "tools": { "ocrd-calamari-recognize": { "executable": "ocrd-calamari-recognize", diff --git a/setup.py b/setup.py index 02114a2..0b5a880 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup, find_packages setup( name='ocrd_calamari', - version='0.0.7', + version='1.0.0', description='Calamari bindings', long_description=Path('README.md').read_text(), long_description_content_type='text/markdown', From 1c7fcda767ef35c0a19171bddfd82aac36207e81 Mon 
Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 25 Nov 2020 18:40:21 +0100 Subject: [PATCH 84/93] =?UTF-8?q?=F0=9F=93=9D=20README:=20Link=20to=20mode?= =?UTF-8?q?l=20download?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index effe3bb..d662189 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,8 @@ make gt4histocr-calamari1 ls gt4histocr-calamari1 ``` +Manual download: [model.tar.xz](https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz) + ## Example Usage Before using `ocrd-calamari-recognize` get some example data and model, and prepare the document for OCR: From df530877dc0a4b04d9d266ace7ead7930ed091fa Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 17 Dec 2020 17:18:29 +0100 Subject: [PATCH 85/93] check for empty line image, ht @andbue, fix #48 --- ocrd_calamari/recognize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index b21f6ed..b621a63 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -74,6 +74,7 @@ class CalamariRecognize(Processor): line_images_np = [] for (line_no, line) in enumerate(textlines): line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) + line_image = line_image if all(line_image.size) else [[0]] line_image_np = np.array(line_image, dtype=np.uint8) line_images_np.append(line_image_np) raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False) From fe973e58db11a6339e6b3d335a61b18bffd2d7d6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 17 Dec 2020 18:07:38 +0100 Subject: [PATCH 86/93] add version of calamari in --version output --- ocrd_calamari/recognize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index b21f6ed..381cac3 
100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -5,6 +5,7 @@ import itertools from glob import glob import numpy as np +from calamari_ocr import __version__ as calamari_version from calamari_ocr.ocr import MultiPredictor from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.proto import VoterParams @@ -33,7 +34,7 @@ class CalamariRecognize(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] + kwargs['version'] = '%s (calamari %s)' % (OCRD_TOOL['version'], calamari_version) super(CalamariRecognize, self).__init__(*args, **kwargs) def _init_calamari(self): From 83adfcfd5a41acd211c71650e4f55333d9909cdf Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 15:20:45 +0100 Subject: [PATCH 87/93] implement "checkpoint_dir" parameter as a simpler alternative to "checkpoint" --- README.md | 7 +++++++ ocrd_calamari/ocrd-tool.json | 4 ++++ ocrd_calamari/recognize.py | 2 ++ test/test_recognize.py | 19 ++++++++++++++++++- 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d662189..079f2a8 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,13 @@ Finally recognize the text using ocrd_calamari and the downloaded model: ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari1/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI ``` +or + +``` +ocrd-calamari-recognize -P checkpoint_dir ../gt4histocr-calamari1 -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI +``` + + You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions for additional parameters and default values. 
diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 4494679..691eeba 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -18,6 +18,10 @@ "OCR-D-OCR-CALAMARI" ], "parameters": { + "checkpoint_dir": { + "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory", + "type": "string", "format": "file", "cacheable": true + }, "checkpoint": { "description": "The calamari model files (*.ckpt.json)", "type": "string", "format": "file", "cacheable": true diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 381cac3..102c927 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -40,6 +40,8 @@ class CalamariRecognize(Processor): def _init_calamari(self): os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL + if self.parameter['checkpoint_dir']: + self.parameter['checkpoint'] = '%s/*.ckpt.json' % self.parameter['checkpoint_dir'] checkpoints = glob(self.parameter['checkpoint']) self.predictor = MultiPredictor(checkpoints=checkpoints) diff --git a/test/test_recognize.py b/test/test_recognize.py index 0d23c1f..7926404 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -14,7 +14,8 @@ from .base import assets METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml') WORKSPACE_DIR = '/tmp/test-ocrd-calamari' -CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari1/*.ckpt.json') +CHECKPPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1') +CHECKPOINT = os.path.join(CHECKPPOINT_DIR, '*.ckpt.json') # Because XML namespace versions are so much fun, we not only use one, we use TWO! 
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } @@ -81,6 +82,22 @@ def test_recognize(workspace): with open(page1, "r", encoding="utf-8") as f: assert "verſchuldeten" in f.read() +def test_recognize_with_checkpoint_dir(workspace): + CalamariRecognize( + workspace, + input_file_grp="OCR-D-GT-SEG-LINE", + output_file_grp="OCR-D-OCR-CALAMARI", + parameter={ + "checkpoin_dir": CHECKPOINT_DIR, + } + ).process() + workspace.save_mets() + + page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml") + assert os.path.exists(page1) + with open(page1, "r", encoding="utf-8") as f: + assert "verſchuldeten" in f.read() + def test_word_segmentation(workspace): CalamariRecognize( From d6804bd9c3047956c90756baa554d849121861d5 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 16:29:40 +0100 Subject: [PATCH 88/93] fix typos --- .gitignore | 1 + ocrd_calamari/recognize.py | 2 +- test/test_recognize.py | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 4061f82..0bea6c4 100644 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,4 @@ venv.bak/ /actevedef_718448162* /repo /test/assets +gt4histocr-calamari* diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 102c927..6269e55 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -40,7 +40,7 @@ class CalamariRecognize(Processor): def _init_calamari(self): os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL - if self.parameter['checkpoint_dir']: + if self.parameter.get('checkpoint_dir', None): self.parameter['checkpoint'] = '%s/*.ckpt.json' % self.parameter['checkpoint_dir'] checkpoints = glob(self.parameter['checkpoint']) self.predictor = MultiPredictor(checkpoints=checkpoints) diff --git a/test/test_recognize.py b/test/test_recognize.py index 7926404..0a1e558 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -14,8 +14,8 @@ from .base 
import assets METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml') WORKSPACE_DIR = '/tmp/test-ocrd-calamari' -CHECKPPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1') -CHECKPOINT = os.path.join(CHECKPPOINT_DIR, '*.ckpt.json') +CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1') +CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json') # Because XML namespace versions are so much fun, we not only use one, we use TWO! NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } @@ -88,7 +88,7 @@ def test_recognize_with_checkpoint_dir(workspace): input_file_grp="OCR-D-GT-SEG-LINE", output_file_grp="OCR-D-OCR-CALAMARI", parameter={ - "checkpoin_dir": CHECKPOINT_DIR, + "checkpoint_dir": CHECKPOINT_DIR, } ).process() workspace.save_mets() From fdd30ebb8932cc34f25cdce75d84d21d3ade95ae Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 16:34:27 +0100 Subject: [PATCH 89/93] also add tensorflow version to --version output --- ocrd_calamari/recognize.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 381cac3..06af5dc 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -27,6 +27,9 @@ from ocrd_utils import ( from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL +os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL +from tensorflow import __version__ as tensorflow_version + TOOL = 'ocrd-calamari-recognize' @@ -34,11 +37,10 @@ class CalamariRecognize(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = '%s (calamari %s)' % (OCRD_TOOL['version'], calamari_version) + kwargs['version'] = '%s (calamari %s, tensorflow %s)' % (OCRD_TOOL['version'], calamari_version, tensorflow_version) super(CalamariRecognize, self).__init__(*args, **kwargs) def _init_calamari(self): - 
os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL checkpoints = glob(self.parameter['checkpoint']) self.predictor = MultiPredictor(checkpoints=checkpoints) From 00e43b1d1f6675de546c52ffe3a386c3fab7eb89 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 19:11:44 +0100 Subject: [PATCH 90/93] use Processor.resolve_files to handle on-demand download of models via registry --- ocrd_calamari/recognize.py | 3 ++- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 6269e55..d896473 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -41,7 +41,8 @@ class CalamariRecognize(Processor): os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL if self.parameter.get('checkpoint_dir', None): - self.parameter['checkpoint'] = '%s/*.ckpt.json' % self.parameter['checkpoint_dir'] + resolved = self.resolve_resource(self.parameter['checkpoint_dir']) + self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved checkpoints = glob(self.parameter['checkpoint']) self.predictor = MultiPredictor(checkpoints=checkpoints) diff --git a/requirements.txt b/requirements.txt index 20b2ff2..cbfb800 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ tensorflow >= 2.3.0rc2 calamari-ocr == 1.0.* setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? 
click -ocrd >= 2.13.0 +ocrd >= 2.22.0 From b3d012412babfd84a47bd84e77ffeb0bcff89ec5 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 21 Jan 2021 12:24:12 +0100 Subject: [PATCH 91/93] =?UTF-8?q?=F0=9F=93=9D=20README:=20Use=20new-style?= =?UTF-8?q?=20OCR-D=20parameter=20syntax=20and=20new-style=20mixed=20outpu?= =?UTF-8?q?t=20file=20groups?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 079f2a8..d277479 100644 --- a/README.md +++ b/README.md @@ -59,20 +59,20 @@ make actevedef_718448162 # Create binarized images and line segmentation using other OCR-D projects cd actevedef_718448162 -ocrd-olena-binarize -p '{ "impl": "sauvola-ms-split" }' -I OCR-D-IMG -O OCR-D-IMG-BINPAGE,OCR-D-IMG-BIN -ocrd-tesserocr-segment-region -I OCR-D-IMG-BINPAGE -O OCR-D-SEG-REGION +ocrd-olena-binarize -P impl sauvola-ms-split -I OCR-D-IMG -O OCR-D-IMG-BIN +ocrd-tesserocr-segment-region -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE ``` Finally recognize the text using ocrd_calamari and the downloaded model: ``` -ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari1/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI +ocrd-calamari-recognize -P checkpoint "../gt4histocr-calamari1/*.ckpt.json" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI ``` or ``` -ocrd-calamari-recognize -P checkpoint_dir ../gt4histocr-calamari1 -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI +ocrd-calamari-recognize -P checkpoint_dir "../gt4histocr-calamari1" -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI ``` From 03f5e44e624a09577f4bea3defbb27059c971a9d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 27 Jan 2021 13:59:45 +0100 Subject: [PATCH 92/93] define default for checkpoint_dir, but allow checkpoint still --- ocrd_calamari/ocrd-tool.json | 2 +- ocrd_calamari/recognize.py | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 691eeba..467cdec 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -20,7 +20,7 @@ "parameters": { "checkpoint_dir": { "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory", - "type": "string", "format": "file", "cacheable": true + "type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0" }, "checkpoint": { "description": "The calamari model files (*.ckpt.json)", diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index d896473..0fe03c7 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -40,7 +40,7 @@ class CalamariRecognize(Processor): def _init_calamari(self): os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL - if self.parameter.get('checkpoint_dir', None): + if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None): resolved = self.resolve_resource(self.parameter['checkpoint_dir']) self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved checkpoints = glob(self.parameter['checkpoint']) From 1bb72cbaf1f63575f117b7d454bb48d50121a794 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 28 Jan 2021 15:45:03 +0100 Subject: [PATCH 93/93] =?UTF-8?q?=F0=9F=93=A6=20v1.0.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocrd-tool.json | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 467cdec..d4f83fa 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": "https://github.com/OCR-D/ocrd_calamari", - "version": "1.0.0", + "version": "1.0.1", "tools": { "ocrd-calamari-recognize": { "executable": "ocrd-calamari-recognize", diff --git 
a/setup.py b/setup.py index 0b5a880..2a98d62 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup, find_packages setup( name='ocrd_calamari', - version='1.0.0', + version='1.0.1', description='Calamari bindings', long_description=Path('README.md').read_text(), long_description_content_type='text/markdown',