ocrd_calamari (mirror of https://github.com/mikegerber/ocrd_calamari.git)

Merge pull request #70 from bertsky/patch-2
add checkpoint_dir content-type, remove checkpoint variant

Commit 1eb342ef65: 6 changed files with 42 additions and 78 deletions
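The substance of the change: model selection now goes only through the checkpoint_dir parameter, a directory of Calamari *.ckpt.json checkpoints that is resolved like any other OCR-D resource, and the separate checkpoint glob parameter is dropped. A minimal sketch of the resulting loading path (the calamari_ocr import path and the local model directory name are assumptions, not part of this diff):

# Hedged sketch, not the processor code itself: load every checkpoint found in a
# model directory, as the new setup code in recognize.py does further below.
from glob import glob
from calamari_ocr.ocr import MultiPredictor  # import path assumed for Calamari 1.x

checkpoint_dir = "qurator-gt4histocr-1.0"    # an already-resolved model directory (assumed local)
checkpoints = glob("%s/*.ckpt.json" % checkpoint_dir)
predictor = MultiPredictor(checkpoints=checkpoints)  # one predictor per checkpoint; outputs are voted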
@@ -14,8 +14,7 @@ jobs:
       - run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
       - checkout
       - run: pip3 install --upgrade pip
-      - run: make install PIP_INSTALL="pip3 install"
-      - run: pip3 install -r requirements-test.txt
+      - run: make install deps-test PIP_INSTALL="pip3 install"
       - run: make coverage LC_ALL=en_US.utf8
       - codecov/upload

Makefile (27 lines changed)
@@ -3,6 +3,7 @@ PIP_INSTALL = pip3 install
 GIT_CLONE = git clone
 PYTHON = python3
 PYTEST_ARGS = -W 'ignore::DeprecationWarning' -W 'ignore::FutureWarning'
+MODEL = qurator-gt4histocr-1.0

 # BEGIN-EVAL makefile-parser --make-help Makefile

@@ -11,7 +12,7 @@ help:
 	@echo " Targets"
 	@echo ""
 	@echo " install Install ocrd_calamari"
-	@echo " gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
+	@echo " $(MODEL) Get Calamari model (from SBB)"
 	@echo " actevedef_718448162 Download example data"
 	@echo " deps-test Install testing python deps via pip"
 	@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@@ -25,6 +26,7 @@ help:
 	@echo " PYTHON '$(PYTHON)'"
 	@echo " PIP_INSTALL '$(PIP_INSTALL)'"
 	@echo " GIT_CLONE '$(GIT_CLONE)'"
+	@echo " MODEL '$(MODEL)'"

 # END-EVAL

@@ -34,17 +36,14 @@ install:


 # Get GT4HistOCR Calamari model (from SBB)
-gt4histocr-calamari1:
-	mkdir -p gt4histocr-calamari1
-	cd gt4histocr-calamari1 && \
-	wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
-	tar xfv model.tar.xz && \
-	rm model.tar.xz
+$(MODEL):
+	ocrd resmgr download ocrd-calamari-recognize $@

-# Download example data
+# Download example data (not used currently)
 actevedef_718448162:
-	wget https://qurator-data.de/examples/actevedef_718448162.zip && \
-	unzip actevedef_718448162.zip
+	wget https://qurator-data.de/examples/actevedef_718448162.zip \
+	&& unzip actevedef_718448162.zip \
+	&& rm actevedef_718448162.zip


@@ -54,7 +53,7 @@ actevedef_718448162:

 # Install testing python deps via pip
 deps-test:
-	$(PIP) install -r requirements_test.txt
+	$(PIP_INSTALL) -r requirements-test.txt


 # Clone OCR-D/assets to ./repo/assets
@@ -73,15 +72,15 @@ assets-clean:
 	rm -rf test/assets

 # Run unit tests
-test: test/assets gt4histocr-calamari1
+test: test/assets $(MODEL)
 	# declare -p HTTP_PROXY
 	$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)

 # Run unit tests and determine test coverage
-coverage: test/assets gt4histocr-calamari1
+coverage: test/assets $(MODEL)
 	coverage erase
 	make test PYTHON="coverage run"
 	coverage report
 	coverage html

-.PHONY: assets-clean test
+.PHONY: install assets-clean deps-test test coverage $(MODEL)

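The $(MODEL) target now delegates model fetching to the OCR-D resource manager instead of wget and tar. Roughly the same step as a hedged Python sketch (the command line is taken verbatim from the recipe above; wrapping it in subprocess is only for illustration):

# Download the default model the same way "make $(MODEL)" now does.
import subprocess

model = "qurator-gt4histocr-1.0"  # value of MODEL above
subprocess.run(["ocrd", "resmgr", "download", "ocrd-calamari-recognize", model], check=True)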
@@ -20,11 +20,11 @@
   "parameters": {
     "checkpoint_dir": {
       "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
-      "type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
-    },
-    "checkpoint": {
-      "description": "The calamari model files (*.ckpt.json)",
-      "type": "string", "format": "file", "cacheable": true
+      "type": "string",
+      "format": "uri",
+      "content-type": "text/directory",
+      "cacheable": true,
+      "default": "qurator-gt4histocr-1.0"
     },
     "voter": {
       "description": "The voting algorithm to use",

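With checkpoint and checkpoint_dir folded into a single parameter, a caller only passes the model directory (or its resource name), mirroring the updated tests further below. A hedged sketch, assuming the import path and an already prepared workspace:

# Hypothetical call site; "workspace" is an ocrd.Workspace set up elsewhere.
from ocrd_calamari import CalamariRecognize  # import path assumed

CalamariRecognize(
    workspace,
    input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
    output_file_grp="OCR-D-OCR-CALAMARI",
    parameter={"checkpoint_dir": "qurator-gt4histocr-1.0"},  # resource name or directory path
).process()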
@@ -47,10 +47,8 @@ class CalamariRecognize(Processor):
         """
         Set up the model prior to processing.
         """
-        if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
-            resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
-            self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
-        checkpoints = glob(self.parameter['checkpoint'])
+        resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
+        checkpoints = glob('%s/*.ckpt.json' % resolved)
         self.predictor = MultiPredictor(checkpoints=checkpoints)

         self.network_input_channels = self.predictor.predictors[0].network.input_channels

@@ -244,18 +242,7 @@ class CalamariRecognize(Processor):

             # Add metadata about this operation and its runtime parameters:
-            metadata = pcgts.get_Metadata()  # ensured by from_file()
-            metadata.add_MetadataItem(
-                MetadataItemType(type_="processingStep",
-                                 name=self.ocrd_tool['steps'][0],
-                                 value=TOOL,
-                                 Labels=[LabelsType(
-                                     externalModel="ocrd-tool",
-                                     externalId="parameters",
-                                     Label=[LabelType(type_=name, value=self.parameter[name])
-                                            for name in self.parameter.keys()])]))
+            self.add_metadata(pcgts)

             file_id = make_file_id(input_file, self.output_file_grp)
             pcgts.set_pcGtsId(file_id)
             self.workspace.add_file(

@@ -4,6 +4,9 @@ import os
 import sys

 from test.assets import assets
+from ocrd_utils import initLogging

 PWD = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(PWD + '/../ocrd')
+
+initLogging()

@@ -14,8 +14,7 @@ from .base import assets

 METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
 WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
-CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
-CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
+CHECKPOINT_DIR = os.getenv('MODEL')

 # Because XML namespace versions are so much fun, we not only use one, we use TWO!
 NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
@@ -31,14 +30,6 @@ def workspace():
     resolver = Resolver()
     workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)

-    # XXX Work around data bug(?):
-    # PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download
-    os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG'))
-    for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
-        urllib.request.urlretrieve(
-            "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
-            os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
-
     # The binarization options I have are:
     #
     # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
@@ -47,21 +38,22 @@ def workspace():
     # c. just fumble with the original files
     #
     # So I'm going for option c.
-    for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
-        ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
-        subprocess.call(['convert', ff, '-threshold', '50%', ff])
+    for imgf in workspace.mets.find_files(fileGrp="OCR-D-IMG"):
+        imgf = workspace.download_file(imgf)
+        path = os.path.join(workspace.directory, imgf.local_filename)
+        subprocess.call(['mogrify', '-threshold', '50%', path])

     # Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text
     # XXX Review data again
     # XXX Make this more robust against namespace version changes
-    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
+    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-WORD-GLYPH"):
         workspace.download_file(of)
+        path = os.path.join(workspace.directory, of.local_filename)
+        tree = etree.parse(path)
         for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
-            for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
-                tree = etree.parse(ff)
             for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
                 e.getparent().remove(e)
-            tree.write(ff, xml_declaration=True, encoding="utf-8")
+        tree.write(path, xml_declaration=True, encoding="utf-8")

     return workspace

@@ -69,23 +61,7 @@ def workspace():
 def test_recognize(workspace):
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
-        output_file_grp="OCR-D-OCR-CALAMARI",
-        parameter={
-            "checkpoint": CHECKPOINT,
-        }
-    ).process()
-    workspace.save_mets()
-
-    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
-    assert os.path.exists(page1)
-    with open(page1, "r", encoding="utf-8") as f:
-        assert "verſchuldeten" in f.read()
-
-def test_recognize_with_checkpoint_dir(workspace):
-    CalamariRecognize(
-        workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
             "checkpoint_dir": CHECKPOINT_DIR,
@@ -103,9 +79,9 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
     caplog.set_level(logging.WARNING)
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI-BROKEN",
-        parameter={'checkpoint': CHECKPOINT}
+        parameter={'checkpoint_dir': CHECKPOINT_DIR}
     ).process()

     interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
@@ -115,10 +91,10 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
 def test_word_segmentation(workspace):
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
-            "checkpoint": CHECKPOINT,
+            "checkpoint_dir": CHECKPOINT_DIR,
             "textequiv_level": "word",  # Note that we're going down to word level here
         }
     ).process()
@@ -147,10 +123,10 @@ def test_word_segmentation(workspace):
 def test_glyphs(workspace):
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
-            "checkpoint": CHECKPOINT,
+            "checkpoint_dir": CHECKPOINT_DIR,
             "textequiv_level": "glyph",  # Note that we're going down to glyph level here
         }
     ).process()
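Since CHECKPOINT_DIR now comes from the MODEL environment variable, the tests need MODEL set; normally "make test" takes care of that. A hedged sketch of driving the suite directly from Python (the model name and pytest flags mirror the Makefile; running pytest in-process is just one way to do it):

# Run the updated tests without make; assumes the model was fetched beforehand,
# e.g. via "ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0".
import os
import pytest

os.environ["MODEL"] = "qurator-gt4histocr-1.0"   # read by CHECKPOINT_DIR = os.getenv('MODEL')
pytest.main(["-W", "ignore::DeprecationWarning", "-W", "ignore::FutureWarning",
             "--continue-on-collection-errors", "test"])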