Merge branch 'feat/update-calamari1'

fix/readme-no-checkpoint
Gerber, Mike 4 years ago
commit 8fcd331fbd

@ -11,7 +11,7 @@ help:
@echo " Targets" @echo " Targets"
@echo "" @echo ""
@echo " install Install ocrd_calamari" @echo " install Install ocrd_calamari"
@echo " gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)" @echo " gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
@echo " actevedef_718448162 Download example data" @echo " actevedef_718448162 Download example data"
@echo " deps-test Install testing python deps via pip" @echo " deps-test Install testing python deps via pip"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets" @echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@ -34,10 +34,10 @@ install:
# Get GT4HistOCR Calamari model (from SBB) # Get GT4HistOCR Calamari model (from SBB)
gt4histocr-calamari: gt4histocr-calamari1:
mkdir gt4histocr-calamari mkdir -p gt4histocr-calamari1
cd gt4histocr-calamari && \ cd gt4histocr-calamari1 && \
wget https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz && \ wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
tar xfv model.tar.xz && \ tar xfv model.tar.xz && \
rm model.tar.xz rm model.tar.xz
@ -73,12 +73,12 @@ assets-clean:
rm -rf test/assets rm -rf test/assets
# Run unit tests # Run unit tests
test: test/assets gt4histocr-calamari test: test/assets gt4histocr-calamari1
# declare -p HTTP_PROXY # declare -p HTTP_PROXY
$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)
# Run unit tests and determine test coverage # Run unit tests and determine test coverage
coverage: test/assets gt4histocr-calamari coverage: test/assets gt4histocr-calamari1
coverage erase coverage erase
make test PYTHON="coverage run" make test PYTHON="coverage run"
coverage report coverage report

@ -43,8 +43,8 @@ pip install .
Download models trained on GT4HistOCR data: Download models trained on GT4HistOCR data:
``` ```
make gt4histocr-calamari make gt4histocr-calamari1
ls gt4histocr-calamari ls gt4histocr-calamari1
``` ```
## Example Usage ## Example Usage
@ -52,7 +52,7 @@ Before using `ocrd-calamari-recognize` get some example data and model, and
prepare the document for OCR: prepare the document for OCR:
``` ```
# Download model and example data # Download model and example data
make gt4histocr-calamari make gt4histocr-calamari1
make actevedef_718448162 make actevedef_718448162
# Create binarized images and line segmentation using other OCR-D projects # Create binarized images and line segmentation using other OCR-D projects
@ -64,7 +64,7 @@ ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
Finally recognize the text using ocrd_calamari and the downloaded model: Finally recognize the text using ocrd_calamari and the downloaded model:
``` ```
ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari1/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
``` ```
You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions

@ -70,13 +70,16 @@ class CalamariRecognize(Processor):
textlines = region.get_TextLine() textlines = region.get_TextLine()
log.info("About to recognize %i lines of region '%s'", len(textlines), region.id) log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
for (line_no, line) in enumerate(textlines):
log.debug("Recognizing line '%s' in region '%s'", line.id, region.id)
line_images_np = []
for (line_no, line) in enumerate(textlines):
line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh)
line_image_np = np.array(line_image, dtype=np.uint8) line_image_np = np.array(line_image, dtype=np.uint8)
line_images_np.append(line_image_np)
raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False)
for line, raw_results in zip(textlines, raw_results_all):
raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]
for i, p in enumerate(raw_results): for i, p in enumerate(raw_results):
p.prediction.id = "fold_{}".format(i) p.prediction.id = "fold_{}".format(i)

@ -1,5 +1,5 @@
tensorflow-gpu == 1.15.* tensorflow >= 2.3.0rc2
calamari-ocr == 0.3.5 calamari-ocr == 1.0.*
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
click click
ocrd >= 2.13.0 ocrd >= 2.13.0

@ -14,7 +14,7 @@ from .base import assets
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml') METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
WORKSPACE_DIR = '/tmp/test-ocrd-calamari' WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json') CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari1/*.ckpt.json')
# Because XML namespace versions are so much fun, we not only use one, we use TWO! # Because XML namespace versions are so much fun, we not only use one, we use TWO!
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }

Loading…
Cancel
Save