mirror of
https://github.com/mikegerber/ocrd_calamari.git
synced 2025-06-09 19:59:53 +02:00
Merge branch 'feat/update-calamari1'
This commit is contained in:
commit
8fcd331fbd
5 changed files with 20 additions and 17 deletions
14
Makefile
14
Makefile
|
@ -11,7 +11,7 @@ help:
|
||||||
@echo " Targets"
|
@echo " Targets"
|
||||||
@echo ""
|
@echo ""
|
||||||
@echo " install Install ocrd_calamari"
|
@echo " install Install ocrd_calamari"
|
||||||
@echo " gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)"
|
@echo " gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
|
||||||
@echo " actevedef_718448162 Download example data"
|
@echo " actevedef_718448162 Download example data"
|
||||||
@echo " deps-test Install testing python deps via pip"
|
@echo " deps-test Install testing python deps via pip"
|
||||||
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
|
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
|
||||||
|
@ -34,10 +34,10 @@ install:
|
||||||
|
|
||||||
|
|
||||||
# Get GT4HistOCR Calamari model (from SBB)
|
# Get GT4HistOCR Calamari model (from SBB)
|
||||||
gt4histocr-calamari:
|
gt4histocr-calamari1:
|
||||||
mkdir gt4histocr-calamari
|
mkdir -p gt4histocr-calamari1
|
||||||
cd gt4histocr-calamari && \
|
cd gt4histocr-calamari1 && \
|
||||||
wget https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz && \
|
wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
|
||||||
tar xfv model.tar.xz && \
|
tar xfv model.tar.xz && \
|
||||||
rm model.tar.xz
|
rm model.tar.xz
|
||||||
|
|
||||||
|
@ -73,12 +73,12 @@ assets-clean:
|
||||||
rm -rf test/assets
|
rm -rf test/assets
|
||||||
|
|
||||||
# Run unit tests
|
# Run unit tests
|
||||||
test: test/assets gt4histocr-calamari
|
test: test/assets gt4histocr-calamari1
|
||||||
# declare -p HTTP_PROXY
|
# declare -p HTTP_PROXY
|
||||||
$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)
|
$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)
|
||||||
|
|
||||||
# Run unit tests and determine test coverage
|
# Run unit tests and determine test coverage
|
||||||
coverage: test/assets gt4histocr-calamari
|
coverage: test/assets gt4histocr-calamari1
|
||||||
coverage erase
|
coverage erase
|
||||||
make test PYTHON="coverage run"
|
make test PYTHON="coverage run"
|
||||||
coverage report
|
coverage report
|
||||||
|
|
|
@ -43,8 +43,8 @@ pip install .
|
||||||
Download models trained on GT4HistOCR data:
|
Download models trained on GT4HistOCR data:
|
||||||
|
|
||||||
```
|
```
|
||||||
make gt4histocr-calamari
|
make gt4histocr-calamari1
|
||||||
ls gt4histocr-calamari
|
ls gt4histocr-calamari1
|
||||||
```
|
```
|
||||||
|
|
||||||
## Example Usage
|
## Example Usage
|
||||||
|
@ -52,7 +52,7 @@ Before using `ocrd-calamari-recognize` get some example data and model, and
|
||||||
prepare the document for OCR:
|
prepare the document for OCR:
|
||||||
```
|
```
|
||||||
# Download model and example data
|
# Download model and example data
|
||||||
make gt4histocr-calamari
|
make gt4histocr-calamari1
|
||||||
make actevedef_718448162
|
make actevedef_718448162
|
||||||
|
|
||||||
# Create binarized images and line segmentation using other OCR-D projects
|
# Create binarized images and line segmentation using other OCR-D projects
|
||||||
|
@ -64,7 +64,7 @@ ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
|
||||||
|
|
||||||
Finally recognize the text using ocrd_calamari and the downloaded model:
|
Finally recognize the text using ocrd_calamari and the downloaded model:
|
||||||
```
|
```
|
||||||
ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
|
ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari1/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
|
||||||
```
|
```
|
||||||
|
|
||||||
You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
|
You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
|
||||||
|
|
|
@ -70,13 +70,16 @@ class CalamariRecognize(Processor):
|
||||||
|
|
||||||
textlines = region.get_TextLine()
|
textlines = region.get_TextLine()
|
||||||
log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
|
log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
|
||||||
for (line_no, line) in enumerate(textlines):
|
|
||||||
log.debug("Recognizing line '%s' in region '%s'", line.id, region.id)
|
|
||||||
|
|
||||||
|
line_images_np = []
|
||||||
|
for (line_no, line) in enumerate(textlines):
|
||||||
line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh)
|
line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh)
|
||||||
line_image_np = np.array(line_image, dtype=np.uint8)
|
line_image_np = np.array(line_image, dtype=np.uint8)
|
||||||
|
line_images_np.append(line_image_np)
|
||||||
|
raw_results_all = self.predictor.predict_raw(line_images_np, progress_bar=False)
|
||||||
|
|
||||||
|
for line, raw_results in zip(textlines, raw_results_all):
|
||||||
|
|
||||||
raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]
|
|
||||||
for i, p in enumerate(raw_results):
|
for i, p in enumerate(raw_results):
|
||||||
p.prediction.id = "fold_{}".format(i)
|
p.prediction.id = "fold_{}".format(i)
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
tensorflow-gpu == 1.15.*
|
tensorflow >= 2.3.0rc2
|
||||||
calamari-ocr == 0.3.5
|
calamari-ocr == 1.0.*
|
||||||
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
|
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
|
||||||
click
|
click
|
||||||
ocrd >= 2.13.0
|
ocrd >= 2.13.0
|
||||||
|
|
|
@ -14,7 +14,7 @@ from .base import assets
|
||||||
|
|
||||||
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
|
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
|
||||||
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
|
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
|
||||||
CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
|
CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari1/*.ckpt.json')
|
||||||
|
|
||||||
# Because XML namespace versions are so much fun, we not only use one, we use TWO!
|
# Because XML namespace versions are so much fun, we not only use one, we use TWO!
|
||||||
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
|
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue