|
|
|
@ -40,18 +40,24 @@ do_fontident() {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
do_linesegmentation() {
# Segmentation step operating on mets.xml in the current directory.
#
# What the visible lines do, in order:
#   - call remove_filegrp (defined outside this view) for the file groups
#     OCR-D-SEG-REGION and OCR-D-SEG-LINE;
#   - invoke segmentation processors with "-l $LOG_LEVEL -m mets.xml":
#       ocrd-ocropy-segment            -I OCR-D-IMG        -O OCR-D-SEG-LINE
#       ocrd-tesserocr-segment-region  -I OCR-D-IMG        -O OCR-D-SEG-REGION
#       ocrd-tesserocr-segment-line    -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
#   - delete regular files named OCR-D-IMG* that sit directly in the
#     current directory.
#
# Takes no arguments; reads $LOG_LEVEL, which is set outside this view.
#
# NOTE(review): this text looks like a rendered diff in which pre-change and
# post-change lines are interleaved (the ocrd-ocropy-segment call appears both
# active and commented out, and "Clean it up." appears twice). Presumably only
# one of the two variants belongs to the current script -- TODO confirm
# against the real file before relying on the list above.
|
|
|
|
|
remove_filegrp OCR-D-SEG-REGION mets.xml
|
|
|
|
|
remove_filegrp OCR-D-SEG-LINE mets.xml
|
|
|
|
|
ocrd-ocropy-segment -l $LOG_LEVEL \
|
|
|
|
|
-m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE
|
|
|
|
|
#ocrd-ocropy-segment -l $LOG_LEVEL \
|
|
|
|
|
# -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE
|
|
|
|
|
# XXX ocrd-ocropy-segment throws an exception for buerger_gedichte_1778.ocrd
|
|
|
|
|
#ocrd workspace validate mets.xml
|
|
|
|
|
|
|
|
|
|
ocrd-tesserocr-segment-region -l $LOG_LEVEL \
|
|
|
|
|
-m mets.xml -I OCR-D-IMG -O OCR-D-SEG-REGION
|
|
|
|
|
#ocrd workspace validate mets.xml
|
|
|
|
|
ocrd-tesserocr-segment-line -l $LOG_LEVEL \
|
|
|
|
|
-m mets.xml -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
|
|
|
|
|
#ocrd workspace validate mets.xml
|
|
|
|
|
# XXX compare ocrd-tesserocr-segment* vs tesseract native
|
|
|
|
|
|
|
|
|
|
# XXX This leaves copies of the images at the top level of the workspace, because it "downloads" the "remote" files.
|
|
|
|
|
# Clean it up.
|
|
|
|
|
# Clean it up. (Maybe only affects ocrd-ocropy-segment)
|
|
|
|
|
# -maxdepth 1 restricts the match to entries directly in the current
# directory and -type f to regular files, so same-named directories and
# anything in subdirectories are left alone; rm -v prints each removed file.
find . -maxdepth 1 -name "OCR-D-IMG*" -type f -exec rm -v {} \;
|
|
|
|
|
|
|
|
|
|
# XXX ocrd-tesserocr-segment-line does not seem to produce any line segmentation
|
|
|
|
|
# XXX mv {ocrd-ocropy-segment,-line}
|
|
|
|
|
# XXX ocrd-ocropy-segment throws an exception for buerger_gedichte_1778.ocrd
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
do_ocr() {
|
|
|
|
|