From 8d66469621a7a6a9d3d3509c0ac94ca2bf280731 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 24 Jun 2019 12:34:08 +0200 Subject: [PATCH] Binarize images before segmenting --- my_ocrd_workflow | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/my_ocrd_workflow b/my_ocrd_workflow index df5f599..bfb84aa 100755 --- a/my_ocrd_workflow +++ b/my_ocrd_workflow @@ -20,6 +20,12 @@ remove_filegrp() { # XXX This should also delete the files (after checking if they are indeed inside the workspace) and the directory } +do_binarization() { + remove_filegrp OCR-D-IMG-BIN mets.xml + ocrd-kraken-binarize -l $LOG_LEVEL \ + -m mets.xml -I OCR-D-IMG -O OCR-D-IMG-BIN +} + do_fontident() { ocrd_typegroups_classifier_parameters=' { @@ -43,12 +49,12 @@ do_linesegmentation() { remove_filegrp OCR-D-SEG-REGION mets.xml remove_filegrp OCR-D-SEG-LINE mets.xml #ocrd-ocropy-segment -l $LOG_LEVEL \ - # -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE + # -m mets.xml -I OCR-D-IMG-BIN -O OCR-D-SEG-LINE # XXX ocrd-ocropy-segment throws an exception for buerger_gedichte_1778.ocrd #ocrd workspace validate mets.xml ocrd-tesserocr-segment-region -l $LOG_LEVEL \ - -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-REGION + -m mets.xml -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION #ocrd workspace validate mets.xml ocrd-tesserocr-segment-line -l $LOG_LEVEL \ -m mets.xml -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE @@ -103,8 +109,9 @@ page_fix_image_references() { } -# TODO Binarization do_fontident + +do_binarization do_linesegmentation do_ocr page_fix_xml OCR-D-OCR-TESS