From 7025d960b41bac09f555e36201232ea005717ac0 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 21 Oct 2019 17:04:06 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Use=20ocrd=5Folena=20for=20binariza?= =?UTF-8?q?tion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 13 ++++++++++++- my_ocrd_workflow | 7 ++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index c238dff..b853c48 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,11 +15,13 @@ RUN apt-get update && \ python3-pip \ git \ # For leptonica/tesseract: - cmake libgif-dev libjpeg-dev libpng-dev libtiff-dev zlib1g-dev \ + cmake libgif-dev libjpeg-dev libpng-dev libtiff-dev zlib1g-dev libpango1.0-dev \ # For clstm on Ubuntu 19.04: swig libeigen3-dev libpng-dev libprotobuf-dev \ # For cv2: libsm6 libxrender1 \ +# For ocrd_olena: + wget graphviz imagemagick libmagick++-dev libgraphicsmagick++1-dev libboost-dev \ # XML utils libxml2-utils \ xmlstarlet \ @@ -47,6 +49,15 @@ RUN curl -sSL -O https://github.com/tesseract-ocr/tessdata_best/archive/$TESSDAT rm -rf $TESSDATA_BEST_VERSION.tar.gz +# Build ocrd_olena +RUN curl -sSL -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/fde4436.tar.gz && \ + mkdir ocrd_olena && \ + tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ + cd ocrd_olena && \ + make install PREFIX=/usr/local && \ + cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz + + # Copy OCR models RUN mkdir -p /var/lib/calamari-models COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR diff --git a/my_ocrd_workflow b/my_ocrd_workflow index 112d209..ee0cbff 100755 --- a/my_ocrd_workflow +++ b/my_ocrd_workflow @@ -36,8 +36,9 @@ do_binarization() { # Binarize the images remove_filegrp OCR-D-IMG-BIN mets.xml - ocrd-kraken-binarize -l $LOG_LEVEL \ - -m mets.xml -I OCR-D-IMG -O OCR-D-IMG-BIN + ocrd-olena-binarize -l $LOG_LEVEL \ + -m mets.xml -I OCR-D-IMG -O OCR-D-IMG-BIN \ + -p <(echo '{"impl": "sauvola-ms-split"}') } do_fontident() { @@ -143,7 +144,7 @@ page_fix_image_references_to_bin() { for file in `ocrd workspace find -G $filegrp`; do # Arrays with filenames to the images imgs=(`ocrd workspace find -G OCR-D-IMG`) - imgs_bin=(`ocrd workspace find -G OCR-D-IMG-BIN`) + imgs_bin=(`ocrd workspace find -G OCR-D-IMG-BIN -m image/png`) # Change all image references to point to the corresponding binarized image for i in ${!imgs[@]}; do