Switch line segmentation in my_ocrd_workflow to sbb_textline_detector.

NOTE(review): this patch was recovered from a copy whose line breaks and
leading whitespace had been collapsed. Indentation and the position of blank
context lines were reconstructed from the hunk line counts and may differ
from the target tree -- verify them (or apply with whitespace tolerance)
before use. The "index" blob ids of my_ocrd_workflow and requirements.txt
are kept from the recovered text and no longer describe the result, because
the added lines below were revised:

  * do_linesegmentation_sbb: "$LOG_LEVEL" is quoted.
  * page_fix_image_references_to_bin: the OCR-D-IMG / OCR-D-IMG-BIN lookups
    are done once instead of once per PAGE file; imgs, imgs_bin and i are
    local; a size mismatch between the two lists is reported and returns 1
    instead of writing empty image references; "$file" is quoted.
  * page_upgrade_to_2019: filegrp is local, expansions are quoted, and the
    function has a purpose comment.
  * requirements.txt: the comment names the package as sbb_textline_detector.

diff --git a/Dockerfile b/Dockerfile
index 354bb21..8372f61 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,6 +17,8 @@ RUN apt-get update && \
     cmake libgif-dev libjpeg-dev libpng-dev libtiff-dev zlib1g-dev \
 # For clstm on Ubuntu 19.04:
     swig libeigen3-dev libpng-dev libprotobuf-dev \
+# For cv2:
+    libsm6 libxrender1 \
 # XML utils
     libxml2-utils \
     xmlstarlet \
@@ -53,6 +55,11 @@ COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_P
 
 RUN tesseract --list-langs
 
+# Copy over sbb_textline_detector
+COPY vendor vendor
+COPY data/textline_detection /var/lib/textline_detection
+
+
 COPY requirements.txt /tmp
 RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
 
diff --git a/build b/build
index 3b11557..5743595 100755
--- a/build
+++ b/build
@@ -7,6 +7,7 @@ set -e
   git annex upgrade
   git annex get calamari-models/GT4HistOCR/*.ckpt*
   git annex get tesseract-models/GT4HistOCR/*.traineddata
+  git annex get textline_detection/*.h5
 )
 
 docker build -t my_ocrd_workflow .
diff --git a/data b/data
index eb7412a..bcc1aec 160000
--- a/data
+++ b/data
@@ -1 +1 @@
-Subproject commit eb7412a1efbcba53567ec37237732e96e839dbe8
+Subproject commit bcc1aec082cb81c29668ffef3d04c51eaa866b5c
diff --git a/my_ocrd_workflow b/my_ocrd_workflow
index 3d41391..95bf205 100755
--- a/my_ocrd_workflow
+++ b/my_ocrd_workflow
@@ -59,7 +59,7 @@ do_fontident() {
   # any DEFAULT, yet -I DEFAULT seems to work for ocrd-typegroups-classifier
 }
 
-do_linesegmentation() {
+do_linesegmentation_tesserocr() {
   # Segment the lines in the binarized images
 
   remove_filegrp OCR-D-SEG-REGION mets.xml
@@ -76,6 +76,16 @@ do_linesegmentation() {
   # XXX compare ocrd-tesserocr-segment* vs tesseract native
 }
 
+do_linesegmentation_sbb() {
+  # Segment the lines in the images
+
+  remove_filegrp OCR-D-SEG-REGION mets.xml
+  remove_filegrp OCR-D-SEG-LINE mets.xml
+  ocrd_sbb_textline_detector -l "$LOG_LEVEL" \
+    -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE \
+    -p '{"model": "/var/lib/textline_detection"}'
+}
+
 do_ocr() {
   # Perform OCR on the segmented lines
 
@@ -123,16 +133,33 @@ page_fix_image_references() {
   done
 }
 
-page_workaround_remove_conf() {
-  # XXX Work around https://github.com/OCR-D/core/issues/269
+page_fix_image_references_to_bin() {
+  # Make image references point to the binarized images
+  # XXX This is a hack, it is probably better to use alternative images in ocrd_calamari
 
   filegrp=$1
 
   local file
+  local i
+  local -a imgs imgs_bin
+
+  # Arrays with filenames to the images; they do not depend on $file, so
+  # they are looked up once instead of once per PAGE file
+  imgs=(`ocrd workspace find -G OCR-D-IMG`)
+  imgs_bin=(`ocrd workspace find -G OCR-D-IMG-BIN`)
+
+  # The replacement below pairs the two lists by index, so refuse to run on
+  # lists of different length instead of writing empty image references
+  if [ "${#imgs[@]}" -ne "${#imgs_bin[@]}" ]; then
+    echo "page_fix_image_references_to_bin: OCR-D-IMG and OCR-D-IMG-BIN differ in size" >&2
+    return 1
+  fi
+
   for file in `ocrd workspace find -G $filegrp`; do
-    xmlstarlet ed --inplace \
-      -N 'page=http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15' \
-      -d '//page:TextEquiv/@conf' $file
+    # Change all image references to point to the corresponding binarized image
+    for i in "${!imgs[@]}"; do
+      sed -i "s!imageFilename=.${imgs[$i]}.!imageFilename=\"${imgs_bin[$i]}\"!g" "$file"
+    done
   done
 }
 
@@ -146,6 +173,15 @@ page_downgrade_to_2018() {
   done
 }
 
+page_upgrade_to_2019() {
+  # Rewrite every pagecontent/<version> reference in the files of the given file group to 2019-07-15
+  local filegrp=$1
+
+  local file
+  for file in `ocrd workspace find -G "$filegrp"`; do
+    sed -i 's#pagecontent/[0-9-]*#pagecontent/2019-07-15#g' "$file"
+  done
+}
 
 pip3 list
 
@@ -158,9 +194,11 @@ do_binarization
 
 do_validate
 
-do_linesegmentation
-page_validate_xml OCR-D-SEG-REGION
-page_validate_xml OCR-D-SEG-LINE
+do_linesegmentation_sbb
+page_fix_image_references_to_bin OCR-D-SEG-LINE
+page_upgrade_to_2019             OCR-D-SEG-LINE
+page_validate_xml                OCR-D-SEG-REGION
+page_validate_xml                OCR-D-SEG-LINE
 
 do_validate
 
diff --git a/requirements.txt b/requirements.txt
index 94fdbaf..42fe20e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
-Pillow==5.4.1  # See https://github.com/OCR-D/core/issues/325
+tensorflow-gpu < 2.0  # Needed for sbb_textline_detector
+Pillow==5.4.1         # See https://github.com/OCR-D/core/issues/325
+
 ocrd >= 1.0.0b19
 https://github.com/mikegerber/ocrd_typegroups_classifier/archive/fix/pass-down-page-id.tar.gz  # XXX git+https://github.com/seuretm/ocrd_typegroups_classifier.git
 
@@ -9,4 +11,6 @@ ocrd_tesserocr
 
 https://github.com/mikegerber/ocrd_calamari/archive/3e8c1ac.tar.gz
 
+vendor/sbb_textline_detector-b1663f7.tar
+
 https://github.com/qurator-spk/dinglehopper/archive/c305539.tar.gz
diff --git a/vendor/sbb_textline_detector-b1663f7.tar b/vendor/sbb_textline_detector-b1663f7.tar
new file mode 100644
index 0000000..2a40482
Binary files /dev/null and b/vendor/sbb_textline_detector-b1663f7.tar differ