mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-06-08 22:29:56 +02:00
✨ Use sbb_textline_detector to segment lines
This commit is contained in:
parent
735e9599d7
commit
6454d20998
6 changed files with 49 additions and 11 deletions
|
@ -17,6 +17,8 @@ RUN apt-get update && \
|
||||||
cmake libgif-dev libjpeg-dev libpng-dev libtiff-dev zlib1g-dev \
|
cmake libgif-dev libjpeg-dev libpng-dev libtiff-dev zlib1g-dev \
|
||||||
# For clstm on Ubuntu 19.04:
|
# For clstm on Ubuntu 19.04:
|
||||||
swig libeigen3-dev libpng-dev libprotobuf-dev \
|
swig libeigen3-dev libpng-dev libprotobuf-dev \
|
||||||
|
# For cv2:
|
||||||
|
libsm6 libxrender1 \
|
||||||
# XML utils
|
# XML utils
|
||||||
libxml2-utils \
|
libxml2-utils \
|
||||||
xmlstarlet \
|
xmlstarlet \
|
||||||
|
@ -53,6 +55,11 @@ COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_P
|
||||||
RUN tesseract --list-langs
|
RUN tesseract --list-langs
|
||||||
|
|
||||||
|
|
||||||
|
# Copy over sbb_textline_detector
|
||||||
|
COPY vendor vendor
|
||||||
|
COPY data/textline_detection /var/lib/textline_detection
|
||||||
|
|
||||||
|
|
||||||
COPY requirements.txt /tmp
|
COPY requirements.txt /tmp
|
||||||
RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
|
RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
|
||||||
|
|
||||||
|
|
1
build
1
build
|
@ -7,6 +7,7 @@ set -e
|
||||||
git annex upgrade
|
git annex upgrade
|
||||||
git annex get calamari-models/GT4HistOCR/*.ckpt*
|
git annex get calamari-models/GT4HistOCR/*.ckpt*
|
||||||
git annex get tesseract-models/GT4HistOCR/*.traineddata
|
git annex get tesseract-models/GT4HistOCR/*.traineddata
|
||||||
|
git annex get textline_detection/*.h5
|
||||||
)
|
)
|
||||||
|
|
||||||
docker build -t my_ocrd_workflow .
|
docker build -t my_ocrd_workflow .
|
||||||
|
|
2
data
2
data
|
@ -1 +1 @@
|
||||||
Subproject commit eb7412a1efbcba53567ec37237732e96e839dbe8
|
Subproject commit bcc1aec082cb81c29668ffef3d04c51eaa866b5c
|
|
@ -59,7 +59,7 @@ do_fontident() {
|
||||||
# any DEFAULT, yet -I DEFAULT seems to work for ocrd-typegroups-classifier
|
# any DEFAULT, yet -I DEFAULT seems to work for ocrd-typegroups-classifier
|
||||||
}
|
}
|
||||||
|
|
||||||
do_linesegmentation() {
|
do_linesegmentation_tesserocr() {
|
||||||
# Segment the lines in the binarized images
|
# Segment the lines in the binarized images
|
||||||
|
|
||||||
remove_filegrp OCR-D-SEG-REGION mets.xml
|
remove_filegrp OCR-D-SEG-REGION mets.xml
|
||||||
|
@ -76,6 +76,16 @@ do_linesegmentation() {
|
||||||
# XXX compare ocrd-tesserocr-segment* vs tesseract native
|
# XXX compare ocrd-tesserocr-segment* vs tesseract native
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Segment text lines by running ocrd_sbb_textline_detector on the images in
# the OCR-D-IMG file group.
#
# Globals:
#   LOG_LEVEL (read) - passed as the value of the processor's -l option
# Arguments:
#   none
# Effects:
#   Calls remove_filegrp for OCR-D-SEG-REGION and OCR-D-SEG-LINE on mets.xml
#   (presumably to clear results of an earlier run - confirm against
#   remove_filegrp), then runs the detector with OCR-D-SEG-LINE as output
#   group and /var/lib/textline_detection as model directory.
do_linesegmentation_sbb() {
  # Segment the lines in the images

  remove_filegrp OCR-D-SEG-REGION mets.xml
  remove_filegrp OCR-D-SEG-LINE mets.xml

  # LOG_LEVEL is quoted so that an empty or multi-word value stays a single
  # argument and cannot shift the options that follow -l.
  ocrd_sbb_textline_detector -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE \
    -p '{"model": "/var/lib/textline_detection"}'
}
|
||||||
|
|
||||||
do_ocr() {
|
do_ocr() {
|
||||||
# Perform OCR on the segmented lines
|
# Perform OCR on the segmented lines
|
||||||
|
|
||||||
|
@ -123,16 +133,22 @@ page_fix_image_references() {
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
page_workaround_remove_conf() {
|
page_fix_image_references_to_bin() {
|
||||||
# XXX Work around https://github.com/OCR-D/core/issues/269
|
# Make image references point to the binarized images
|
||||||
|
# XXX This is a hack, it is probably better to use alternative images in ocrd_calamari
|
||||||
|
|
||||||
filegrp=$1
|
filegrp=$1
|
||||||
|
|
||||||
local file
|
local file
|
||||||
for file in `ocrd workspace find -G $filegrp`; do
|
for file in `ocrd workspace find -G $filegrp`; do
|
||||||
xmlstarlet ed --inplace \
|
# Arrays with filenames to the images
|
||||||
-N 'page=http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15' \
|
imgs=(`ocrd workspace find -G OCR-D-IMG`)
|
||||||
-d '//page:TextEquiv/@conf' $file
|
imgs_bin=(`ocrd workspace find -G OCR-D-IMG-BIN`)
|
||||||
|
|
||||||
|
# Change all image references to point to the corresponding binarized image
|
||||||
|
for i in ${!imgs[@]}; do
|
||||||
|
sed -i "s!imageFilename=.${imgs[$i]}.!imageFilename=\"${imgs_bin[$i]}\"!g" $file
|
||||||
|
done
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -146,6 +162,14 @@ page_downgrade_to_2018() {
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Rewrite the PAGE schema version in every file of a file group to
# 2019-07-15, editing the files in place.
#
# Arguments:
#   $1 - file group passed to `ocrd workspace find -G`
# Notes:
#   Any "pagecontent/<digits and dashes>" occurrence in a file is replaced,
#   whatever version it named before.
#   NOTE(review): assumes `ocrd workspace find` prints one path per line -
#   confirm against the ocrd CLI.
page_upgrade_to_2019() {
  # Keep both variables local so the caller's globals are not clobbered.
  local filegrp=$1
  local file

  # Read line by line instead of word-splitting the command output, so paths
  # containing spaces or glob characters reach sed unchanged.
  while IFS= read -r file; do
    [ -n "$file" ] || continue
    sed -i 's#pagecontent/[0-9-]*#pagecontent/2019-07-15#g' -- "$file"
  done < <(ocrd workspace find -G "$filegrp")
}
|
||||||
|
|
||||||
pip3 list
|
pip3 list
|
||||||
|
|
||||||
|
@ -158,7 +182,9 @@ do_binarization
|
||||||
do_validate
|
do_validate
|
||||||
|
|
||||||
|
|
||||||
do_linesegmentation
|
do_linesegmentation_sbb
|
||||||
|
page_fix_image_references_to_bin OCR-D-SEG-LINE
|
||||||
|
page_upgrade_to_2019 OCR-D-SEG-LINE
|
||||||
page_validate_xml OCR-D-SEG-REGION
|
page_validate_xml OCR-D-SEG-REGION
|
||||||
page_validate_xml OCR-D-SEG-LINE
|
page_validate_xml OCR-D-SEG-LINE
|
||||||
do_validate
|
do_validate
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
|
tensorflow-gpu < 2.0 # Needed for sbb_textline_detector
|
||||||
Pillow==5.4.1 # See https://github.com/OCR-D/core/issues/325
|
Pillow==5.4.1 # See https://github.com/OCR-D/core/issues/325
|
||||||
|
|
||||||
ocrd >= 1.0.0b19
|
ocrd >= 1.0.0b19
|
||||||
|
|
||||||
https://github.com/mikegerber/ocrd_typegroups_classifier/archive/fix/pass-down-page-id.tar.gz # XXX git+https://github.com/seuretm/ocrd_typegroups_classifier.git
|
https://github.com/mikegerber/ocrd_typegroups_classifier/archive/fix/pass-down-page-id.tar.gz # XXX git+https://github.com/seuretm/ocrd_typegroups_classifier.git
|
||||||
|
@ -9,4 +11,6 @@ ocrd_tesserocr
|
||||||
|
|
||||||
https://github.com/mikegerber/ocrd_calamari/archive/3e8c1ac.tar.gz
|
https://github.com/mikegerber/ocrd_calamari/archive/3e8c1ac.tar.gz
|
||||||
|
|
||||||
|
vendor/sbb_textline_detector-b1663f7.tar
|
||||||
|
|
||||||
https://github.com/qurator-spk/dinglehopper/archive/c305539.tar.gz
|
https://github.com/qurator-spk/dinglehopper/archive/c305539.tar.gz
|
||||||
|
|
BIN
vendor/sbb_textline_detector-b1663f7.tar
vendored
Normal file
BIN
vendor/sbb_textline_detector-b1663f7.tar
vendored
Normal file
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue