From 88c43b8c348edb40527d6e3531f2265295f8ec94 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Thu, 5 Dec 2019 18:16:43 +0100
Subject: [PATCH] calamari-models/train-calamari-gt4histocr: Update train.sh
 to use qurator_data_lib.sh

---
Notes (below the "---" line, so not part of the commit message):

Changes compared to the first version of this patch:
- train.sh: use #!/bin/bash, qurator_data_lib.sh needs bash ([[ ]], select)
- annex_get: the symlink check also catches symlinks that have content, and
  a failed check now makes annex_get fail
- handle_data: abort with exit status 1, only accept the offered choices
- download_to: use $DATA_SUBDIR, stop on errors, remove the temporary file

The commit and blob ids in the headers are those of the first version. The
blank context lines of the train.sh hunks had to be reconstructed, check
them against train.sh before applying.

 qurator_data_lib.sh | 147 ++++++++++++++++++++++++++++++++++++++++++++
 train.sh            |  32 +++++++++-
 2 files changed, 175 insertions(+), 4 deletions(-)
 create mode 100644 qurator_data_lib.sh

diff --git a/qurator_data_lib.sh b/qurator_data_lib.sh
new file mode 100644
index 0000000..29fade7
--- /dev/null
+++ b/qurator_data_lib.sh
@@ -0,0 +1,147 @@
+# Helper functions for getting data files into the $DATA_SUBDIR submodule.
+#
+# This file has to be sourced by bash: it uses [[ ]], select and arrays.
+# Before calling handle_data, the sourcing script must set DATA_SUBDIR and
+# define get_from_annex and (unless --no-download is used) get_from_web.
+
+# Check that $DATA_SUBDIR is a usable git annex submodule.
+# Prints a message for every problem found. Returns 0 if there is none, 1
+# otherwise.
+check_data_subdir() {
+  local result=0
+
+  if git submodule status "$DATA_SUBDIR" | grep -q '^-'; then
+    echo "$DATA_SUBDIR/ is not an initialized submodule"; result=1
+  fi
+  if ! [ -e "$DATA_SUBDIR/.git/annex" ]; then
+    echo "$DATA_SUBDIR/ is not a git annex repository"; result=1
+  fi
+  if ! (cd "$DATA_SUBDIR" && git annex version | grep -q 'local repository version: 7'); then
+    echo "$DATA_SUBDIR/ is not a git annex repository version 7"; result=1
+  fi
+  if ! (cd "$DATA_SUBDIR" && git remote | grep -q '^nfs$'); then
+    echo "$DATA_SUBDIR/ has no git remote 'nfs'"; result=1
+  fi
+
+  return $result
+}
+
+# Get the files matching a pattern from the annex.
+#
+# Usage: annex_get [--allow_symlinks] FILE_PATTERN
+#
+# FILE_PATTERN is a glob relative to $DATA_SUBDIR. Unless --allow_symlinks is
+# given, every matching file must be a regular file, not a symlink. Returns
+# non-zero if getting the files or this check fails.
+annex_get() {
+  local allow_symlinks=0
+  if [[ "$1" = '--allow_symlinks' ]]; then
+    allow_symlinks=1
+    shift
+  fi
+  local file_pattern="$1"
+
+  (
+    cd "$DATA_SUBDIR" || exit 1
+    # $file_pattern is left unquoted on purpose: the glob has to be expanded
+    # here, relative to $DATA_SUBDIR
+    git annex get $file_pattern || exit 1
+
+    # fsck seems to be necessary to fix the files if we are in a submodule
+    git annex fsck $file_pattern
+
+    # Check that there are no symlinks = only unlocked files. This is needed for
+    # Docker builds, as we cannot dereference symlinks in a Dockerfile COPY.
+    if [[ $allow_symlinks = 0 ]]; then
+      # The loop is part of a pipeline and so runs in its own subshell: its
+      # "exit 1" only ends the loop, the failure is passed on by "|| exit 1".
+      git ls-files $file_pattern | while IFS= read -r f; do
+        # -f alone follows symlinks, so test for a symlink explicitly
+        if [[ -L "$f" ]] || ! [[ -f "$f" ]]; then
+          echo "$DATA_SUBDIR/$f is not a regular file – Is an unlock needed?"
+          exit 1
+        fi
+      done || exit 1
+    fi
+  )
+}
+
+# Download a tar archive and unpack it below $DATA_SUBDIR.
+#
+# Usage: download_to DOWNLOAD_SOURCE UNPACK_TO
+#
+# DOWNLOAD_SOURCE is the URL of the archive, UNPACK_TO the directory to unpack
+# to, relative to $DATA_SUBDIR (it is created if missing). Returns non-zero if
+# the download or the unpacking fails.
+download_to() {
+  local download_source="$1"
+  local unpack_to="$2"
+
+  (
+    cd "$DATA_SUBDIR" || exit 1
+    tmpf=`mktemp 'tmp.XXXXX'` || exit 1
+    # Remove the temporary file in any case, also if a step below fails
+    trap 'rm -f "$tmpf"' EXIT
+    wget -O "$tmpf" "$download_source" || exit 1
+    mkdir -p "$unpack_to" || exit 1
+    # No compression option: GNU tar detects the compression on its own when
+    # reading an archive, -a only has an effect when creating one.
+    # XXX Might not work with every tar implementation?
+    tar -C "$unpack_to" -xvf "$tmpf"
+  )
+}
+
+# Print the commands that set up $DATA_SUBDIR as check_data_subdir expects it.
+suggest_commands() {
+  echo "Suggested commands:"
+  echo
+  echo "git submodule update --init"
+  echo "(cd $DATA_SUBDIR && git annex init --version=7)"
+  echo "(cd $DATA_SUBDIR && git remote add nfs /<... path to ...>/GitNX-Repository/qurator/qurator-data)"
+}
+
+# Make sure the data files are there, from the annex or from the web.
+#
+# Usage: handle_data [--no-download]
+#
+# Calls get_from_annex if check_data_subdir succeeds. Otherwise asks whether
+# to abort or, unless --no-download is given, to call get_from_web. Aborting
+# prints the suggested commands and exits with status 1.
+handle_data() {
+  local no_download=0
+  if [[ "$1" = '--no-download' ]]; then
+    no_download=1
+    shift
+  fi
+
+  if check_data_subdir; then
+    get_from_annex
+    return
+  fi
+
+  local choices=("Abort to manually fix $DATA_SUBDIR submodule")
+  if [[ $no_download = 0 ]]; then
+    choices+=("Download data files from the web")
+  fi
+
+  select choice in "${choices[@]}"; do
+    # Any other reply matches nothing, so that select asks again
+    case "$REPLY" in
+      1)
+        suggest_commands
+        exit 1
+        ;;
+      2)
+        if [[ $no_download = 0 ]]; then
+          get_from_web
+          return
+        fi
+        ;;
+    esac
+  done
+
+  # Only reached on end of input, i.e. without a choice: do not go on without
+  # the data
+  suggest_commands
+  exit 1
+}
diff --git a/train.sh b/train.sh
index 7a1d33b..d8e1eb8 100755
--- a/train.sh
+++ b/train.sh
@@ -1,11 +1,35 @@
-#!/bin/sh
+#!/bin/bash
+# Train a GT4HistOCR Calamari model
+# (or rather 5 for voted prediction)
+
+set -e
+
+self=`realpath "$0"`
+self_dir=`dirname "$self"`
+
+
+
+
+cd "$self_dir"
+DATA_SUBDIR=data
+get_from_annex() {
+  annex_get 'GT4HistOCR/corpus/*.tar.bz2'
+}
+get_from_web() {
+  download_to 'https://zenodo.org/record/1344132/files/GT4HistOCR.tar?download=1' 'GT4HistOCR'
+}
+. "$self_dir/qurator_data_lib.sh"
+handle_data
+
+
+
+
 
 rm -rf /tmp/train-calamari-gt4histocr.*
 
 TMPDIR=`mktemp -d /tmp/train-calamari-gt4histocr.XXXXX`
 echo "Unpacking dataset tar files to $TMPDIR"
-(cd data; git annex get GT4HistOCR/corpus/*.tar.bz2)
-for tar in data/GT4HistOCR/corpus/*.tar.bz2; do
+for tar in $DATA_SUBDIR/GT4HistOCR/corpus/*.tar.bz2; do
   tar xf $tar -C $TMPDIR
 done
 echo "Removing dta19/1882-keller_sinngedicht/04970.nrm.png (Broken PNG)"
@@ -13,7 +37,7 @@ rm -f $TMPDIR/dta19/1882-keller_sinngedicht/04970.*
 
 export PYTHONUNBUFFERED=1  # For python + tee
 
-outdir=data/calamari-models/GT4HistOCR
+outdir=$DATA_SUBDIR/calamari-models/GT4HistOCR
 mkdir -p $outdir
 
 calamari-cross-fold-train \