mirror of
https://github.com/qurator-spk/train-calamari-gt4histocr.git
synced 2025-06-09 11:50:01 +02:00
calamari-models/train-calamari-gt4histocr: Update train.sh to use qurator_data_lib.sh
This commit is contained in:
parent
34542c55ea
commit
88c43b8c34
2 changed files with 129 additions and 3 deletions
102
qurator_data_lib.sh
Normal file
102
qurator_data_lib.sh
Normal file
|
@ -0,0 +1,102 @@
|
|||
# Check that $DATA_SUBDIR is a usable git-annex checkout of the data repository.
#
# Globals:
#   DATA_SUBDIR (read) - path of the data submodule, relative to the current
#                        working directory
# Outputs:
#   One line on stdout for every check that fails.
# Returns:
#   0 if every check passes, 1 if at least one check fails.
check_data_subdir() {
  local result=0

  # Without a directory name none of the checks below are meaningful.
  if [[ -z "${DATA_SUBDIR:-}" ]]; then
    echo "DATA_SUBDIR is not set"
    return 1
  fi

  # A leading '-' in the status line is treated as "submodule not initialized".
  if git submodule status -- "$DATA_SUBDIR" | grep -q '^-'; then
    echo "$DATA_SUBDIR/ is not an initialized submodule"; result=1
  fi

  if ! [[ -e "$DATA_SUBDIR/.git/annex" ]]; then
    echo "$DATA_SUBDIR/ is not a git annex repository"; result=1
  fi

  # The remaining checks run in a subshell so the caller's cwd is untouched.
  if ! (cd -- "$DATA_SUBDIR" && git annex version | grep -q 'local repository version: 7'); then
    echo "$DATA_SUBDIR/ is not a git annex repository version 7"; result=1
  fi

  if ! (cd -- "$DATA_SUBDIR" && git remote | grep -q '^nfs$'); then
    echo "$DATA_SUBDIR/ has no git remote 'nfs'"; result=1
  fi

  return "$result"
}
|
||||
|
||||
# Fetch annexed files matching a pattern inside $DATA_SUBDIR.
#
# Usage: annex_get [--allow_symlinks] <file_pattern>
#
# Globals:
#   DATA_SUBDIR (read) - directory of the git-annex checkout
# Arguments:
#   --allow_symlinks - optional first argument; skips the regular-file check
#   file_pattern     - shell glob, expanded relative to $DATA_SUBDIR
# Outputs:
#   A message on stdout for the first matching file that is not a plain
#   regular file (unless --allow_symlinks was given).
# Returns:
#   Non-zero if $DATA_SUBDIR cannot be entered or if the regular-file check
#   fails; 0 otherwise.
annex_get() {
  local allow_symlinks=0
  if [[ "${1:-}" = '--allow_symlinks' ]]; then
    allow_symlinks=1
    shift
  fi
  local file_pattern="${1:-}"

  (
    # Never run git annex in the wrong directory.
    cd -- "$DATA_SUBDIR" || exit 1

    # $file_pattern is deliberately left unquoted below: it is a glob that
    # must be expanded here, relative to $DATA_SUBDIR.
    git annex get $file_pattern

    # fsck seems to be necessary to fix the files if we are in a submodule
    git annex fsck $file_pattern

    # Check that there are no symlinks = only unlocked files. This is needed for
    # Docker builds, as we cannot dereference symlinks in a Dockerfile COPY.
    if [[ $allow_symlinks = 0 ]]; then
      # The loop reads from a process substitution rather than a pipe so that
      # "exit 1" leaves this subshell and becomes annex_get's status.
      while IFS= read -r f; do
        # -f follows symlinks, so symlinks must be rejected explicitly.
        if [[ -L "$f" || ! -f "$f" ]]; then
          echo "$DATA_SUBDIR/$f is not a regular file – Is an unlock needed?"
          exit 1
        fi
      done < <(git ls-files $file_pattern)
    fi
  )
}
|
||||
|
||||
# Download a tar archive and unpack it below the data directory.
#
# Globals:
#   DATA_SUBDIR (read) - data directory; falls back to "data" when unset/empty
# Arguments:
#   $1 - URL to download
#   $2 - directory to unpack into, relative to the data directory
#        (created if missing)
# Outputs:
#   wget progress and the list of extracted files (tar -v).
# Returns:
#   Non-zero if the data directory cannot be entered or if creating the temp
#   file, downloading, or unpacking fails; 0 otherwise.
download_to() {
  local download_source="$1"
  local unpack_to="$2"

  (
    cd -- "${DATA_SUBDIR:-data}" || exit 1

    tmpf=$(mktemp 'tmp.XXXXX') || exit 1
    # Remove the temporary download on every exit path of this subshell.
    trap 'rm -f -- "$tmpf"' EXIT

    # Do not try to unpack a failed or partial download.
    wget -O "$tmpf" "$download_source" || exit 1
    mkdir -p -- "$unpack_to" || exit 1

    # No compression option is given: tar is expected to detect the
    # compression format on its own when reading an archive.
    tar -C "$unpack_to" -xvf "$tmpf" || exit 1
  )
}
|
||||
|
||||
# Print the commands a user can run to set up the data submodule by hand.
#
# Globals:
#   DATA_SUBDIR (read) - directory name inserted into the suggested commands
# Outputs:
#   The suggestions on stdout, one command per line.
suggest_commands() {
  local quoted_dir
  # Shell-escape the directory so the printed commands can be pasted as-is,
  # even if the name contains spaces; plain names are printed unchanged.
  printf -v quoted_dir '%q' "${DATA_SUBDIR:-}"

  printf '%s\n' \
    "Suggested commands:" \
    "" \
    "git submodule update --init" \
    "(cd $quoted_dir && git annex init --version=7)" \
    "(cd $quoted_dir && git remote add nfs /<... path to ...>/GitNX-Repository/qurator/qurator-data)"
}
|
||||
|
||||
# Make the data files available, from git-annex if possible, otherwise by
# asking the user what to do.
#
# Usage: handle_data [--no-download]
#
# The calling script must define get_from_annex and, unless --no-download is
# given, get_from_web.
#
# Globals:
#   DATA_SUBDIR (read) - shown in the menu text
# Arguments:
#   --no-download - optional; do not offer the web download
# Outputs:
#   check_data_subdir's messages; an interactive menu (via "select") when the
#   check fails.
# Returns:
#   The status of get_from_annex or get_from_web, or 1 if stdin ends before a
#   choice is made. Exits the shell with status 1 when the user chooses to
#   abort.
handle_data() {
  local no_download=0
  if [[ "${1:-}" = '--no-download' ]]; then
    no_download=1
    shift
  fi

  if check_data_subdir; then
    get_from_annex
    return
  fi

  local -a choices=("Abort to manually fix $DATA_SUBDIR submodule")
  if [[ $no_download = 0 ]]; then
    choices+=("Download data files from the web")
  fi

  local choice
  select choice in "${choices[@]}"; do
    # "select" leaves $choice empty for any reply that is not a listed number,
    # so mistyped input re-prompts instead of triggering a download.
    case "$choice" in
      '')
        echo "Invalid choice: $REPLY" >&2
        ;;
      "${choices[0]}")
        suggest_commands
        # Aborting is a failure: the data is still missing.
        exit 1
        ;;
      *)
        get_from_web
        return
        ;;
    esac
  done

  # Reaching this point means stdin hit end-of-file without a valid choice.
  echo "No choice made and no data available" >&2
  return 1
}
|
30
train.sh
30
train.sh
|
@ -1,11 +1,35 @@
|
|||
#!/bin/sh
|
||||
# Train a GT4HistOCR Calamari model
|
||||
# (or rather 5 for voted prediction)
|
||||
|
||||
set -e
|
||||
|
||||
self=`realpath $0`
|
||||
self_dir=`dirname "$self"`
|
||||
|
||||
|
||||
|
||||
|
||||
cd $self_dir
|
||||
DATA_SUBDIR=data
|
||||
# Fetch the GT4HistOCR corpus tarballs by passing their glob pattern to annex_get.
get_from_annex() {
|
||||
annex_get 'GT4HistOCR/corpus/*.tar.bz2'
|
||||
}
|
||||
# Download the GT4HistOCR tar file from zenodo.org and have download_to unpack it into 'GT4HistOCR'.
get_from_web() {
|
||||
download_to 'https://zenodo.org/record/1344132/files/GT4HistOCR.tar?download=1' 'GT4HistOCR'
|
||||
}
|
||||
. $self_dir/qurator_data_lib.sh
|
||||
handle_data
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
rm -rf /tmp/train-calamari-gt4histocr.*
|
||||
TMPDIR=`mktemp -d /tmp/train-calamari-gt4histocr.XXXXX`
|
||||
|
||||
echo "Unpacking dataset tar files to $TMPDIR"
|
||||
(cd data; git annex get GT4HistOCR/corpus/*.tar.bz2)
|
||||
for tar in data/GT4HistOCR/corpus/*.tar.bz2; do
|
||||
for tar in $DATA_SUBDIR/GT4HistOCR/corpus/*.tar.bz2; do
|
||||
tar xf $tar -C $TMPDIR
|
||||
done
|
||||
echo "Removing dta19/1882-keller_sinngedicht/04970.nrm.png (Broken PNG)"
|
||||
|
@ -13,7 +37,7 @@ rm -f $TMPDIR/dta19/1882-keller_sinngedicht/04970.*
|
|||
|
||||
export PYTHONUNBUFFERED=1 # For python + tee
|
||||
|
||||
outdir=data/calamari-models/GT4HistOCR
|
||||
outdir=$DATA_SUBDIR/calamari-models/GT4HistOCR
|
||||
mkdir -p $outdir
|
||||
|
||||
calamari-cross-fold-train \
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue