commit 34542c55ea142b8383684f6d903faedcff5837a2
Author: Gerber, Mike
Date:   Thu Dec 5 17:48:45 2019 +0100

    ✨ calamari-models/train-calamari-gt4histocr: Move from my personal experiments repo

diff --git a/data b/data
new file mode 160000
index 0000000..f817209
--- /dev/null
+++ b/data
@@ -0,0 +1 @@
+Subproject commit f817209ba765464adb132a132774ea7856d53f4e
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..21daa14
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+calamari-ocr==0.3.5
+tensorflow-gpu==1.13.1
diff --git a/train.sh b/train.sh
new file mode 100755
index 0000000..7a1d33b
--- /dev/null
+++ b/train.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+#
+# Train a 5-fold Calamari OCR model on the GT4HistOCR corpus.
+#
+# Fetches the corpus tarballs via git-annex, unpacks them into a private
+# scratch directory, drops one known-broken sample and runs
+# calamari-cross-fold-train. Models and a timestamped log are written to
+# data/calamari-models/GT4HistOCR. Paths are relative, so run this from the
+# directory that contains the "data" submodule.
+
+set -eu
+
+# Private scratch directory, removed again on exit. No blanket "rm -rf" of
+# /tmp/train-calamari-gt4histocr.* here: that would also wipe the data of a
+# run still in progress. Deliberately not named TMPDIR, so the environment
+# variable of that name is only read here, never overwritten.
+workdir=$(mktemp -d "${TMPDIR:-/tmp}/train-calamari-gt4histocr.XXXXXX")
+cleanup() { rm -rf -- "$workdir"; }
+trap cleanup EXIT
+trap 'exit 1' HUP INT TERM
+
+echo "Unpacking dataset tar files to $workdir"
+(cd data && git annex get GT4HistOCR/corpus/*.tar.bz2)
+for tar in data/GT4HistOCR/corpus/*.tar.bz2; do
+  # An unmatched glob stays literal; report that instead of a tar error.
+  [ -e "$tar" ] || { echo "Missing dataset archive: $tar" >&2; exit 1; }
+  tar xf "$tar" -C "$workdir"
+done
+echo "Removing dta19/1882-keller_sinngedicht/04970.nrm.png (Broken PNG)"
+rm -f "$workdir"/dta19/1882-keller_sinngedicht/04970.*
+
+export PYTHONUNBUFFERED=1  # For python + tee
+
+outdir=data/calamari-models/GT4HistOCR
+mkdir -p "$outdir"
+
+# POSIX sh has no pipefail: the trainer's exit status is handed around tee
+# through a file so that a failed training makes this script fail too.
+status_file=$workdir/exit-status
+{
+  rc=0
+  # The --files pattern is quoted on purpose, so the shell passes it on
+  # unexpanded (presumably calamari expands it itself -- TODO confirm).
+  calamari-cross-fold-train \
+      --files \
+        "$workdir/*/*/*.png" \
+      --best_models_dir "$outdir" \
+      --early_stopping_frequency=0.25 \
+      --early_stopping_nbest=5 \
+      --batch_size=128 \
+      --n_folds=5 \
+      --max_parallel_models=1 \
+      --display=0.01 \
+      2>&1 || rc=$?
+  echo "$rc" > "$status_file"
+} | tee "$outdir/train.$(date -Iminutes).log"
+rc=$(cat "$status_file") || rc=1
+exit "$rc"