From 34542c55ea142b8383684f6d903faedcff5837a2 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Thu, 5 Dec 2019 17:48:45 +0100
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20calamari-models/train-calamari-gt4h?=
 =?UTF-8?q?istocr:=20Move=20from=20my=20personal=20experiments=20repo?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Note: train.sh was revised in this patch after export (strict mode, quoting,
comments); the abbreviated blob id on its "index" line still names the
pre-revision blob.

 data             |  1 +
 requirements.txt |  2 ++
 train.sh         | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+)
 create mode 160000 data
 create mode 100644 requirements.txt
 create mode 100755 train.sh

diff --git a/data b/data
new file mode 160000
index 0000000..f817209
--- /dev/null
+++ b/data
@@ -0,0 +1 @@
+Subproject commit f817209ba765464adb132a132774ea7856d53f4e
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..21daa14
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+calamari-ocr==0.3.5
+tensorflow-gpu==1.13.1
diff --git a/train.sh b/train.sh
new file mode 100755
index 0000000..7a1d33b
--- /dev/null
+++ b/train.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+#
+# Train Calamari OCR models on the GT4HistOCR corpus using cross-fold training.
+#
+# Must be started from the directory that contains the "data" submodule: the
+# corpus archives are fetched there via git-annex, and the best models and the
+# training log end up in data/calamari-models/GT4HistOCR.
+
+# Stop at the first failing step (or unset variable) instead of training on a
+# partially fetched or unpacked corpus.
+set -eu
+
+# Best effort: leftovers of earlier runs that cannot be removed must not block
+# this run, which unpacks into a fresh directory anyway.
+rm -rf /tmp/train-calamari-gt4histocr.* || true
+TMPDIR=$(mktemp -d /tmp/train-calamari-gt4histocr.XXXXX)
+
+echo "Unpacking dataset tar files to $TMPDIR"
+(cd data && git annex get GT4HistOCR/corpus/*.tar.bz2)
+for tar in data/GT4HistOCR/corpus/*.tar.bz2; do
+  tar xf "$tar" -C "$TMPDIR"
+done
+echo "Removing dta19/1882-keller_sinngedicht/04970.nrm.png (Broken PNG)"
+rm -f "$TMPDIR"/dta19/1882-keller_sinngedicht/04970.*
+
+export PYTHONUNBUFFERED=1 # For python + tee
+
+outdir=data/calamari-models/GT4HistOCR
+mkdir -p "$outdir"
+
+# The --files pattern stays quoted so the shell passes it on unexpanded.
+calamari-cross-fold-train \
+  --files \
+    "$TMPDIR/*/*/*.png" \
+  --best_models_dir "$outdir" \
+  --early_stopping_frequency=0.25 \
+  --early_stopping_nbest=5 \
+  --batch_size=128 \
+  --n_folds=5 \
+  --max_parallel_models=1 \
+  --display=0.01 \
+  2>&1 | tee "$outdir/train.$(date -Iminutes).log"