From 6ae85063c55035a95ef82e3d7ff77e2b5437caff Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Mon, 10 Feb 2020 19:25:08 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20Document=20do=5Fvalidate()=20opt?=
 =?UTF-8?q?ions=20better?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 my_ocrd_workflow | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/my_ocrd_workflow b/my_ocrd_workflow
index da2067e..6234d33 100755
--- a/my_ocrd_workflow
+++ b/my_ocrd_workflow
@@ -10,13 +10,27 @@ fi
 
 
 do_validate() {
+  # Validate the workspace
+
+  # Both ocrd_tesserocr + ocrd_calamari produce segment coordinates that are not strictly within their parent's
+  # coordinates:
+  #
+  # INCONSISTENCY in [...] coords [...] not within parent coords
+  #
+  # → --page-coordinate-consistency off
+  #
+  # ocrd_tesserocr sometimes produces segment text results that aren't concatenating as expected by the validator:
+  #
+  # INCONSISTENCY in [...]: text results '[...]' != concatenated '[...]'
+  #
+  # → --page-strictness lax
+  #
   validate_options='
     --skip dimension
     --skip pixel_density
     --page-strictness lax
     --page-coordinate-consistency off'
   ocrd workspace validate $validate_options
-  # XXX ocrd-tesserocr INCONSISTENCY in TextRegion → use "--page-strictness lax" for now
 }
 
 do_binarization() {