From 91cca1e1b8672018e88a6adae2190f3d780b3e57 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 3 Feb 2020 15:33:11 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20Document=20why=20we=20are=20usin?=
 =?UTF-8?q?g=20Unicode=20text=20segmentation=20to=20produce=20word=20resul?=
 =?UTF-8?q?ts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ocrd_calamari/recognize.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 772c680..0624d9b 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -100,6 +100,11 @@ class CalamariRecognize(Processor):
                     line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
 
                     # Save word results
+                    #
+                    # Calamari OCR does not provide word positions, so we infer them from (a) Unicode text
+                    # segmentation and (b) the glyph positions. This is necessary because the PAGE XML format
+                    # enforces a strict hierarchy of lines > words > glyphs.
+
                     def unwanted(c):
                         return c == " "