From 91cca1e1b8672018e88a6adae2190f3d780b3e57 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 3 Feb 2020 15:33:11 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20Document=20why=20we=20are=20usin?= =?UTF-8?q?g=20Unicode=20text=20segmentation=20to=20produce=20word=20resul?= =?UTF-8?q?ts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 772c680..0624d9b 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -100,6 +100,11 @@ class CalamariRecognize(Processor): line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) # Save word results + # + # Calamari OCR does not provide word positions, so we infer word positions from (a) Unicode text + # segmentation and (b) the glyph positions. This is necessary because the PAGE XML format enforces + # a strict hierarchy of lines > words > glyphs. + def unwanted(c): return c == " "