From 409d7db2f22e76c0fc3de13e7739d712b1187893 Mon Sep 17 00:00:00 2001
From: Clemens Neudecker <952378+cneud@users.noreply.github.com>
Date: Fri, 6 Dec 2019 15:17:02 +0100
Subject: [PATCH] Transform OCR coordinates for web presentation images (fixes
 #31)

thx @kba!

(scaling factor will require testing with more images though)
---
 tools/cli.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/cli.py b/tools/cli.py
index 55fcae6..c328f0c 100644
--- a/tools/cli.py
+++ b/tools/cli.py
@@ -172,7 +172,8 @@ def page2tsv(page_xml_file, tsv_out_file, image_url, ner_rest_endpoint, noproxy)
             text = word.text
             for coords in words.findall('.//{%s}Coords' % xmlns):
 
-                points = [int(pos) for p in coords.attrib['points'].split(' ') for pos in p.split(',')]
+                # transform the OCR coordinates by 0.5685 to derive the correct coords for the web presentation
+                points = [int(0.5685 * float(pos)) for p in coords.attrib['points'].split(' ') for pos in p.split(',')]
 
                 x_points = [points[i] for i in range(0, len(points), 2)]
                 y_points = [points[i] for i in range(1, len(points), 2)]