From 05f49df6d2a204ab74a2232ed779bb3a4178fbe3 Mon Sep 17 00:00:00 2001 From: Kai Labusch Date: Wed, 11 Mar 2020 12:56:09 +0100 Subject: [PATCH] support Qurator calamari PAGE xml --- cli.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cli.py b/cli.py index 5850600..326b364 100644 --- a/cli.py +++ b/cli.py @@ -175,10 +175,16 @@ def page2tsv(page_xml_file, tsv_out_file, image_url, ner_rest_endpoint, noproxy, rgn_number += 1 for text_line in region.findall('.//{%s}TextLine' % xmlns): line_number += 1 - for words in text_line.findall('.//{%s}Word' % xmlns): - for word in words.findall('.//{%s}Unicode' % xmlns): + + # import ipdb; ipdb.set_trace() + + for words in text_line.findall('./{%s}Word' % xmlns): + + # import ipdb;ipdb.set_trace() + + for word in words.findall('./{%s}TextEquiv/{%s}Unicode' % (xmlns, xmlns)): text = word.text - for coords in words.findall('.//{%s}Coords' % xmlns): + for coords in words.findall('./{%s}Coords' % xmlns): # transform OCR coordinates using `scale_factor` to derive # correct coordinates for the web presentation image