From 534958be1d068b42d97d90e89c347c97a8755e58 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 30 Sep 2019 16:06:59 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20dinglehopper:=20Fix=20sorting=20?= =?UTF-8?q?the=20reading=20order?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Regions were sorted incorrectly when an OrderedGroup contained more than 9 regions, because the index attribute was compared as a string (lexicographically) rather than numerically. Fix this by converting the index to an integer before sorting. --- qurator/dinglehopper/ocr_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index aac743e..b57a047 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -64,7 +64,7 @@ def page_text(tree): for group in reading_order.iterfind('./*', namespaces=nsmap): if ET.QName(group.tag).localname == 'OrderedGroup': region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap) - for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: r.attrib['index']): + for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: int(r.attrib['index'])): region_id = region_ref_indexed.attrib['regionRef'] region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) if region is not None: