From 75733039b8a804414352d1bd5845a2db21d3e341 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 20 Oct 2020 18:43:56 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Do=20not=20hardc?= =?UTF-8?q?ode=20joiner=20to=20\n?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/extracted_text.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index c6f2984..f99a9b1 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -187,6 +187,9 @@ class ExtractedText: children_for_localname = { 'TextRegion': 'TextLine' } + joiner_for_textequiv_level = { + 'line': '\n' + } segment_id = text_segment.attrib['id'] localname = ET.QName(text_segment).localname @@ -209,7 +212,7 @@ class ExtractedText: sub_segment, nsmap, textequiv_level=sub_textequiv_level) ) - joiner = '\n' # XXX + joiner = joiner_for_textequiv_level[sub_textequiv_level] return cls(segment_id, segments, joiner, None)