From bc1002b1e69f32893f5d1ecaf6d0f1ee3a5e9acb Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Thu, 11 Jun 2020 17:43:30 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Extract=20text?=
 =?UTF-8?q?=20while=20retaining=20segment=20id=20info?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/ocr_files.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index a5187c5..fd89b03 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -148,21 +148,35 @@ def page_text(tree):
     return page_extract(tree).text
 
 
-def text(filename):
-    """Read the text from the given file.
+def plain_extract(filename):
+    with open(filename, 'r') as f:
+        return ExtractedText(
+            (ExtractedTextSegment('line %d' % no, line) for no, line in enumerate(f.readlines())),
+            '\n'
+        )
+
+
+def plain_text(filename):
+    return plain_extract(filename).text
+
+
+def extract(filename):
+    """Extract the text from the given file.
 
     Supports PAGE, ALTO and falls back to plain text.
     """
-
     try:
         tree = ET.parse(filename)
     except XMLSyntaxError:
-        with open(filename, 'r') as f:
-            return f.read()
+        return plain_extract(filename)
     try:
-        return page_text(tree)
+        return page_extract(tree)
     except ValueError:
-        return alto_text(tree)
+        return alto_extract(tree)
+
+
+def text(filename):
+    return extract(filename).text
 
 
 if __name__ == '__main__':