1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-06-09 11:50:00 +02:00

🚧 dinglehopper: Extract text while retaining segment id info

This commit is contained in:
Gerber, Mike 2020-06-11 17:43:30 +02:00
parent 6d0db229fa
commit bc1002b1e6

View file

@ -148,21 +148,35 @@ def page_text(tree):
return page_extract(tree).text
def text(filename):
"""Read the text from the given file.
def plain_extract(filename):
    """Build an ExtractedText from a plain text file, one segment per line.

    Every line of the file becomes an ExtractedTextSegment whose first
    argument is 'line <n>' (n counted from 0) and whose second argument is
    the line exactly as readlines() returned it. The segments are handed to
    ExtractedText as a generator, together with a newline as the second
    argument.
    """
    # Read everything while the file is open; the generator below is only
    # consumed later, so it must not touch the file handle itself.
    with open(filename, 'r') as f:
        lines = f.readlines()

    # NOTE(review): readlines() keeps each line's trailing newline. Whether
    # that interacts with the newline passed as second argument depends on
    # ExtractedText, which is defined elsewhere -- verify.
    segments = (
        ExtractedTextSegment('line %d' % no, line)
        for no, line in enumerate(lines)
    )
    return ExtractedText(segments, '\n')
def plain_text(filename):
    """Return the ``text`` attribute of plain_extract(filename)."""
    extracted = plain_extract(filename)
    return extracted.text
def extract(filename):
"""Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text.
"""
# NOTE(review): this body comes from a rendered diff whose +/- markers and
# indentation were lost, so each branch below shows two return paths back
# to back. Presumably the first is the pre-commit code and the second its
# replacement -- confirm against the repository before relying on it.
try:
tree = ET.parse(filename)
except XMLSyntaxError:
# The file could not be parsed as XML: handle it as plain text.
with open(filename, 'r') as f:
return f.read()
return plain_extract(filename)
try:
return page_text(tree)
return page_extract(tree)
except ValueError:
# A ValueError from the PAGE path triggers the ALTO path instead.
return alto_text(tree)
return alto_extract(tree)
def text(filename):
    """Return the ``text`` attribute of whatever extract(filename) returns."""
    extracted = extract(filename)
    return extracted.text
if __name__ == '__main__':