|
|
@ -148,21 +148,35 @@ def page_text(tree):
|
|
|
|
return page_extract(tree).text
|
|
|
|
return page_extract(tree).text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def text(filename):
|
|
|
|
def plain_extract(filename):
|
|
|
|
"""Read the text from the given file.
|
|
|
|
with open(filename, 'r') as f:
|
|
|
|
|
|
|
|
return ExtractedText(
|
|
|
|
|
|
|
|
(ExtractedTextSegment('line %d' % no, line) for no, line in enumerate(f.readlines())),
|
|
|
|
|
|
|
|
'\n'
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def plain_text(filename):
|
|
|
|
|
|
|
|
return plain_extract(filename).text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract(filename):
|
|
|
|
|
|
|
|
"""Extract the text from the given file.
|
|
|
|
|
|
|
|
|
|
|
|
Supports PAGE, ALTO and falls back to plain text.
|
|
|
|
Supports PAGE, ALTO and falls back to plain text.
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
tree = ET.parse(filename)
|
|
|
|
tree = ET.parse(filename)
|
|
|
|
except XMLSyntaxError:
|
|
|
|
except XMLSyntaxError:
|
|
|
|
with open(filename, 'r') as f:
|
|
|
|
return plain_extract(filename)
|
|
|
|
return f.read()
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
return page_text(tree)
|
|
|
|
return page_extract(tree)
|
|
|
|
except ValueError:
|
|
|
|
except ValueError:
|
|
|
|
return alto_text(tree)
|
|
|
|
return alto_extract(tree)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def text(filename):
|
|
|
|
|
|
|
|
return extract(filename).text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
if __name__ == '__main__':
|
|
|
|