# Origin: commit d4199d0ddd1f9384f422dcfd28aadc2b72ce3f8d by cneud,
# Wed, 30 Oct 2019 15:02:29 +0100, "add Python script for converting
# PAGE-XML to TSV" (new file tools/page2tsv.py).
"""Convert the Word elements of a PAGE-XML file into TSV rows.

Usage: page2tsv.py PAGE_XML_FILE

One row is written to standard output (UTF-8) for every ``Coords`` child of
every ``Word`` element, in document order.  Each row has five tab-separated
columns: the literal ``0``, the word text, the literals ``O`` and ``O``, and
the value of the ``points`` attribute of the ``Coords`` element.
"""
import codecs
import sys
import xml.etree.ElementTree as ET


def namespace_prefix(root):
    """Return the ``{uri}`` prefix of *root*'s tag, or ``''`` if it has none.

    ElementTree spells namespaced tags as ``{uri}local``; the prefix is reused
    to build the search paths so that any PAGE schema version is accepted.
    """
    tag = root.tag
    if tag.startswith('{'):
        return tag[:tag.index('}') + 1]
    # Without this branch a namespace-less document would be searched for
    # '{PcGts}Word' and silently produce no output.
    return ''


def iter_tsv_rows(tree):
    """Yield one TSV row (newline-terminated ``str``) per word outline.

    Args:
        tree: a parsed ``xml.etree.ElementTree.ElementTree``.

    Only the ``Coords`` and ``TextEquiv/Unicode`` elements that belong
    directly to a ``Word`` are used.  Nested elements (e.g. ``Glyph``) carry
    their own ``Coords``/``Unicode`` and would otherwise produce duplicate
    rows for the same word.
    """
    ns = namespace_prefix(tree.getroot())
    word_path = './/%sWord' % ns
    unicode_path = '%sTextEquiv/%sUnicode' % (ns, ns)
    coords_path = '%sCoords' % ns

    for word in tree.findall(word_path):
        # Reset per word so a Word without text never inherits the text of
        # the previous one; an empty <Unicode/> has text None, hence `or ''`.
        text = ''
        for unicode_node in word.findall(unicode_path):
            # As before, the last Unicode element of the word wins.
            text = unicode_node.text or ''
        for coords in word.findall(coords_path):
            # A Coords element without a `points` attribute yields an empty
            # column instead of aborting the conversion with a KeyError.
            yield '0\t' + text + '\tO\tO\t' + coords.get('points', '') + '\n'


def main(argv=None):
    """Run the converter; return the process exit status.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.  The first argument is the PAGE-XML file,
            any further arguments are ignored.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.stderr.write('usage: page2tsv.py PAGE_XML_FILE\n')
        return 2
    # Write UTF-8 bytes regardless of the console/locale encoding, without
    # rebinding sys.stdout for the rest of the process.
    out = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
    for row in iter_tsv_rows(ET.parse(args[0])):
        out.write(row)
    out.flush()
    return 0


if __name__ == '__main__':
    sys.exit(main())