"""Dump the words of an XML page description as TSV rows on standard output.

Usage: page2tsv.py XML_FILE

For every ``Word`` element in the document, one row is written for each
combination of a ``Unicode`` descendant (the token text) and a ``Coords``
descendant (its ``points`` attribute).  Each row has the form::

    0<TAB>text<TAB>O<TAB>O<TAB>points

NOTE(review): the element names suggest the input is PAGE-XML -- confirm.
"""
import sys
import codecs
import xml.etree.ElementTree as ET


def _namespace_prefix(tag):
    """Return the ``{uri}`` prefix of a qualified tag name, or ``''``.

    ElementTree spells namespaced tags as ``{uri}local``.  A document
    without a default namespace has plain tags, in which case the child
    elements must be searched by their unqualified names as well.
    """
    if tag.startswith('{') and '}' in tag:
        return tag[:tag.index('}') + 1]
    return ''


def page2tsv(source):
    """Yield the TSV lines for the XML document *source*.

    Args:
        source: a filename or file object accepted by ``ET.parse``.

    Yields:
        One newline-terminated string per (``Unicode``, ``Coords``) pair
        found below each ``Word`` element, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: if *source* is not well-formed XML.
        KeyError: if a ``Coords`` element lacks the ``points`` attribute.
    """
    tree = ET.parse(source)
    # All elements are looked up in the namespace of the root element.
    prefix = _namespace_prefix(tree.getroot().tag)
    for word in tree.iterfind('.//%sWord' % prefix):
        # The coordinates do not depend on the text element, so look them
        # up once per word instead of once per Unicode element.
        coords = word.findall('.//%sCoords' % prefix)
        for unicode_el in word.iterfind('.//%sUnicode' % prefix):
            # An empty <Unicode/> element has text None; emit an empty
            # token rather than failing on the string formatting.
            text = unicode_el.text or ''
            for coord in coords:
                yield '0\t%s\tO\tO\t%s\n' % (text, coord.attrib['points'])


def main(argv=None):
    """Run the converter on ``argv[1]`` and return a process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'page2tsv.py'
        sys.stderr.write('usage: %s XML_FILE\n' % prog)
        return 2
    # Always emit UTF-8, independent of the locale of the terminal.
    out = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
    for line in page2tsv(argv[1]):
        out.write(line)
    return 0


if __name__ == '__main__':
    sys.exit(main())