1
0
Fork 0
mirror of https://github.com/qurator-spk/neat.git synced 2025-06-09 11:49:54 +02:00

add Python script for converting PAGE-XML to TSV

This commit is contained in:
cneud 2019-10-30 15:02:29 +01:00
parent 225693390b
commit d4199d0ddd

12
tools/page2tsv.py Normal file
View file

@ -0,0 +1,12 @@
import sys
import codecs
import xml.etree.ElementTree as ET
def _qualify(root, name):
    """Return *name* qualified with the XML namespace of *root*'s tag.

    ElementTree spells namespaced tags as ``{uri}local``.  When the root tag
    carries no namespace, *name* is returned unchanged so that documents
    without an ``xmlns`` declaration can still be searched.
    """
    if root.tag.startswith('{'):
        return '{%s}%s' % (root.tag[1:].split('}')[0], name)
    return name


def iter_tsv_rows(root):
    """Yield one TSV row per ``Coords`` child of every ``Word`` below *root*.

    Each row has the form ``0<TAB>text<TAB>O<TAB>O<TAB>points<NEWLINE>``.

    *root* is the root ``Element`` of a parsed PAGE-XML document.

    Only the Word's own ``Coords`` and ``TextEquiv/Unicode`` children are
    consulted; a descendant search would also pick up the coordinates and
    text of nested ``Glyph`` elements and report them as extra word rows.

    A Word whose text is missing or empty yields a row with an empty token.
    If a Word has several ``TextEquiv/Unicode`` entries, the last one is used.

    Raises ``KeyError`` if a ``Coords`` element has no ``points`` attribute.
    """
    word_path = './/' + _qualify(root, 'Word')
    unicode_path = './%s/%s' % (_qualify(root, 'TextEquiv'),
                                _qualify(root, 'Unicode'))
    coords_path = './' + _qualify(root, 'Coords')

    for word in root.findall(word_path):
        # Reset per word so a Word without text never inherits the text of
        # the previous one.
        text = ''
        for unicode_el in word.findall(unicode_path):
            # ``.text`` is None for an empty <Unicode/> element.
            text = unicode_el.text or ''
        for coords in word.findall(coords_path):
            yield '0\t' + text + '\tO\tO\t' + coords.attrib['points'] + '\n'


def main(argv=None):
    """Convert the PAGE-XML file named in ``argv[1]`` to TSV on stdout.

    *argv* defaults to ``sys.argv``.  Returns the process exit status:
    0 on success, 2 when no input file was given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write('usage: page2tsv.py PAGE_XML_FILE\n')
        return 2

    # Always emit UTF-8, independent of the locale's preferred encoding.
    out = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
    root = ET.parse(argv[1]).getroot()
    for row in iter_tsv_rows(root):
        out.write(row)
    out.flush()
    return 0


if __name__ == '__main__':
    sys.exit(main())