mirror of
https://github.com/qurator-spk/neat.git
synced 2025-06-09 11:49:54 +02:00
add Python script for converting PAGE-XML to TSV
This commit is contained in:
parent
225693390b
commit
d4199d0ddd
1 changed file with 12 additions and 0 deletions
12
tools/page2tsv.py
Normal file
12
tools/page2tsv.py
Normal file
|
@@ -0,0 +1,12 @@
|
|||
import sys
|
||||
import codecs
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def _ns_prefix(root):
    """Return the ``{namespace}`` prefix of *root*'s tag, or ``''`` if none.

    ElementTree spells a namespaced tag as ``{uri}local``. A root without a
    namespace has no braces, and must be queried without a prefix.
    """
    tag = root.tag
    if tag.startswith('{'):
        return tag[:tag.index('}') + 1]
    return ''


def page_to_tsv_rows(tree):
    """Yield one TSV line per Unicode/Coords pair found under each Word.

    Each yielded line has the form ``0<TAB>text<TAB>O<TAB>O<TAB>points<NL>``
    where ``text`` is the text of a ``Unicode`` element and ``points`` is the
    ``points`` attribute of a ``Coords`` element below the same ``Word``.

    Args:
        tree: a parsed ``xml.etree.ElementTree.ElementTree``.

    Raises:
        KeyError: if a matched ``Coords`` element has no ``points`` attribute.
    """
    prefix = _ns_prefix(tree.getroot())
    word_path = './/%sWord' % prefix
    unicode_path = './/%sUnicode' % prefix
    coords_path = './/%sCoords' % prefix

    # NOTE(review): the loops are nested Word -> Unicode -> Coords; for a Word
    # holding exactly one Unicode and one Coords this emits exactly one row.
    # './/' matches descendants at any depth, so Unicode/Coords elements
    # nested deeper inside a Word are matched as well -- confirm intended.
    for word in tree.findall(word_path):
        for unicode_el in word.findall(unicode_path):
            # An empty <Unicode/> element has text None; emit an empty token
            # instead of failing on str + None.
            text = unicode_el.text or ''
            for coords in word.findall(coords_path):
                yield '0\t' + text + '\tO\tO\t' + coords.attrib['points'] + '\n'


def main(argv=None):
    """Convert the PAGE-XML file named in ``argv[1]`` to TSV on stdout.

    Args:
        argv: argument vector; defaults to ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 2 when the file argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write('usage: %s PAGE_XML_FILE\n' % (argv[0] if argv else 'page2tsv.py'))
        return 2

    # Write UTF-8 bytes regardless of the locale's preferred encoding.
    out = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
    tree = ET.parse(argv[1])
    for row in page_to_tsv_rows(tree):
        out.write(row)
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
Loading…
Add table
Add a link
Reference in a new issue