mirror of
https://github.com/qurator-spk/page2tsv.git
synced 2025-06-16 14:59:54 +02:00
Support Qurator Calamari PAGE XML
This commit is contained in:
parent
abdabbac4f
commit
05f49df6d2
1 changed file with 9 additions and 3 deletions
12
cli.py
12
cli.py
|
@@ -175,10 +175,16 @@ def page2tsv(page_xml_file, tsv_out_file, image_url, ner_rest_endpoint, noproxy,
|
||||||
rgn_number += 1
|
rgn_number += 1
|
||||||
for text_line in region.findall('.//{%s}TextLine' % xmlns):
|
for text_line in region.findall('.//{%s}TextLine' % xmlns):
|
||||||
line_number += 1
|
line_number += 1
|
||||||
for words in text_line.findall('.//{%s}Word' % xmlns):
|
|
||||||
for word in words.findall('.//{%s}Unicode' % xmlns):
|
# import ipdb; ipdb.set_trace()
|
||||||
|
|
||||||
|
for words in text_line.findall('./{%s}Word' % xmlns):
|
||||||
|
|
||||||
|
# import ipdb;ipdb.set_trace()
|
||||||
|
|
||||||
|
for word in words.findall('./{%s}TextEquiv/{%s}Unicode' % (xmlns, xmlns)):
|
||||||
text = word.text
|
text = word.text
|
||||||
for coords in words.findall('.//{%s}Coords' % xmlns):
|
for coords in words.findall('./{%s}Coords' % xmlns):
|
||||||
|
|
||||||
# transform OCR coordinates using `scale_factor` to derive
|
# transform OCR coordinates using `scale_factor` to derive
|
||||||
# correct coordinates for the web presentation image
|
# correct coordinates for the web presentation image
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue