mirror of
https://github.com/qurator-spk/page2tsv.git
synced 2025-07-06 00:29:54 +02:00
fix repeated text rows
This commit is contained in:
parent
a6008b83b5
commit
de575037e6
1 changed file with 7 additions and 5 deletions
|
@@ -125,11 +125,13 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
|
||||||
for text_equiv in word.findall('./{%s}TextEquiv/{%s}Unicode' % (xmlns, xmlns)):
|
for text_equiv in word.findall('./{%s}TextEquiv/{%s}Unicode' % (xmlns, xmlns)):
|
||||||
text = text_equiv.text
|
text = text_equiv.text
|
||||||
|
|
||||||
|
points = []
|
||||||
|
|
||||||
for coords in word.findall('./{%s}Coords' % xmlns):
|
for coords in word.findall('./{%s}Coords' % xmlns):
|
||||||
|
|
||||||
# transform OCR coordinates using `scale_factor` to derive
|
# transform OCR coordinates using `scale_factor` to derive
|
||||||
# correct coordinates for the web presentation image
|
# correct coordinates for the web presentation image
|
||||||
points = [int(scale_factor * float(pos))
|
points += [int(scale_factor * float(pos))
|
||||||
for p in coords.attrib['points'].split(' ') for pos in p.split(',')]
|
for p in coords.attrib['points'].split(' ') for pos in p.split(',')]
|
||||||
|
|
||||||
x_points, y_points = points[0::2], points[1::2]
|
x_points, y_points = points[0::2], points[1::2]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue