fix repeated text rows

pull/3/head
Kai 4 years ago
parent a6008b83b5
commit de575037e6

@@ -125,11 +125,13 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
 for text_equiv in word.findall('./{%s}TextEquiv/{%s}Unicode' % (xmlns, xmlns)):
 text = text_equiv.text
+points = []
 for coords in word.findall('./{%s}Coords' % xmlns):
 # transform OCR coordinates using `scale_factor` to derive
 # correct coordinates for the web presentation image
-points = [int(scale_factor * float(pos))
+points += [int(scale_factor * float(pos))
 for p in coords.attrib['points'].split(' ') for pos in p.split(',')]
 x_points, y_points = points[0::2], points[1::2]

Loading…
Cancel
Save