mirror of
https://github.com/qurator-spk/page2tsv.git
synced 2025-07-03 23:29:52 +02:00
use max confidence instead of mean
This commit is contained in:
parent
85ec36218e
commit
5d55ba24a3
1 changed file with 4 additions and 7 deletions
|
@@ -100,12 +100,9 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
|
||||||
|
|
||||||
tsv = []
|
tsv = []
|
||||||
line_info = []
|
line_info = []
|
||||||
line_number = 0
|
for rgn_number, region in enumerate(tree.findall('.//{%s}TextRegion' % xmlns)):
|
||||||
rgn_number = 0
|
|
||||||
for region in tree.findall('.//{%s}TextRegion' % xmlns):
|
for line_number, text_line in enumerate(region.findall('.//{%s}TextLine' % xmlns)):
|
||||||
rgn_number += 1
|
|
||||||
for text_line in region.findall('.//{%s}TextLine' % xmlns):
|
|
||||||
line_number += 1
|
|
||||||
|
|
||||||
points = [int(scale_factor * float(pos)) for coords in text_line.findall('./{%s}Coords' % xmlns) for p in
|
points = [int(scale_factor * float(pos)) for coords in text_line.findall('./{%s}Coords' % xmlns) for p in
|
||||||
coords.attrib['points'].split(' ') for pos in p.split(',')]
|
coords.attrib['points'].split(' ') for pos in p.split(',')]
|
||||||
|
@@ -115,7 +112,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
|
||||||
left, right, top, bottom = min(x_points), max(x_points), min(y_points), max(y_points)
|
left, right, top, bottom = min(x_points), max(x_points), min(y_points), max(y_points)
|
||||||
|
|
||||||
if min_confidence is not None and max_confidence is not None:
|
if min_confidence is not None and max_confidence is not None:
|
||||||
conf = np.mean([float(text.attrib['conf']) for text in text_line.findall('./{%s}TextEquiv' % xmlns)])
|
conf = np.max([float(text.attrib['conf']) for text in text_line.findall('./{%s}TextEquiv' % xmlns)])
|
||||||
else:
|
else:
|
||||||
conf = np.nan
|
conf = np.nan
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue