From 6ffba183ab33fa7348421b0fd1208e5d8ffdae98 Mon Sep 17 00:00:00 2001 From: Kai Date: Wed, 10 Mar 2021 15:33:30 +0100 Subject: [PATCH] fix repeated text lines --- tsvtools/cli.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tsvtools/cli.py b/tsvtools/cli.py index 41bf2ad..24d89db 100644 --- a/tsvtools/cli.py +++ b/tsvtools/cli.py @@ -104,7 +104,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, line_info = [] for rgn_number, region in enumerate(tree.findall('.//{%s}TextRegion' % xmlns)): - for line_number, text_line in enumerate(region.findall('.//{%s}TextLine' % xmlns)): + for text_line in region.findall('.//{%s}TextLine' % xmlns): points = [int(scale_factor * float(pos)) for coords in text_line.findall('./{%s}Coords' % xmlns) for p in coords.attrib['points'].split(' ') for pos in p.split(',')] @@ -118,7 +118,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, else: conf = np.nan - line_info.append((line_number, len(urls), left, right, top, bottom, conf)) + line_info.append((len(urls), left, right, top, bottom, conf)) for word in text_line.findall('./{%s}Word' % xmlns): @@ -138,10 +138,10 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, left, right, top, bottom = min(x_points), max(x_points), min(y_points), max(y_points) - tsv.append((rgn_number, line_number, left + (right - left) / 2.0, text, + tsv.append((rgn_number, len(line_info), left + (right - left) / 2.0, text, len(urls), left, right, top, bottom)) - line_info = pd.DataFrame(line_info, columns=['line', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']) + line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf']) if min_confidence is not None and max_confidence is not None: line_info['ocrconf'] = line_info.conf.map(lambda x: get_conf_color(x, min_confidence, max_confidence)) @@ -180,7 +180,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, tsv = pd.DataFrame([(line, " ".join(part.TEXT.to_list())) for line, part in tsv.groupby('line')], columns=['line', 'TEXT']) - tsv = tsv.merge(line_info, left_on='line', right_on='line') + tsv = tsv.merge(line_info, left_on='line', right_index=True) tsv = tsv[out_columns].reset_index(drop=True)