1
0
Fork 0
mirror of https://github.com/qurator-spk/page2tsv.git synced 2025-10-26 06:04:14 +01:00

try to infer correct line ordering ...

This commit is contained in:
Kai Labusch 2020-03-09 10:58:07 +01:00
parent e535a070c4
commit 7bf9cfa5de

55
cli.py
View file

@ -152,6 +152,8 @@ def ner(tsv, ner_rest_endpoint):
@click.option('--scale-factor', type=float, default=0.5685, help='default: 0.5685') @click.option('--scale-factor', type=float, default=0.5685, help='default: 0.5685')
def page2tsv(page_xml_file, tsv_out_file, image_url, ner_rest_endpoint, noproxy, scale_factor): def page2tsv(page_xml_file, tsv_out_file, image_url, ner_rest_endpoint, noproxy, scale_factor):
out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', 'url_id', 'left', 'right', 'top', 'bottom']
if noproxy: if noproxy:
os.environ['no_proxy'] = '*' os.environ['no_proxy'] = '*'
@ -164,34 +166,53 @@ def page2tsv(page_xml_file, tsv_out_file, image_url, ner_rest_endpoint, noproxy,
urls = [part['url'] for part in parts] urls = [part['url'] for part in parts]
else: else:
pd.DataFrame([], columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', 'url_id', 'left', 'right', 'top', pd.DataFrame([], columns=out_columns). to_csv(tsv_out_file, sep="\t", quoting=3, index=False)
'bottom']). to_csv(tsv_out_file, sep="\t", quoting=3, index=False)
tsv = [] tsv = []
for words in tree.findall('.//{%s}Word' % xmlns): line_number = 0
for word in words.findall('.//{%s}Unicode' % xmlns): rgn_number = 0
text = word.text for region in tree.findall('.//{%s}TextRegion' % xmlns):
for coords in words.findall('.//{%s}Coords' % xmlns): rgn_number += 1
for text_line in region.findall('.//{%s}TextLine' % xmlns):
line_number += 1
for words in text_line.findall('.//{%s}Word' % xmlns):
for word in words.findall('.//{%s}Unicode' % xmlns):
text = word.text
for coords in words.findall('.//{%s}Coords' % xmlns):
# transform OCR coordinates using `scale_factor` to derive correct coordinates for the web presentation image # transform OCR coordinates using `scale_factor` to derive
points = [int(scale_factor * float(pos)) for p in coords.attrib['points'].split(' ') for pos in p.split(',')] # correct coordinates for the web presentation image
points = [int(scale_factor * float(pos))
for p in coords.attrib['points'].split(' ') for pos in p.split(',')]
x_points = [points[i] for i in range(0, len(points), 2)] x_points = [points[i] for i in range(0, len(points), 2)]
y_points = [points[i] for i in range(1, len(points), 2)] y_points = [points[i] for i in range(1, len(points), 2)]
left = min(x_points) left = min(x_points)
right = max(x_points) right = max(x_points)
top = min(y_points) top = min(y_points)
bottom = max(y_points) bottom = max(y_points)
tsv.append((0, text, 'O', 'O', '-', len(urls), left, right, top, bottom)) tsv.append((rgn_number, line_number, left + (right-left)/2.0,
0, text, 'O', 'O', '-', len(urls), left, right, top, bottom))
with open(tsv_out_file, 'a') as f: with open(tsv_out_file, 'a') as f:
f.write('# ' + image_url + '\n') f.write('# ' + image_url + '\n')
tsv = pd.DataFrame(tsv, columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] + out_columns)
'url_id', 'left', 'right', 'top', 'bottom'])
vlinecenter = pd.DataFrame(tsv[['line', 'top']].groupby('line', sort=False).mean().top +
(tsv[['line', 'bottom']].groupby('line').mean().bottom -
tsv[['line', 'top']].groupby('line').mean().top) / 2, columns=['vlinecenter'])
tsv = tsv.merge(vlinecenter, left_on='line', right_index=True)
regions = [region.sort_values(['vlinecenter', 'hcenter']) for rid, region in tsv.groupby('rid', sort=False)]
tsv = pd.concat(regions)
tsv = tsv[out_columns].reset_index(drop=True)
if ner_rest_endpoint is not None: if ner_rest_endpoint is not None: