Merge pull request #6 from kba/lineid-ocr-tsv

Retain line_id, tsv2page CLI to propagate results back to PAGE-XML
pull/9/head
Kai Labusch 3 years ago committed by GitHub
commit abcdb67e9e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -23,6 +23,7 @@ setup(
"extract-doc-links=tsvtools.cli:extract_document_links",
"annotate-tsv=tsvtools.cli:annotate_tsv",
"page2tsv=tsvtools.cli:page2tsv",
"tsv2page=tsvtools.cli:tsv2page",
"find-entities=tsvtools.cli:find_entities",
"make-page2tsv-commands=tsvtools.cli:make_page2tsv_commands"
]

@ -3,11 +3,13 @@ import glob
import re
import os
from io import StringIO
from pathlib import Path
import numpy as np
import click
import pandas as pd
import requests
from lxml import etree as ET
from ocrd_models.ocrd_page import parse
from ocrd_utils import bbox_from_points
@ -81,7 +83,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
if purpose == "NERD":
out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
elif purpose == "OCR":
out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id']
if min_confidence is not None and max_confidence is not None:
out_columns += ['ocrconf']
else:
@ -110,7 +112,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
else:
conf = np.nan
line_info.append((len(urls), left, right, top, bottom, conf))
line_info.append((len(urls), left, right, top, bottom, conf, text_line.id))
for word in text_line.get_Word():
for text_equiv in word.get_TextEquiv():
@ -119,14 +121,14 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]
tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
text_equiv.get_Unicode(), len(urls), left, right, top, bottom))
text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf'])
line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id'])
if min_confidence is not None and max_confidence is not None:
line_info['ocrconf'] = line_info.conf.map(lambda x: get_conf_color(x, min_confidence, max_confidence))
tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] + ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom'])
tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] + ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'line_id'])
if len(tsv) == 0:
return
@ -177,6 +179,25 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
except requests.HTTPError as e:
print(e)
@click.command()
@click.option('--output-filename', '-o', help="Output filename. If omitted, PAGE-XML filename with .corrected.xml extension")
@click.option('--keep-words', '-k', is_flag=True, help="Keep (out-of-date) Words of TextLines")
@click.argument('page-file')
@click.argument('tsv-file')
def tsv2page(output_filename, keep_words, page_file, tsv_file):
if not output_filename:
output_filename = Path(page_file).stem + '.corrected.xml'
ns = {'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}
tsv = pd.read_csv(tsv_file, sep='\t', comment='#', quoting=3)
tree = ET.parse(page_file)
for _, row in tsv.iterrows():
el_textline = tree.find(f'//pc:TextLine[@id="{row.line_id}"]', namespaces=ns)
el_textline.find('pc:TextEquiv/pc:Unicode', namespaces=ns).text = row.TEXT
if not keep_words:
for el_word in el_textline.findall('pc:Word', namespaces=ns):
el_textline.remove(el_word)
with open(output_filename, 'w', encoding='utf-8') as f:
f.write(ET.tostring(tree, pretty_print=True).decode('utf-8'))
@click.command()
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)

Loading…
Cancel
Save