Merge pull request #6 from kba/lineid-ocr-tsv

Retain line_id, tsv2page CLI to propagate results back to PAGE-XML
2025-12-15 06:14:14 +01:00 · 2021-04-08 10:53:51 +02:00 · 2021-04-08 10:53:51 +02:00 · abcdb67e9e
commit abcdb67e9e
parent 9c63631d7a f03acbf54d
2 changed files with 27 additions and 5 deletions
--- a/setup.py
+++ b/setup.py
@ -23,6 +23,7 @@ setup(
        "extract-doc-links=tsvtools.cli:extract_document_links",
        "annotate-tsv=tsvtools.cli:annotate_tsv",
        "page2tsv=tsvtools.cli:page2tsv",
        "tsv2page=tsvtools.cli:tsv2page",
        "find-entities=tsvtools.cli:find_entities",
        "make-page2tsv-commands=tsvtools.cli:make_page2tsv_commands"
      ]
--- a/tsvtools/cli.py
+++ b/tsvtools/cli.py
@ -3,11 +3,13 @@ import glob
 import re
 import os
 from io import StringIO
 from pathlib import Path
 import numpy as np
 import click
 import pandas as pd
 import requests
 from lxml import etree as ET
 from ocrd_models.ocrd_page import parse
 from ocrd_utils import bbox_from_points
@ -81,7 +83,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
    if purpose == "NERD":
        out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
    elif purpose == "OCR":
-        out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
+        out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id']
        if min_confidence is not None and max_confidence is not None:
            out_columns += ['ocrconf']
    else:
@ -110,7 +112,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
            else:
                conf = np.nan
-            line_info.append((len(urls), left, right, top, bottom, conf))
+            line_info.append((len(urls), left, right, top, bottom, conf, text_line.id))
            for word in text_line.get_Word():
                for text_equiv in word.get_TextEquiv():
@ -119,14 +121,14 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
                    left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]
                    tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
-                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom))
+                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
-    line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf'])
+    line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id'])
    if min_confidence is not None and max_confidence is not None:
        line_info['ocrconf'] = line_info.conf.map(lambda x: get_conf_color(x, min_confidence, max_confidence))
-    tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] + ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom'])
+    tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] + ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'line_id'])
    if len(tsv) == 0:
        return
@ -177,6 +179,25 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
    except requests.HTTPError as e:
        print(e)
@click.command()
@click.option('--output-filename', '-o', help="Output filename. If omitted, PAGE-XML filename with .corrected.xml extension")
@click.option('--keep-words', '-k', is_flag=True, help="Keep (out-of-date) Words of TextLines")
@click.argument('page-file')
@click.argument('tsv-file')
 def tsv2page(output_filename, keep_words, page_file, tsv_file):
    if not output_filename:
        output_filename = Path(page_file).stem + '.corrected.xml'
    ns = {'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}
    tsv = pd.read_csv(tsv_file, sep='\t', comment='#', quoting=3)
    tree = ET.parse(page_file)
    for _, row in tsv.iterrows():
        el_textline = tree.find(f'//pc:TextLine[@id="{row.line_id}"]', namespaces=ns)
        el_textline.find('pc:TextEquiv/pc:Unicode', namespaces=ns).text = row.TEXT
        if not keep_words:
            for el_word in el_textline.findall('pc:Word', namespaces=ns):
                el_textline.remove(el_word)
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(ET.tostring(tree, pretty_print=True).decode('utf-8'))
@click.command()
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)