Mirror of https://github.com/qurator-spk/page2tsv.git

install into qurator namespace

Konstantin Baierer 2022-11-08 16:19:23 +01:00
parent abeca0df16
commit e1a440b91c
10 changed files with 17 additions and 12 deletions

qurator/tsvtools/cli.py (new file)

@@ -0,0 +1,251 @@
import glob
import re
import os
from io import StringIO
from pathlib import Path
import numpy as np
import click
import pandas as pd
import requests
from lxml import etree as ET
from ocrd_models.ocrd_page import parse
from ocrd_utils import bbox_from_points
from qurator.utils.tsv import read_tsv, write_tsv, extract_doc_links
from .ocr import get_conf_color
from qurator.utils.ner import ner
from qurator.utils.ned import ned


@click.command()
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('url-file', type=click.Path(exists=False), required=True, nargs=1)
def extract_document_links(tsv_file, url_file):
    parts = extract_doc_links(tsv_file)

    urls = [part['url'] for part in parts]
    urls = pd.DataFrame(urls, columns=['url'])
    urls.to_csv(url_file, sep="\t", quoting=3, index=False)


@click.command()
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('annotated-tsv-file', type=click.Path(exists=False), required=True, nargs=1)
def annotate_tsv(tsv_file, annotated_tsv_file):
    parts = extract_doc_links(tsv_file)

    annotated_parts = []
    for part in parts:
        part_data = StringIO(part['header'] + part['text'])
        df = pd.read_csv(part_data, sep="\t", comment='#', quoting=3)
        df['url_id'] = len(annotated_parts)
        annotated_parts.append(df)

    df = pd.concat(annotated_parts)
    df.to_csv(annotated_tsv_file, sep="\t", quoting=3, index=False)


def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
             noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority):
    if purpose == "NERD":
        out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
    elif purpose == "OCR":
        out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id']
        if min_confidence is not None and max_confidence is not None:
            out_columns += ['ocrconf']
    else:
        raise RuntimeError("Unknown purpose.")

    if noproxy:
        os.environ['no_proxy'] = '*'

    urls = []
    if os.path.exists(tsv_out_file):
        parts = extract_doc_links(tsv_out_file)
        urls = [part['url'] for part in parts]
    else:
        pd.DataFrame([], columns=out_columns).to_csv(tsv_out_file, sep="\t", quoting=3, index=False)

    pcgts = parse(page_xml_file)
    tsv = []
    line_info = []
    for region_idx, region in enumerate(pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')):
        for text_line in region.get_TextLine():
            left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]

            if min_confidence is not None and max_confidence is not None:
                conf = np.max([textequiv.conf for textequiv in text_line.get_TextEquiv()])
            else:
                conf = np.nan

            line_info.append((len(urls), left, right, top, bottom, conf, text_line.id))

            words = [word for word in text_line.get_Word()]
            if len(words) <= 0:
                for text_equiv in text_line.get_TextEquiv():
                    # transform OCR coordinates using `scale_factor` to derive
                    # correct coordinates for the web presentation image
                    left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]

                    tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
            else:
                for word in words:
                    # XXX TODO make this configurable
                    textequiv = ''
                    list_textequivs = word.get_TextEquiv()
                    if list_textequivs:
                        textequiv = list_textequivs[0].get_Unicode()

                    # transform OCR coordinates using `scale_factor` to derive
                    # correct coordinates for the web presentation image
                    left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]

                    tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
                                textequiv, len(urls), left, right, top, bottom, text_line.id))

    line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id'])

    if min_confidence is not None and max_confidence is not None:
        line_info['ocrconf'] = line_info.conf.map(lambda x: get_conf_color(x, min_confidence, max_confidence))

    tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] +
                       ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'line_id'])

    with open(tsv_out_file, 'a') as f:
        f.write('# ' + image_url + '\n')

    if len(tsv) == 0:
        return

    vlinecenter = pd.DataFrame(tsv[['line', 'top']].groupby('line', sort=False).mean().top +
                               (tsv[['line', 'bottom']].groupby('line', sort=False).mean().bottom -
                                tsv[['line', 'top']].groupby('line', sort=False).mean().top) / 2,
                               columns=['vlinecenter'])

    tsv = tsv.merge(vlinecenter, left_on='line', right_index=True)

    regions = [region.sort_values(['vlinecenter', 'hcenter']) for rid, region in tsv.groupby('rid', sort=False)]

    tsv = pd.concat(regions)

    if purpose == 'NERD':
        tsv['No.'] = 0
        tsv['NE-TAG'] = 'O'
        tsv['NE-EMB'] = 'O'
        tsv['ID'] = '-'
        tsv['conf'] = '-'
        tsv = tsv.rename(columns={'TEXT': 'TOKEN'})
    elif purpose == 'OCR':
        tsv = pd.DataFrame([(line, " ".join(part.TEXT.to_list())) for line, part in tsv.groupby('line')],
                           columns=['line', 'TEXT'])
        tsv = tsv.merge(line_info, left_on='line', right_index=True)

    tsv = tsv[out_columns].reset_index(drop=True)

    try:
        if purpose == 'NERD' and ner_rest_endpoint is not None:
            tsv, ner_result = ner(tsv, ner_rest_endpoint)
            if ned_rest_endpoint is not None:
                tsv, _ = ned(tsv, ner_result, ned_rest_endpoint, threshold=ned_threshold, priority=ned_priority)

        tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False)
    except requests.HTTPError as e:
        print(e)


def tsv2page(output_filename, keep_words, page_file, tsv_file):
    if not output_filename:
        output_filename = Path(page_file).stem + '.corrected.xml'
    ns = {'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}
    tsv = pd.read_csv(tsv_file, sep='\t', comment='#', quoting=3)
    tree = ET.parse(page_file)
    for _, row in tsv.iterrows():
        el_textline = tree.find(f'//pc:TextLine[@id="{row.line_id}"]', namespaces=ns)
        el_textline.find('pc:TextEquiv/pc:Unicode', namespaces=ns).text = row.TEXT
        if not keep_words:
            for el_word in el_textline.findall('pc:Word', namespaces=ns):
                el_textline.remove(el_word)
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(ET.tostring(tree, pretty_print=True).decode('utf-8'))


@click.command()
@click.option('--output-filename', '-o', help="Output filename. "
              "If omitted, PAGE-XML filename with .corrected.xml extension")
@click.option('--keep-words', '-k', is_flag=True, help="Keep (out-of-date) Words of TextLines")
@click.argument('page-file')
@click.argument('tsv-file')
def tsv2page_cli(output_filename, keep_words, page_file, tsv_file):
    # Delegate to tsv2page; calling tsv2page_cli here would recurse forever.
    return tsv2page(output_filename, keep_words, page_file, tsv_file)


@click.command()
@click.option('--xls-file', type=click.Path(exists=True), default=None,
              help="Read parameters from xls-file. Expected columns: Filename, iiif_url, scale_factor.")
@click.option('--directory', type=click.Path(exists=True), default=None,
              help="Search directory for PPN**/*.xml files. Extract PPN and file number into image-url.")
@click.option('--purpose', type=click.Choice(['NERD', 'OCR'], case_sensitive=False), default="NERD",
              help="Purpose of output tsv file. "
                   "\n\nNERD: NER/NED application/ground-truth creation. "
                   "\n\nOCR: OCR application/ground-truth creation. "
                   "\n\ndefault: NERD.")
def make_page2tsv_commands(xls_file, directory, purpose):
    if xls_file is not None:
        if xls_file.endswith(".xls"):
            df = pd.read_excel(xls_file)
        else:
            df = pd.read_excel(xls_file, engine='openpyxl')

        df = df.dropna(how='all')

        for _, row in df.iterrows():
            print('page2tsv $(OPTIONS) {}.xml {}.tsv --image-url={} --scale-factor={} --purpose={}'.
                  format(row.Filename, row.Filename, row.iiif_url.replace('/full/full', '/left,top,width,height/full'),
                         row.scale_factor, purpose))
    elif directory is not None:
        for file in glob.glob('{}/**/*.xml'.format(directory), recursive=True):
            ma = re.match('(.*/(PPN[0-9X]+)/.*?([0-9]+).*?).xml', file)
            if ma:
                print('page2tsv {} {}.tsv '
                      '--image-url=https://content.staatsbibliothek-berlin.de/dc/'
                      '{}-{:08d}/left,top,width,height/full/0/default.jpg --scale-factor=1.0 --purpose={}'.
                      format(file, ma.group(1), ma.group(2), int(ma.group(3)), purpose))


@click.command()
@click.argument('page-xml-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('tsv-out-file', type=click.Path(), required=True, nargs=1)
@click.option('--purpose', type=click.Choice(['NERD', 'OCR'], case_sensitive=False), default="NERD",
              help="Purpose of output tsv file. "
                   "\n\nNERD: NER/NED application/ground-truth creation. "
                   "\n\nOCR: OCR application/ground-truth creation. "
                   "\n\ndefault: NERD.")
@click.option('--image-url', type=str, default='http://empty')
@click.option('--ner-rest-endpoint', type=str, default=None,
              help="REST endpoint of sbb_ner service. See https://github.com/qurator-spk/sbb_ner for details. "
                   "Only applicable in case of NERD.")
@click.option('--ned-rest-endpoint', type=str, default=None,
              help="REST endpoint of sbb_ned service. See https://github.com/qurator-spk/sbb_ned for details. "
                   "Only applicable in case of NERD.")
@click.option('--noproxy', type=bool, is_flag=True, help='disable proxy. default: enabled.')
@click.option('--scale-factor', type=float, default=1.0, help='default: 1.0')
@click.option('--ned-threshold', type=float, default=None)
@click.option('--min-confidence', type=float, default=None)
@click.option('--max-confidence', type=float, default=None)
@click.option('--ned-priority', type=int, default=1)
def page2tsv_cli(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
                 noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority):
    return page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
                    noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority)

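For orientation, the new page2tsv() can also be driven directly from Python rather than through the CLI wrapper; a minimal sketch (the file names are illustrative, not part of this commit):

from qurator.tsvtools.cli import page2tsv

# Convert one PAGE-XML file to an OCR-purpose TSV. Positional arguments
# follow the signature above; NER/NED endpoints stay None for purpose='OCR'.
page2tsv('page0001.xml', 'page0001.tsv', 'OCR', 'http://empty',
         None, None,        # ner_rest_endpoint, ned_rest_endpoint
         True,              # noproxy
         1.0,               # scale_factor
         None, None, None,  # ned_threshold, min_confidence, max_confidence
         1)                 # ned_priority
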
qurator/tsvtools/ocr.py (new file)

@@ -0,0 +1,23 @@
import numpy as np
import pandas as pd


def get_conf_color(conf, min_conf, max_conf):
    conf = min_conf if conf < min_conf else conf
    conf = max_conf if conf > max_conf else conf

    interval_size = (max_conf - min_conf) / 2.0

    colors = np.array([[216, 108, 117], [216, 206, 108], [108, 216, 146]])
    colors = pd.DataFrame(colors, index=[0, 1, 2], columns=['R', 'G', 'B'])

    lower = np.floor((conf - min_conf) / interval_size)
    upper = np.ceil((conf - min_conf) / interval_size)
    pos = (conf - min_conf) / (2.0 * interval_size)

    col = (colors.loc[lower] * (1.0 - pos) + colors.loc[upper] * pos).astype(int)

    return '#{:02x}'.format(col.R) + '{:02x}'.format(col.G) + '{:02x}'.format(col.B)

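get_conf_color() clamps conf to [min_conf, max_conf] and blends linearly between red, yellow and green anchor colours. A quick sketch of the values it yields (the outputs follow from the formula above):

from qurator.tsvtools.ocr import get_conf_color

print(get_conf_color(0.5, 0.5, 0.9))  # '#d86c75' (red end of the scale)
print(get_conf_color(0.7, 0.5, 0.9))  # '#d8ce6c' (yellow midpoint)
print(get_conf_color(0.9, 0.5, 0.9))  # '#6cd892' (green end of the scale)
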
qurator/tsvtools/ocrd-tool.json (new file)

@@ -0,0 +1,46 @@
{
  "version": "0.0.1",
  "git_url": "https://github.com/qurator-spk/page2tsv",
  "tools": {
    "ocrd-neat-export": {
      "executable": "ocrd-neat-export",
      "description": "Convert PAGE-XML to neat-loadable TSV",
      "categories": [ "Format-Conversion" ],
      "steps": [ "format-conversion" ],
      "input_file_grp": ["INPUT"],
      "output_file_grp": ["OUTPUT"],
      "parameters": {
        "iiif_url_template": {
          "type": "string",
          "description": "URL template for lookup of images via IIIF based on {{ unique_identifier }}, {{ page_id }}, {{ page_no }} and {{ PPN }}. 'left', 'top', 'right', 'bottom', 'width', and 'height' are replaced by the neat JS.",
          "default": "https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ page_no }}/left,top,width,height/full/0/default.jpg"
        },
        "scale_filegrp": {
          "type": "string",
          "description": "If the OCR was run on images with a different resolution than the 'full' IIIF size, use the images in this file group to scale. Set to the empty string to disable.",
          "default": ""
        },
        "noproxy": {
          "type": "boolean",
          "description": "Disable proxy if set",
          "default": true
        }
      }
    },
    "ocrd-neat-import": {
      "executable": "ocrd-neat-import",
      "description": "Re-integrate TSV into PAGE-XML",
      "categories": [ "Format-Conversion" ],
      "steps": [ "format-conversion" ],
      "input_file_grp": ["PAGE-GRP", "TSV-GRP"],
      "output_file_grp": ["OUTPUT"],
      "parameters": {
        "keep_words": {
          "type": "boolean",
          "description": "After updating the line TextEquiv, remove (false) or keep (true) existing and probably inconsistent pc:Word",
          "default": false
        }
      }
    }
  }
}

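The iiif_url_template parameter is filled by plain string replacement in OcrdNeatExportProcessor (see ocrd_processors.py below); a sketch with invented identifiers:

from re import sub as re_sub

template = ('https://content.staatsbibliothek-berlin.de/dc/'
            '{{ PPN }}-{{ page_no }}/left,top,width,height/full/0/default.jpg')
page_id = 'PHYS_0005'  # invented page identifier
iiif_url = (template
            .replace('{{ PPN }}', 'PPN123456789X')  # invented PPN
            .replace('{{ page_no }}', re_sub('[^0-9]', '', page_id)))
# -> .../dc/PPN123456789X-0005/left,top,width,height/full/0/default.jpg
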
qurator/tsvtools/ocrd_cli.py (new file)

@@ -0,0 +1,14 @@
import click
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor

from .ocrd_processors import OcrdNeatExportProcessor, OcrdNeatImportProcessor


@click.command()
@ocrd_cli_options
def export_cli(*args, **kwargs):
    return ocrd_cli_wrap_processor(OcrdNeatExportProcessor, *args, **kwargs)


@click.command()
@ocrd_cli_options
def import_cli(*args, **kwargs):
    return ocrd_cli_wrap_processor(OcrdNeatImportProcessor, *args, **kwargs)

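export_cli and import_cli only become the ocrd-neat-export and ocrd-neat-import executables declared in ocrd-tool.json once they are registered as console scripts. The packaging itself is not part of this diff; assuming the CLI module lands at qurator/tsvtools/ocrd_cli.py, the wiring would presumably look like this hypothetical setup.py fragment:

# Hypothetical setup.py; this commit does not show the actual packaging changes.
from setuptools import setup, find_packages

setup(
    name='page2tsv',  # assumed distribution name
    packages=find_packages(),
    entry_points={
        'console_scripts': [
            'ocrd-neat-export=qurator.tsvtools.ocrd_cli:export_cli',
            'ocrd-neat-import=qurator.tsvtools.ocrd_cli:import_cli',
        ],
    },
)
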
qurator/tsvtools/ocrd_processors.py (new file)

@@ -0,0 +1,107 @@
from json import loads
from pathlib import Path
from pkg_resources import resource_string
from re import sub as re_sub

import pandas as pd
from PIL import Image

from ocrd import Processor
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality, MIMETYPE_PAGE
from ocrd_models import OcrdExif
from ocrd_models.constants import NAMESPACES as NS
from ocrd_models.ocrd_page import TextEquivType, to_xml
from ocrd_modelfactory import page_from_file

from .cli import page2tsv

OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json'))


class OcrdNeatExportProcessor(Processor):

    def __init__(self, *args, **kwargs):
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-neat-export']
        kwargs['version'] = OCRD_TOOL['version']
        super().__init__(*args, **kwargs)

    def process(self):
        """
        Convert PAGE-XML to TSV loadable by the neat GT editor.
        """
        log = getLogger('ocrd_neat.export')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        iiif_url_template = self.parameter['iiif_url_template']
        noproxy = self.parameter['noproxy']

        ppn_found = self.workspace.mets._tree.find('//mods:recordIdentifier[@source="gbv-ppn"]', NS)
        if ppn_found is not None:
            ppn = ppn_found.text
        else:
            ppn = ''

        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info('Processing: %d / %s of %d', n, page_id, len(list(self.input_files)))
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            iiif_url = iiif_url_template\
                .replace('{{ unique_identifier }}', self.workspace.mets.unique_identifier)\
                .replace('{{ PPN }}', ppn)\
                .replace('{{ page_id }}', page_id)\
                .replace('{{ page_no }}', re_sub('[^0-9]', '', page_id))
            Path(self.output_file_grp).mkdir(exist_ok=True)
            tsv_filepath = Path(self.output_file_grp, file_id + '.tsv')
            page2tsv(input_file.local_filename, tsv_filepath, 'OCR', iiif_url,
                     None, None, noproxy, 1.0, None, None, None, 1)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype='text/tab-separated-values',
                local_filename=str(tsv_filepath))


class OcrdNeatImportProcessor(Processor):

    def __init__(self, *args, **kwargs):
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-neat-import']
        kwargs['version'] = OCRD_TOOL['version']
        super().__init__(*args, **kwargs)

    def process(self):
        """
        Merge neat TSV results back into PAGE-XML.
        """
        log = getLogger('ocrd_neat.import')
        assert_file_grp_cardinality(self.input_file_grp, 2)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        keep_words = self.parameter['keep_words']

        for n, (page_in_file, tsv_file) in enumerate(self.zip_input_files()):
            page_id = page_in_file.pageId or page_in_file.ID
            log.info('Processing: %d / %s of %d', n, page_id, len(list(self.zip_input_files())))
            file_id = make_file_id(page_in_file, self.output_file_grp)
            pcgts = page_from_file(self.workspace.download_file(page_in_file))
            page = pcgts.get_Page()
            tsv = pd.read_csv(tsv_file.local_filename, sep='\t', comment='#', quoting=3)
            id_to_text = {}
            for _, row in tsv.iterrows():
                if str(row.TEXT).strip():
                    id_to_text[row.line_id] = row.TEXT
            for textline in page.get_AllTextLines():
                if textline.id in id_to_text:
                    textline.set_TextEquiv([TextEquivType(Unicode=id_to_text[textline.id])])
                    if not keep_words:
                        textline.set_Word([])
            self.add_metadata(pcgts)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype=MIMETYPE_PAGE,
                local_filename="%s/%s.xml" % (self.output_file_grp, file_id),
                content=to_xml(pcgts))
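
For reference, an OCR-purpose TSV as produced by the export processor and consumed by the import processor carries the columns defined by out_columns in cli.py. A sketch with one invented row (real files are additionally prefixed with '# <image_url>' comment lines):

import numpy as np
import pandas as pd

# One invented line entry: the recognized TEXT plus layout metadata keyed by line_id.
row = ('Berlin, den 8. November 1922', 0, 102, 845, 210, 248, np.nan, 'line_0001')
tsv = pd.DataFrame([row], columns=['TEXT', 'url_id', 'left', 'right',
                                   'top', 'bottom', 'conf', 'line_id'])
tsv.to_csv('sample.tsv', sep='\t', quoting=3, index=False)  # quoting=3 is csv.QUOTE_NONE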