diff --git a/ocrd-tool.json b/ocrd-tool.json
new file mode 120000
index 0000000..03f3104
--- /dev/null
+++ b/ocrd-tool.json
@@ -0,0 +1 @@
+tsvtools/ocrd-tool.json
\ No newline at end of file
diff --git a/tsvtools/ocrd-tool.json b/tsvtools/ocrd-tool.json
new file mode 100644
index 0000000..28eca7c
--- /dev/null
+++ b/tsvtools/ocrd-tool.json
@@ -0,0 +1,46 @@
+{
+    "version": "0.0.1",
+    "git_url": "https://github.com/qurator-spk/page2tsv",
+    "tools": {
+        "ocrd-neat-export": {
+            "executable": "ocrd-neat-export",
+            "description": "Convert PAGE-XML to neat-loadable TSV",
+            "categories": [ "Format-Conversion" ],
+            "steps": [ "format-conversion" ],
+            "input_file_grp": ["INPUT"],
+            "output_file_grp": ["OUTPUT"],
+            "parameters": {
+                "iiif_url_template": {
+                    "type": "string",
+                    "description": "URL template for image lookup via IIIF. The placeholders {{ unique_identifier }}, {{ PPN }}, {{ page_id }}, {{ page_no }} and {{ image_width }} are filled in by the processor; 'left', 'top', 'right', 'bottom', 'width' and 'height' are replaced by the neat JS.",
+                    "default": "https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ page_no }}/left,top,width,height/{{ image_width }}/0/default.jpg"
+                },
+                "scale_filegrp": {
+                    "type": "string",
+                    "description": "If the OCR was run on images with a different resolution than the 'full' IIIF size, use the images in this file group to determine the scale factor. Set to the empty string to disable scaling.",
+                    "default": ""
+                },
+                "noproxy": {
+                    "type": "boolean",
+                    "description": "Disable proxy if set",
+                    "default": true
+                }
+            }
+        },
+        "ocrd-neat-import": {
+            "executable": "ocrd-neat-import",
+            "description": "Re-integrate TSV into PAGE-XML",
+            "categories": [ "Format-Conversion" ],
+            "steps": [ "format-conversion" ],
+            "input_file_grp": ["PAGE-GRP", "TSV-GRP"],
+            "output_file_grp": ["OUTPUT"],
+            "parameters": {
+                "keep_words": {
+                    "type": "boolean",
+                    "description": "After updating the line TextEquiv, remove (false) or keep (true) the existing, possibly inconsistent pc:Word elements",
+                    "default": false
+                }
+            }
+        }
+    }
+}
diff --git a/tsvtools/ocrd_cli.py b/tsvtools/ocrd_cli.py
new file mode 100644
index 0000000..38616aa
--- /dev/null
+++ b/tsvtools/ocrd_cli.py
@@ -0,0 +1,14 @@
+import click
+from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
+
+from .ocrd_processors import OcrdNeatExportProcessor, OcrdNeatImportProcessor
+
+@click.command()
+@ocrd_cli_options
+def export_cli(*args, **kwargs):
+    return ocrd_cli_wrap_processor(OcrdNeatExportProcessor, *args, **kwargs)
+
+@click.command()
+@ocrd_cli_options
+def import_cli(*args, **kwargs):
+    return ocrd_cli_wrap_processor(OcrdNeatImportProcessor, *args, **kwargs)
diff --git a/tsvtools/ocrd_processors.py b/tsvtools/ocrd_processors.py
new file mode 100644
index 0000000..bede756
--- /dev/null
+++ b/tsvtools/ocrd_processors.py
@@ -0,0 +1,114 @@
+from json import loads
+from pathlib import Path
+from pkg_resources import resource_string
+from re import sub as re_sub
+
+import pandas as pd
+from PIL import Image
+
+from ocrd import Processor
+from ocrd_models import OcrdExif
+from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality, MIMETYPE_PAGE
+from ocrd_models.constants import NAMESPACES as NS
+from ocrd_models.ocrd_page import TextEquivType, to_xml
+from ocrd_modelfactory import page_from_file
+
+from .cli import page2tsv
+
+OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json'))
+
+class OcrdNeatExportProcessor(Processor):
+
+    def __init__(self, *args, **kwargs):
+        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-neat-export']
+        kwargs['version'] = OCRD_TOOL['version']
+        super().__init__(*args, **kwargs)
+
+    def process(self):
+        """
+        Convert PAGE-XML to TSV loadable by the neat GT editor.
+        """
+        log = getLogger('ocrd_neat.export')
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+        iiif_url_template = self.parameter['iiif_url_template']
+        scale_filegrp = self.parameter['scale_filegrp']
+        noproxy = self.parameter['noproxy']
+        for n, input_file in enumerate(self.input_files):
+            page_id = input_file.pageId or input_file.ID
+            log.info('Processing: %d / %s of %d', n, page_id, len(list(self.input_files)))
+            file_id = make_file_id(input_file, self.output_file_grp)
+            pcgts = page_from_file(self.workspace.download_file(input_file))
+            page = pcgts.get_Page()
+            scale_factor = 1.0
+            iiif_width = f',{page.imageHeight}'
+            ppn = self.workspace.mets.unique_identifier
+            el_recordIdentifier = self.workspace.mets._tree.getroot().find(".//mods:recordIdentifier[@source='gbv-ppn']", NS)
+            if el_recordIdentifier is not None:
+                ppn = el_recordIdentifier.text
+            if scale_filegrp:
+                scaled_img_ocrd_file = self.workspace.download_file(next(
+                    self.workspace.mets.find_files(fileGrp=scale_filegrp, pageId=page_id)))
+                scaled_img_pil = Image.open(scaled_img_ocrd_file.local_filename)
+                scale_factor = scaled_img_pil.width / page.imageWidth
+                iiif_width = 'full'
+            iiif_url = iiif_url_template\
+                .replace('{{ unique_identifier }}', self.workspace.mets.unique_identifier)\
+                .replace('{{ PPN }}', ppn)\
+                .replace('{{ page_id }}', page_id)\
+                .replace('{{ page_no }}', re_sub('[^0-9]', '', page_id))\
+                .replace('{{ image_width }}', str(iiif_width))
+            Path(self.output_file_grp).mkdir(exist_ok=True)
+            tsv_filepath = Path(self.output_file_grp, file_id + '.tsv')
+            page2tsv(input_file.local_filename, tsv_filepath, 'OCR', iiif_url, None, None, noproxy, scale_factor, None, None, None, 1)
+
+            self.workspace.add_file(
+                ID=file_id,
+                file_grp=self.output_file_grp,
+                pageId=page_id,
+                mimetype='text/tab-separated-values',
+                local_filename=str(tsv_filepath))
+
+class OcrdNeatImportProcessor(Processor):
+
+    def __init__(self, *args, **kwargs):
+        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-neat-import']
+        kwargs['version'] = OCRD_TOOL['version']
+        super().__init__(*args, **kwargs)
+
+    def process(self):
+        """
+        Merge neat TSV results back into PAGE-XML.
+        """
+        log = getLogger('ocrd_neat.import')
+        assert_file_grp_cardinality(self.input_file_grp, 2)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+        keep_words = self.parameter['keep_words']
+        for n, (page_in_file, tsv_file) in enumerate(self.zip_input_files()):
+            page_id = page_in_file.pageId or page_in_file.ID
+            log.info('Processing: %d / %s of %d', n, page_id, len(list(self.zip_input_files())))
+            file_id = make_file_id(page_in_file, self.output_file_grp)
+            pcgts = page_from_file(self.workspace.download_file(page_in_file))
+            page = pcgts.get_Page()
+
+            tsv = pd.read_csv(tsv_file.local_filename, sep='\t', comment='#', quoting=3)
+            id_to_text = {}
+            for _, row in tsv.iterrows():
+                if str(row.TEXT).strip():
+                    id_to_text[row.line_id] = row.TEXT
+            for textline in page.get_AllTextLines():
+                if textline.id in id_to_text:
+                    textline.set_TextEquiv([TextEquivType(Unicode=id_to_text[textline.id])])
+                    if not keep_words:
+                        textline.set_Word([])
+
+            self.add_metadata(pcgts)
+            pcgts.set_pcGtsId(file_id)
+            self.workspace.add_file(
+                ID=file_id,
+                file_grp=self.output_file_grp,
+                pageId=page_id,
+                mimetype=MIMETYPE_PAGE,
+                local_filename="%s/%s.xml" % (self.output_file_grp, file_id),
+                content=to_xml(pcgts)
+            )
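
Note on wiring: the diff declares the executables ocrd-neat-export and ocrd-neat-import in ocrd-tool.json and defines the click commands export_cli and import_cli, but it does not include the packaging change that maps one to the other. The following is a minimal sketch of what that registration could look like, assuming a setuptools-based setup.py and the distribution name tsvtools; none of it is part of the diff above.

# Hypothetical setup.py excerpt, NOT part of this diff: maps the executable
# names from ocrd-tool.json to the click commands in tsvtools/ocrd_cli.py.
from setuptools import setup, find_packages

setup(
    name='tsvtools',  # assumed distribution name
    packages=find_packages(),
    # ocrd-tool.json must ship inside the package so that
    # resource_string(__name__, 'ocrd-tool.json') can find it
    package_data={'tsvtools': ['ocrd-tool.json']},
    install_requires=['click', 'ocrd', 'pandas', 'Pillow'],
    entry_points={
        'console_scripts': [
            'ocrd-neat-export=tsvtools.ocrd_cli:export_cli',
            'ocrd-neat-import=tsvtools.ocrd_cli:import_cli',
        ],
    },
)

With entry points like these installed, the export step would run as an ordinary OCR-D processor inside a workspace, e.g. ocrd-neat-export -I OCR-D-OCR -O OCR-D-TSV (the file group names here are placeholders, not defaults from the diff).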
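
For the import direction, OcrdNeatImportProcessor only relies on a small part of the TSV: lines starting with '#' are skipped, and from the remaining tab-separated table it reads the line_id and TEXT columns, where line_id must match the @id of a pc:TextLine in the PAGE-XML. A minimal illustration of that contract (real files produced by page2tsv/neat contain additional columns):

# comment lines are ignored by the import
line_id	TEXT
r1l1	First corrected line
r1l2	Second corrected line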