implement page2tsv/tsv2page as ocrd-neat-{ex,im}port
parent
0aee20a7f6
commit
aeb67e445f
@ -0,0 +1 @@
|
||||
tsvtools/ocrd-tool.json
|
@ -0,0 +1,46 @@
|
||||
{
|
||||
"version": "0.0.1",
|
||||
"git_url": "https://github.com/qurator-spk/page2tsv",
|
||||
"tools": {
|
||||
"ocrd-neat-export": {
|
||||
"executable": "ocrd-neat-export",
|
||||
"description": "Convert PAGE-XML to neat-loadable TSV",
|
||||
"categories": [ "Format-Conversion" ],
|
||||
"steps": [ "format-conversion" ],
|
||||
"input_file_grp": ["INPUT"],
|
||||
"output_file_grp": ["OUTPUT"],
|
||||
"parameters": {
|
||||
"iiif_url_template": {
|
||||
"type": "string",
|
||||
"description": "URL template for lookup of images via IIIF based on {{ unique_identifier }}, {{ page_id }}, {{ page_no }} and {{ image_width }}, or {{ PPN }}. 'left', 'top', 'right', 'bottom', 'width', and 'height' are replaced by the neat JS.",
|
||||
"default": "https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ page_no }}/left,top,width,height/{{ image_width }}/0/default.jpg"
|
||||
},
|
||||
"scale_filegrp": {
|
||||
"type": "string",
|
||||
"description": "If the OCR was run on images with a different resolution thant the 'full' IIIF size, use the images in this file group to scale. Set to empty string to disable",
|
||||
"default": ""
|
||||
},
|
||||
"noproxy": {
|
||||
"type": "boolean",
|
||||
"description": "Disable proxy if set",
|
||||
"default": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"ocrd-neat-import": {
|
||||
"executable": "ocrd-neat-export",
|
||||
"description": "Re-integrate TSV into PAGE-XML",
|
||||
"categories": [ "Format-Conversion" ],
|
||||
"steps": [ "format-conversion" ],
|
||||
"input_file_grp": ["PAGE-GRP,TSV-GRP"],
|
||||
"output_file_grp": ["OUTPUT"],
|
||||
"parameters": {
|
||||
"keep_words": {
|
||||
"type": "boolean",
|
||||
"description": "After updating the line TextEquiv, remove (false) or keep (true) existing and probably inconsistent pc:Word",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,14 @@
|
||||
import click
|
||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
||||
|
||||
from .ocrd_processors import OcrdNeatExportProcessor, OcrdNeatImportProcessor
|
||||
|
||||
@click.command()
|
||||
@ocrd_cli_options
|
||||
def export_cli(*args, **kwargs):
|
||||
return ocrd_cli_wrap_processor(OcrdNeatExportProcessor, *args, **kwargs)
|
||||
|
||||
@click.command()
|
||||
@ocrd_cli_options
|
||||
def import_cli(*args, **kwargs):
|
||||
return ocrd_cli_wrap_processor(OcrdNeatImportProcessor, *args, **kwargs)
|
@ -0,0 +1,114 @@
|
||||
from json import loads
|
||||
from pathlib import Path
|
||||
from pkg_resources import resource_string
|
||||
from re import sub as re_sub
|
||||
|
||||
import pandas as pd
|
||||
from PIL import Image
|
||||
|
||||
from ocrd import Processor
|
||||
from ocrd_models import OcrdExif
|
||||
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality, MIMETYPE_PAGE
|
||||
from ocrd_models.constants import NAMESPACES as NS
|
||||
from ocrd_models.ocrd_page import TextEquivType, to_xml
|
||||
from ocrd_modelfactory import page_from_file
|
||||
|
||||
from .cli import page2tsv
|
||||
|
||||
OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json'))
|
||||
|
||||
class OcrdNeatExportProcessor(Processor):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-neat-export']
|
||||
kwargs['version'] = OCRD_TOOL['version']
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def process(self):
|
||||
"""
|
||||
Convert PAGE-XML to TSV loadable by the neat GT editor.
|
||||
"""
|
||||
log = getLogger('ocrd_neat.export')
|
||||
assert_file_grp_cardinality(self.input_file_grp, 1)
|
||||
assert_file_grp_cardinality(self.output_file_grp, 1)
|
||||
iiif_url_template = self.parameter['iiif_url_template']
|
||||
scale_filegrp = self.parameter['scale_filegrp']
|
||||
noproxy = self.parameter['noproxy']
|
||||
for n, input_file in enumerate(self.input_files):
|
||||
page_id = input_file.pageId or input_file.ID
|
||||
log.info('Processing: %d / %s of %d', n, page_id, len(list(self.input_files)))
|
||||
file_id = make_file_id(input_file, self.output_file_grp)
|
||||
pcgts = page_from_file(self.workspace.download_file(input_file))
|
||||
page = pcgts.get_Page()
|
||||
scale_factor = 1.0
|
||||
iiif_width = f',{page.imageHeight}'
|
||||
ppn = self.workspace.mets.unique_identifier
|
||||
el_recordIdentifier = self.workspace.mets._tree.getroot().find(".//mods:recordIdentifier[@source='gbv-ppn']", NS)
|
||||
if el_recordIdentifier is not None:
|
||||
ppn = el_recordIdentifier.text
|
||||
if scale_filegrp:
|
||||
scaled_img_ocrd_file = self.workspace.download_file(next(
|
||||
self.workspace.mets.find_files(fileGrp=scale_filegrp, pageId=page_id)))
|
||||
scaled_img_pil = Image.open(scaled_img_ocrd_file.local_filename)
|
||||
scale_factor = scaled_img_pil.width / page.imageWidth
|
||||
iiif_width = 'full'
|
||||
iiif_url = iiif_url_template\
|
||||
.replace('{{ unique_identifier }}', self.workspace.mets.unique_identifier)\
|
||||
.replace('{{ PPN }}', ppn)\
|
||||
.replace('{{ page_id }}', page_id)\
|
||||
.replace('{{ page_no }}', re_sub('[^0-9]', '', page_id))\
|
||||
.replace('{{ image_width }}', str(iiif_width))
|
||||
Path(self.output_file_grp).mkdir(exist_ok=True)
|
||||
tsv_filepath = Path(self.output_file_grp, file_id + '.tsv')
|
||||
page2tsv(input_file.local_filename, tsv_filepath, 'OCR', iiif_url, None, None, noproxy, scale_factor, None, None, None, 1)
|
||||
|
||||
self.workspace.add_file(
|
||||
ID=file_id,
|
||||
file_grp=self.output_file_grp,
|
||||
pageId=page_id,
|
||||
mimetype='text/tab-separated-values',
|
||||
local_filename=str(tsv_filepath))
|
||||
|
||||
class OcrdNeatImportProcessor(Processor):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-neat-import']
|
||||
kwargs['version'] = OCRD_TOOL['version']
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def process(self):
|
||||
"""
|
||||
Merge neat TSV results back into PAGE-XML.
|
||||
"""
|
||||
log = getLogger('ocrd_neat.import')
|
||||
assert_file_grp_cardinality(self.input_file_grp, 2)
|
||||
assert_file_grp_cardinality(self.output_file_grp, 1)
|
||||
keep_words = self.parameter['keep_words']
|
||||
for n, (page_in_file, tsv_file) in enumerate(self.zip_input_files()):
|
||||
page_id = page_in_file.pageId or page_in_file.ID
|
||||
log.info('Processing: %d / %s of %d', n, page_id, len(list(self.zip_input_files())))
|
||||
file_id = make_file_id(page_in_file, self.output_file_grp)
|
||||
pcgts = page_from_file(self.workspace.download_file(page_in_file))
|
||||
page = pcgts.get_Page()
|
||||
|
||||
tsv = pd.read_csv(tsv_file.local_filename, sep='\t', comment='#', quoting=3)
|
||||
id_to_text = {}
|
||||
for _, row in tsv.iterrows():
|
||||
if str(row.TEXT).strip():
|
||||
id_to_text[row.line_id] = row.TEXT
|
||||
for textline in page.get_AllTextLines():
|
||||
if textline.id in id_to_text:
|
||||
textline.set_TextEquiv([TextEquivType(Unicode=id_to_text[textline.id])])
|
||||
if not keep_words:
|
||||
textline.set_Word([])
|
||||
|
||||
self.add_metadata(pcgts)
|
||||
pcgts.set_pcGtsId(file_id)
|
||||
self.workspace.add_file(
|
||||
ID=file_id,
|
||||
file_grp=self.output_file_grp,
|
||||
pageId=page_id,
|
||||
mimetype=MIMETYPE_PAGE,
|
||||
local_filename="%s/%s.xml" % (self.output_file_grp, file_id),
|
||||
content=to_xml(pcgts)
|
||||
)
|
Loading…
Reference in New Issue