diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..e69de29 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..bfe4f21 --- /dev/null +++ b/Makefile @@ -0,0 +1,17 @@ +deps: + pip install -r requirements.txt + +deps-test: + pip install -r requirements-test.txt + +test: + pytest tests + +install: + pip install . + +install-dev: + pip install -e . + +.PHONY: test + diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..e079f8a --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1 @@ +pytest diff --git a/tsvtools/ocrd-tool.json b/tsvtools/ocrd-tool.json index 28eca7c..7844abf 100644 --- a/tsvtools/ocrd-tool.json +++ b/tsvtools/ocrd-tool.json @@ -12,8 +12,8 @@ "parameters": { "iiif_url_template": { "type": "string", - "description": "URL template for lookup of images via IIIF based on {{ unique_identifier }}, {{ page_id }}, {{ page_no }} and {{ image_width }}, or {{ PPN }}. 'left', 'top', 'right', 'bottom', 'width', and 'height' are replaced by the neat JS.", - "default": "https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ page_no }}/left,top,width,height/{{ image_width }}/0/default.jpg" + "description": "URL template for lookup of images via IIIF based on {{ unique_identifier }}, {{ page_id }}, {{ page_no }} and {{ PPN }}. 'left', 'top', 'right', 'bottom', 'width', and 'height' are replaced by the neat JS.", + "default": "https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ page_no }}/left,top,width,height/full/0/default.jpg" }, "scale_filegrp": { "type": "string", diff --git a/tsvtools/ocrd_processors.py b/tsvtools/ocrd_processors.py index bede756..23abb42 100644 --- a/tsvtools/ocrd_processors.py +++ b/tsvtools/ocrd_processors.py @@ -7,8 +7,8 @@ import pandas as pd from PIL import Image from ocrd import Processor -from ocrd_models import OcrdExif from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality, MIMETYPE_PAGE +from ocrd_models import OcrdExif from ocrd_models.constants import NAMESPACES as NS from ocrd_models.ocrd_page import TextEquivType, to_xml from ocrd_modelfactory import page_from_file @@ -32,35 +32,29 @@ class OcrdNeatExportProcessor(Processor): assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) iiif_url_template = self.parameter['iiif_url_template'] - scale_filegrp = self.parameter['scale_filegrp'] noproxy = self.parameter['noproxy'] + + ppn_found = self.workspace.mets._tree.find('//mods:recordIdentifier[@source="gbv-ppn"]', NS) + print(ppn_found) + if ppn_found is not None: + ppn = ppn_found.text + else: + ppn = '' for n, input_file in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID log.info('Processing: %d / %s of %d', n, page_id, len(list(self.input_files))) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) page = pcgts.get_Page() - scale_factor = 1.0 - iiif_width = f',{page.imageHeight}' - ppn = self.workspace.mets.unique_identifier - el_recordIdentifier = self.workspace.mets._tree.getroot().find(".//mods:recordIdentifier[@source='gbv-ppn']", NS) - if el_recordIdentifier is not None: - ppn = el_recordIdentifier.text - if scale_filegrp: - scaled_img_ocrd_file = self.workspace.download_file(next( - self.workspace.mets.find_files(fileGrp=scale_filegrp, pageId=page_id))) - scaled_img_pil = Image.open(scaled_img_ocrd_file.local_filename) - scale_factor = scaled_img_pil.width / page.imageWidth - iiif_width = 'full' + iiif_url = iiif_url_template\ .replace('{{ unique_identifier }}', self.workspace.mets.unique_identifier)\ .replace('{{ PPN }}', ppn)\ .replace('{{ page_id }}', page_id)\ - .replace('{{ page_no }}', re_sub('[^0-9]', '', page_id))\ - .replace('{{ image_width }}', str(iiif_width)) + .replace('{{ page_no }}', re_sub('[^0-9]', '', page_id)) Path(self.output_file_grp).mkdir(exist_ok=True) tsv_filepath = Path(self.output_file_grp, file_id + '.tsv') - page2tsv(input_file.local_filename, tsv_filepath, 'OCR', iiif_url, None, None, noproxy, scale_factor, None, None, None, 1) + page2tsv(input_file.local_filename, tsv_filepath, 'OCR', iiif_url, None, None, noproxy, 1.0, None, None, None, 1) self.workspace.add_file( ID=file_id,