mirror of
https://github.com/qurator-spk/page2tsv.git
synced 2025-06-15 22:39:54 +02:00
drop support for scaling, not necessary for SBB use case anymore
This commit is contained in:
parent
fe4a1eabb1
commit
60a07c6310
5 changed files with 31 additions and 19 deletions
0
.gitmodules
vendored
Normal file
0
.gitmodules
vendored
Normal file
17
Makefile
Normal file
17
Makefile
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
deps:
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
deps-test:
|
||||||
|
pip install -r requirements-test.txt
|
||||||
|
|
||||||
|
test:
|
||||||
|
pytest tests
|
||||||
|
|
||||||
|
install:
|
||||||
|
pip install .
|
||||||
|
|
||||||
|
install-dev:
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
.PHONY: test
|
||||||
|
|
1
requirements-test.txt
Normal file
1
requirements-test.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
pytest
|
|
@ -12,8 +12,8 @@
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"iiif_url_template": {
|
"iiif_url_template": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "URL template for lookup of images via IIIF based on {{ unique_identifier }}, {{ page_id }}, {{ page_no }} and {{ image_width }}, or {{ PPN }}. 'left', 'top', 'right', 'bottom', 'width', and 'height' are replaced by the neat JS.",
|
"description": "URL template for lookup of images via IIIF based on {{ unique_identifier }}, {{ page_id }}, {{ page_no }} and {{ PPN }}. 'left', 'top', 'right', 'bottom', 'width', and 'height' are replaced by the neat JS.",
|
||||||
"default": "https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ page_no }}/left,top,width,height/{{ image_width }}/0/default.jpg"
|
"default": "https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ page_no }}/left,top,width,height/full/0/default.jpg"
|
||||||
},
|
},
|
||||||
"scale_filegrp": {
|
"scale_filegrp": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
|
|
|
@ -7,8 +7,8 @@ import pandas as pd
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from ocrd import Processor
|
from ocrd import Processor
|
||||||
from ocrd_models import OcrdExif
|
|
||||||
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality, MIMETYPE_PAGE
|
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality, MIMETYPE_PAGE
|
||||||
|
from ocrd_models import OcrdExif
|
||||||
from ocrd_models.constants import NAMESPACES as NS
|
from ocrd_models.constants import NAMESPACES as NS
|
||||||
from ocrd_models.ocrd_page import TextEquivType, to_xml
|
from ocrd_models.ocrd_page import TextEquivType, to_xml
|
||||||
from ocrd_modelfactory import page_from_file
|
from ocrd_modelfactory import page_from_file
|
||||||
|
@ -32,35 +32,29 @@ class OcrdNeatExportProcessor(Processor):
|
||||||
assert_file_grp_cardinality(self.input_file_grp, 1)
|
assert_file_grp_cardinality(self.input_file_grp, 1)
|
||||||
assert_file_grp_cardinality(self.output_file_grp, 1)
|
assert_file_grp_cardinality(self.output_file_grp, 1)
|
||||||
iiif_url_template = self.parameter['iiif_url_template']
|
iiif_url_template = self.parameter['iiif_url_template']
|
||||||
scale_filegrp = self.parameter['scale_filegrp']
|
|
||||||
noproxy = self.parameter['noproxy']
|
noproxy = self.parameter['noproxy']
|
||||||
|
|
||||||
|
ppn_found = self.workspace.mets._tree.find('//mods:recordIdentifier[@source="gbv-ppn"]', NS)
|
||||||
|
print(ppn_found)
|
||||||
|
if ppn_found is not None:
|
||||||
|
ppn = ppn_found.text
|
||||||
|
else:
|
||||||
|
ppn = ''
|
||||||
for n, input_file in enumerate(self.input_files):
|
for n, input_file in enumerate(self.input_files):
|
||||||
page_id = input_file.pageId or input_file.ID
|
page_id = input_file.pageId or input_file.ID
|
||||||
log.info('Processing: %d / %s of %d', n, page_id, len(list(self.input_files)))
|
log.info('Processing: %d / %s of %d', n, page_id, len(list(self.input_files)))
|
||||||
file_id = make_file_id(input_file, self.output_file_grp)
|
file_id = make_file_id(input_file, self.output_file_grp)
|
||||||
pcgts = page_from_file(self.workspace.download_file(input_file))
|
pcgts = page_from_file(self.workspace.download_file(input_file))
|
||||||
page = pcgts.get_Page()
|
page = pcgts.get_Page()
|
||||||
scale_factor = 1.0
|
|
||||||
iiif_width = f',{page.imageHeight}'
|
|
||||||
ppn = self.workspace.mets.unique_identifier
|
|
||||||
el_recordIdentifier = self.workspace.mets._tree.getroot().find(".//mods:recordIdentifier[@source='gbv-ppn']", NS)
|
|
||||||
if el_recordIdentifier is not None:
|
|
||||||
ppn = el_recordIdentifier.text
|
|
||||||
if scale_filegrp:
|
|
||||||
scaled_img_ocrd_file = self.workspace.download_file(next(
|
|
||||||
self.workspace.mets.find_files(fileGrp=scale_filegrp, pageId=page_id)))
|
|
||||||
scaled_img_pil = Image.open(scaled_img_ocrd_file.local_filename)
|
|
||||||
scale_factor = scaled_img_pil.width / page.imageWidth
|
|
||||||
iiif_width = 'full'
|
|
||||||
iiif_url = iiif_url_template\
|
iiif_url = iiif_url_template\
|
||||||
.replace('{{ unique_identifier }}', self.workspace.mets.unique_identifier)\
|
.replace('{{ unique_identifier }}', self.workspace.mets.unique_identifier)\
|
||||||
.replace('{{ PPN }}', ppn)\
|
.replace('{{ PPN }}', ppn)\
|
||||||
.replace('{{ page_id }}', page_id)\
|
.replace('{{ page_id }}', page_id)\
|
||||||
.replace('{{ page_no }}', re_sub('[^0-9]', '', page_id))\
|
.replace('{{ page_no }}', re_sub('[^0-9]', '', page_id))
|
||||||
.replace('{{ image_width }}', str(iiif_width))
|
|
||||||
Path(self.output_file_grp).mkdir(exist_ok=True)
|
Path(self.output_file_grp).mkdir(exist_ok=True)
|
||||||
tsv_filepath = Path(self.output_file_grp, file_id + '.tsv')
|
tsv_filepath = Path(self.output_file_grp, file_id + '.tsv')
|
||||||
page2tsv(input_file.local_filename, tsv_filepath, 'OCR', iiif_url, None, None, noproxy, scale_factor, None, None, None, 1)
|
page2tsv(input_file.local_filename, tsv_filepath, 'OCR', iiif_url, None, None, noproxy, 1.0, None, None, None, 1)
|
||||||
|
|
||||||
self.workspace.add_file(
|
self.workspace.add_file(
|
||||||
ID=file_id,
|
ID=file_id,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue