From b90d1fe56accecb475ff5e64513040896713da28 Mon Sep 17 00:00:00 2001 From: Kai Labusch Date: Tue, 24 Feb 2026 10:49:52 +0100 Subject: [PATCH] add support for image url in page meta data --- qurator/tsvtools/cli.py | 8 ++++++++ requirements.txt | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/qurator/tsvtools/cli.py b/qurator/tsvtools/cli.py index 8706706..410e7d7 100644 --- a/qurator/tsvtools/cli.py +++ b/qurator/tsvtools/cli.py @@ -403,6 +403,14 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, tsv = [] line_info = [] + if image_url == 'http://empty': + creator = pcgts.Metadata.get_Creator() + + neat_url_ma = re.match(r"(.*)\|NEAT_URL:(.*?)\|.*", creator) + + if neat_url_ma: + image_url = neat_url_ma.group(2) + _unicode_normalize = unicode_normalize if normalization_file is not None: diff --git a/requirements.txt b/requirements.txt index 4ceb92e..984ccf2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ ocrd >= 2.23.2 pandas -qurator-sbb-utils @ git+https://github.com/qurator-spk/sbb_utils.git +qurator-sbb-utils @ git+https://github.com/qurator-spk/sbb_utils.git \ No newline at end of file