From d4848100388a5d24983286ef23dca285c78d89f6 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Fri, 12 Jun 2020 20:43:25 +0200
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20dinglehopper:=20Validate=20read=20s?=
 =?UTF-8?q?egment=20ids?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this commentary area is not part of the commit):

* The segment id validator uses re.fullmatch() instead of re.match().
  re.match() only anchors at the start of the string, so an id such as
  'a" onclick="..."' passed validation and was then interpolated into the
  tooltip's title="..." attribute in cli.py. This patch also drops the
  "must sanitize id_" reminder there, so the validator has to hold.
* Ids containing any character outside [\w\d_-] are now rejected with
  ValueError instead of being accepted on a valid prefix.
* Leading whitespace in the hunks below was reconstructed from the Python
  structure -- TODO confirm against the target tree before applying.

 qurator/dinglehopper/cli.py       |  1 -
 qurator/dinglehopper/ocr_files.py | 11 +++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 2889e46..9c963c1 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -26,7 +26,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
         # Set Bootstrap tooltip to the segment id
         if id_:
             html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
-            # XXX must sanitize id_ or do we trust the XML?
 
         if css_classes:
             return '{html_t}'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 1652b71..d3918d1 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -11,6 +11,7 @@ import sys
 import attr
 import enum
 import unicodedata
+import re
 
 
 @attr.s(frozen=True)
@@ -30,7 +31,7 @@ class ExtractedText:
         if not self._segment_id_for_pos:
             segment_id_for_pos = []
             for s in self.segments:
-                segment_id_for_pos.extend(repeat(s.id, len(s.text)))
+                segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
                 segment_id_for_pos.extend(repeat(None, len(self.joiner)))
             # This is frozen, so we have to jump through the hoop:
             object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
@@ -62,7 +63,13 @@ normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB)
 
 @attr.s(frozen=True)
 class ExtractedTextSegment:
-    id = attr.ib(type=str)
+    segment_id = attr.ib(type=str)
+    @segment_id.validator
+    def check(self, attribute, value):
+        if value is None:
+            return
+        if not re.fullmatch(r'[\w\d_-]+', value):
+            raise ValueError('Malformed segment id "{}"'.format(value))
     text = attr.ib(type=str)
     @text.validator
     def check(self, attribute, value):