diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 2889e46..9c963c1 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -26,7 +26,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
         # Set Bootstrap tooltip to the segment id
         if id_:
             html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
-            # XXX must sanitize id_ or do we trust the XML?
 
         if css_classes:
             return '{html_t}'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 1652b71..d3918d1 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -11,6 +11,7 @@ import sys
 import attr
 import enum
 import unicodedata
+import re
 
 
 @attr.s(frozen=True)
@@ -30,7 +31,7 @@ class ExtractedText:
         if not self._segment_id_for_pos:
             segment_id_for_pos = []
             for s in self.segments:
-                segment_id_for_pos.extend(repeat(s.id, len(s.text)))
+                segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
                 segment_id_for_pos.extend(repeat(None, len(self.joiner)))
             # This is frozen, so we have to jump through the hoop:
             object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
@@ -62,7 +63,14 @@ normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB)
 
 @attr.s(frozen=True)
 class ExtractedTextSegment:
-    id = attr.ib(type=str)
+    segment_id = attr.ib(type=str)
+    @segment_id.validator
+    def check(self, attribute, value):
+        if value is None:
+            return
+        # fullmatch, not match: the id is written unescaped into an HTML title attribute, so all of it must be safe
+        if not re.fullmatch(r'[\w\d_-]+', value):
+            raise ValueError('Malformed segment id "{}"'.format(value))
     text = attr.ib(type=str)
     @text.validator
     def check(self, attribute, value):