1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-06-07 19:05:13 +02:00

dinglehopper: Validate read segment ids

This commit is contained in:
Gerber, Mike 2020-06-12 20:43:25 +02:00
parent c9109999db
commit e972328e51
2 changed files with 9 additions and 3 deletions

View file

@ -26,7 +26,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
# Set Bootstrap tooltip to the segment id # Set Bootstrap tooltip to the segment id
if id_: if id_:
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_) html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
# XXX must sanitize id_ or do we trust the XML?
if css_classes: if css_classes:
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs) return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)

View file

@ -11,6 +11,7 @@ import sys
import attr import attr
import enum import enum
import unicodedata import unicodedata
import re
@attr.s(frozen=True) @attr.s(frozen=True)
@ -30,7 +31,7 @@ class ExtractedText:
if not self._segment_id_for_pos: if not self._segment_id_for_pos:
segment_id_for_pos = [] segment_id_for_pos = []
for s in self.segments: for s in self.segments:
segment_id_for_pos.extend(repeat(s.id, len(s.text))) segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
segment_id_for_pos.extend(repeat(None, len(self.joiner))) segment_id_for_pos.extend(repeat(None, len(self.joiner)))
# This is frozen, so we have to jump through the hoop: # This is frozen, so we have to jump through the hoop:
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
@ -62,7 +63,13 @@ normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB)
@attr.s(frozen=True) @attr.s(frozen=True)
class ExtractedTextSegment: class ExtractedTextSegment:
id = attr.ib(type=str) segment_id = attr.ib(type=str)
@segment_id.validator
def check(self, attribute, value):
    """Validate a segment id, rejecting anything but a plain identifier.

    Used as the attrs validator for ``segment_id``. ``None`` is accepted
    (a segment may have no id). Any other value must consist *entirely* of
    word characters, digits, underscores and hyphens.

    Args:
        attribute: The attrs attribute being validated (unused).
        value: The candidate segment id, or ``None``.

    Raises:
        ValueError: If ``value`` is not ``None`` and contains any character
            outside the allowed set, or is empty.
    """
    if value is None:
        return
    # fullmatch, not match: match() only anchors at the start, so an id with
    # a valid first character followed by arbitrary text (quotes, markup, ...)
    # would slip through. The whole id has to be well-formed.
    if not re.fullmatch(r'[\w\d_-]+', value):
        raise ValueError('Malformed segment id "{}"'.format(value))
text = attr.ib(type=str) text = attr.ib(type=str)
@text.validator @text.validator
def check(self, attribute, value): def check(self, attribute, value):