dinglehopper: Validate read segment ids

pull/38/head
Gerber, Mike 4 years ago
parent d39f74f11a
commit d484810038

@ -26,7 +26,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
# Set Bootstrap tooltip to the segment id
if id_:
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
# XXX must sanitize id_ or do we trust the XML?
if css_classes:
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)

@ -11,6 +11,7 @@ import sys
import attr
import enum
import unicodedata
import re
@attr.s(frozen=True)
@ -30,7 +31,7 @@ class ExtractedText:
if not self._segment_id_for_pos:
segment_id_for_pos = []
for s in self.segments:
segment_id_for_pos.extend(repeat(s.id, len(s.text)))
segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
# This is frozen, so we have to jump through the hoop:
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
@ -62,7 +63,13 @@ normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB)
@attr.s(frozen=True)
class ExtractedTextSegment:
id = attr.ib(type=str)
segment_id = attr.ib(type=str)
@segment_id.validator
def check(self, attribute, value):
    """Validate the segment id of this segment.

    ``None`` is accepted as "no id". Any other value must consist
    entirely of word characters (letters, digits, underscore) or hyphens.

    :param attribute: the attrs attribute being validated (unused).
    :param value: the candidate segment id, or ``None``.
    :raises ValueError: if ``value`` contains any other character or is
        the empty string.
    """
    if value is None:
        return
    # re.match() only anchors at the start of the string, so it would accept
    # any id whose first character is valid (e.g. 'a" onclick="...').
    # fullmatch() requires the whole id to be made of allowed characters.
    if not re.fullmatch(r'[\w\d_-]+', value):
        raise ValueError('Malformed segment id "{}"'.format(value))
text = attr.ib(type=str)
@text.validator
def check(self, attribute, value):

Loading…
Cancel
Save