mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 03:40:12 +02:00
✨ dinglehopper: Validate read segment ids
This commit is contained in:
parent
d39f74f11a
commit
d484810038
2 changed files with 9 additions and 3 deletions
|
@ -26,7 +26,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
|||
# Set Bootstrap tooltip to the segment id
|
||||
if id_:
|
||||
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
|
||||
# XXX must sanitize id_ or do we trust the XML?
|
||||
|
||||
if css_classes:
|
||||
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
|
||||
|
|
|
@ -11,6 +11,7 @@ import sys
|
|||
import attr
|
||||
import enum
|
||||
import unicodedata
|
||||
import re
|
||||
|
||||
|
||||
@attr.s(frozen=True)
|
||||
|
@ -30,7 +31,7 @@ class ExtractedText:
|
|||
if not self._segment_id_for_pos:
|
||||
segment_id_for_pos = []
|
||||
for s in self.segments:
|
||||
segment_id_for_pos.extend(repeat(s.id, len(s.text)))
|
||||
segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
|
||||
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
|
||||
# This is frozen, so we have to jump through the hoop:
|
||||
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
|
||||
|
@ -62,7 +63,13 @@ normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB)
|
|||
|
||||
@attr.s(frozen=True)
|
||||
class ExtractedTextSegment:
|
||||
id = attr.ib(type=str)
|
||||
segment_id = attr.ib(type=str)
|
||||
@segment_id.validator
def check(self, attribute, value):
    """Validate the segment id read from the input document.

    A value of None is accepted unchanged. Any other value must consist
    *entirely* of word characters, digits, underscores and hyphens.

    Raises:
        ValueError: if the value contains any other character.
    """
    if value is None:
        return
    # re.match() only anchors at the start of the string, so a value such
    # as 'a" onclick="x' would pass on the strength of its first character.
    # re.fullmatch() requires the whole id to be well-formed.
    if not re.fullmatch(r'[\w\d_-]+', value):
        raise ValueError('Malformed segment id "{}"'.format(value))
|
||||
text = attr.ib(type=str)
|
||||
@text.validator
|
||||
def check(self, attribute, value):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue