mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-07 19:05:13 +02:00
✨ dinglehopper: Validate read segment ids
This commit is contained in:
parent
c9109999db
commit
e972328e51
2 changed files with 9 additions and 3 deletions
|
@ -26,7 +26,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
||||||
# Set Bootstrap tooltip to the segment id
|
# Set Bootstrap tooltip to the segment id
|
||||||
if id_:
|
if id_:
|
||||||
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
|
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
|
||||||
# XXX must sanitize id_ or do we trust the XML?
|
|
||||||
|
|
||||||
if css_classes:
|
if css_classes:
|
||||||
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
|
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
|
||||||
|
|
|
@ -11,6 +11,7 @@ import sys
|
||||||
import attr
|
import attr
|
||||||
import enum
|
import enum
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
@attr.s(frozen=True)
|
@attr.s(frozen=True)
|
||||||
|
@ -30,7 +31,7 @@ class ExtractedText:
|
||||||
if not self._segment_id_for_pos:
|
if not self._segment_id_for_pos:
|
||||||
segment_id_for_pos = []
|
segment_id_for_pos = []
|
||||||
for s in self.segments:
|
for s in self.segments:
|
||||||
segment_id_for_pos.extend(repeat(s.id, len(s.text)))
|
segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
|
||||||
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
|
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
|
||||||
# This is frozen, so we have to jump through the hoop:
|
# This is frozen, so we have to jump through the hoop:
|
||||||
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
|
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
|
||||||
|
@ -62,7 +63,13 @@ normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB)
|
||||||
|
|
||||||
@attr.s(frozen=True)
|
@attr.s(frozen=True)
|
||||||
class ExtractedTextSegment:
|
class ExtractedTextSegment:
|
||||||
id = attr.ib(type=str)
|
segment_id = attr.ib(type=str)
|
||||||
|
@segment_id.validator
|
||||||
|
def check(self, attribute, value):
    """Validate a segment id.

    ``None`` is accepted (no id). Any other value must consist *entirely*
    of word characters, dots and hyphens.

    The whole string is checked (``re.fullmatch``), not just a prefix:
    with ``re.match`` an id such as ``a"><script>`` passed validation
    because its first character matched. Segment ids appear to be
    interpolated unescaped into an HTML ``title`` attribute by the diff
    report, so a prefix-only check is not sufficient.

    ``.`` is allowed because it is a legal character in XML ids (NCName)
    and such ids were accepted by the previous prefix-only check.

    :param attribute: the attrs attribute being validated (unused)
    :param value: the segment id to validate, or ``None``
    :raises ValueError: if ``value`` is not ``None`` and is malformed
    """
    if value is None:
        return
    # \w already covers digits and the underscore.
    if not re.fullmatch(r'[\w.-]+', value):
        raise ValueError('Malformed segment id "{}"'.format(value))
|
||||||
text = attr.ib(type=str)
|
text = attr.ib(type=str)
|
||||||
@text.validator
|
@text.validator
|
||||||
def check(self, attribute, value):
|
def check(self, attribute, value):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue