Merge branch 'feat/display-segment-id'
commit
f50591abac
@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (dinglehopper)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (dinglehopper-github)" project-jdk-type="Python SDK" />
|
||||
</project>
|
@ -0,0 +1,118 @@
|
||||
import enum
|
||||
import re
|
||||
import unicodedata
|
||||
from contextlib import suppress
|
||||
from itertools import repeat
|
||||
from typing import Optional
|
||||
|
||||
import attr
|
||||
|
||||
from .substitute_equivalences import substitute_equivalences
|
||||
|
||||
|
||||
class Normalization(enum.Enum):
    """Text normalization schemes supported by :func:`normalize`."""
    NFC = 1
    NFC_MUFI = 2  # TODO: MUFI-aware normalization is not implemented yet
    NFC_SBB = 3
|
||||
|
||||
|
||||
def normalize(text, normalization):
    """Return *text* normalized according to *normalization*.

    :param text: the string to normalize
    :param normalization: a :class:`Normalization` member selecting the scheme
    :raises NotImplementedError: for ``Normalization.NFC_MUFI`` (not implemented)
    :raises ValueError: for any unknown normalization value
    """
    if normalization == Normalization.NFC:
        return unicodedata.normalize('NFC', text)
    if normalization == Normalization.NFC_MUFI:
        raise NotImplementedError()
    if normalization == Normalization.NFC_SBB:
        return substitute_equivalences(text)
    # Originally a bare ValueError() on a dangling else; include the offending
    # value so the failure is diagnosable.
    raise ValueError('Unknown normalization {!r}'.format(normalization))
|
||||
|
||||
|
||||
# XXX hack: convenience shortcut that hard-codes the SBB scheme.
def normalize_sbb(t):
    return normalize(t, Normalization.NFC_SBB)
|
||||
|
||||
|
||||
@attr.s(frozen=True)
class ExtractedText:
    """
    Extracted text

    Objects of this class are guaranteed to be a. always in their normalization and
    b. in NFC.
    """
    # NOTE: attribute definition order below fixes the attrs-generated
    # __init__ signature — do not reorder.

    # Identifier of the segment this text came from, or None.
    segment_id = attr.ib(type=Optional[str])

    @segment_id.validator
    def check(self, _, value):
        if value is None:
            return
        # NOTE(review): re.match only checks a prefix, so trailing junk after
        # a valid id would still pass — confirm whether that is intended.
        if not re.match(r'[\w\d_-]+', value):
            raise ValueError('Malformed segment id "{}"'.format(value))

    # An object contains either
    # a. _text itself
    # b. or segments (ExtractedText) and a joiner

    segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
    joiner = attr.ib(type=Optional[str])
    _text = attr.ib(type=Optional[str])

    @segments.validator
    def check(self, _, value):
        if value is not None and self._text is not None:
            raise ValueError("Can't have both segments and text")

    @_text.validator
    def check(self, _, value):
        if value is not None and self.segments is not None:
            raise ValueError("Can't have both segments and text")
        if value is not None and unicodedata.normalize('NFC', value) != value:
            raise ValueError('String "{}" is not in NFC.'.format(value))
        if value is not None and normalize(value, self.normalization) != value:
            raise ValueError('String "{}" is not normalized.'.format(value))

    normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)

    @property
    def text(self):
        """The leaf text, or the joiner-joined text of all child segments."""
        if self._text is not None:
            # An empty leaf string is reported as None.
            return self._text if self._text != '' else None
        return self.joiner.join(s.text for s in self.segments)

    # Lazily-built cache: character position -> segment id (None over joiners).
    _segment_id_for_pos = None

    def segment_id_for_pos(self, pos):
        """Return the segment id covering character position *pos* of self.text."""
        # Calculate segment ids once, on the first call
        if not self._segment_id_for_pos:
            lookup = []
            for segment in self.segments:
                lookup.extend(repeat(segment.segment_id, len(segment.text)))
                lookup.extend(repeat(None, len(self.joiner)))
            # Drop the joiner padding appended after the final segment.
            lookup = lookup[:-len(self.joiner)]
            # This is frozen, so we have to jump through the hoop:
            object.__setattr__(self, '_segment_id_for_pos', lookup)
        assert self._segment_id_for_pos

        return self._segment_id_for_pos[pos]

    @classmethod
    def from_text_segment(cls, text_segment, nsmap):
        """Build an ExtractedText from a PAGE content text element"""

        segment_id = text_segment.attrib['id']
        segment_text = None
        # find() yields None when there is no TextEquiv/Unicode child; the
        # resulting AttributeError on .text is deliberately suppressed.
        with suppress(AttributeError):
            segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
            segment_text = segment_text or ''
            segment_text = normalize_sbb(segment_text)  # FIXME hardcoded SBB normalization
        segment_text = segment_text or ''
        return cls(segment_id, None, None, segment_text)

    @classmethod
    def from_str(cls, text, normalization=Normalization.NFC_SBB):
        """Build a leaf ExtractedText from a plain string, normalizing it first."""
        normalized_text = normalize(text, normalization)
        return cls(None, None, None, normalized_text, normalization=normalization)
|
@ -1,14 +1,15 @@
|
||||
/* Map an element's space-separated class attribute to the jQuery set of all
 * elements sharing its "diffN" class. */
function find_diff_class(classes) {
    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
}

$(document).ready(function() {
    /* Enable Bootstrap tooltips */
    $('[data-toggle="tooltip"]').tooltip();

    /* Highlight every span belonging to the same diff while hovering one. */
    $('.diff').mouseover(function() {
        find_diff_class($(this).attr('class')).addClass('diff-highlight');
    });
    $('.diff').mouseout(function() {
        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
    });
});
|
||||
|
@ -0,0 +1,68 @@
|
||||
import unicodedata
|
||||
import pytest
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
from collections import namedtuple
|
||||
|
||||
from .. import seq_align, ExtractedText
|
||||
|
||||
|
||||
def test_text():
    # A composite ExtractedText joins its children with the joiner and can map
    # character positions back to the owning segment id.
    composite = ExtractedText(None, [
        ExtractedText('s0', None, None, 'foo'),
        ExtractedText('s1', None, None, 'bar'),
        ExtractedText('s2', None, None, 'bazinga'),
    ], ' ', None)

    assert composite.text == 'foo bar bazinga'
    assert composite.segment_id_for_pos(0) == 's0'
    # Position 3 is the joiner space between segments → no segment id.
    assert composite.segment_id_for_pos(3) is None
    assert composite.segment_id_for_pos(10) == 's2'
|
||||
|
||||
|
||||
def test_normalization_check():
    # Non-NFC input must be rejected by the _text validator; NFC is accepted.
    with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
        ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
    assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))
|
||||
|
||||
|
||||
# One alignment step: the aligned grapheme clusters plus the segment ids
# (from segment_id_for_pos) they originated from; None marks a gap.
AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
|
||||
|
||||
|
||||
def test_align():
    """
    Test aligning by character while retaining segment id info

    The difficulty here is that aligning should work on grapheme clusters,
    not Python characters.
    """
    test1 = ExtractedText(None, [
        ExtractedText('s0', None, None, 'foo'),
        ExtractedText('s1', None, None, 'bar'),
        ExtractedText('s2', None, None, 'batzinga'),
    ], ' ', None)
    test2 = ExtractedText(None, [
        ExtractedText('x0', None, None, 'foo'),
        ExtractedText('x1', None, None, 'bar'),
        ExtractedText('x2', None, None, '.'),  # extra .
        ExtractedText('x3', None, None, 'bazim̃ga'),  # deletion + different grapheme cluster, m̃ also is two Python characters
    ], ' ', None)

    left_pos = 0
    right_pos = 0
    alignment = []
    for left, right in seq_align(grapheme_clusters(test1.text),
                                 grapheme_clusters(test2.text)):
        # Gaps (None) do not advance the position in that text and get no id.
        left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
        right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
        alignment.append(AlignmentElement(left, right, left_id, right_id))
        # Advance by len() in Python characters: a grapheme cluster may span
        # several code points (e.g. m̃).
        if left is not None:
            left_pos += len(left)
        if right is not None:
            right_pos += len(right)

    print('test1: {}'.format(test1.text))
    print('test2: {}'.format(test2.text))

    assert alignment[0] == ('f', 'f', 's0', 'x0')
    assert alignment[8] == (None, '.', None, 'x2')
    assert alignment[12] == ('t', None, 's2', None)
    assert alignment[15] == ('n', 'm̃', 's2', 'x3')
|
Loading…
Reference in New Issue