Merge branch 'feat/display-segment-id'
commit
f50591abac
@ -1,4 +1,4 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (dinglehopper)" project-jdk-type="Python SDK" />
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (dinglehopper-github)" project-jdk-type="Python SDK" />
|
||||||
</project>
|
</project>
|
@ -0,0 +1,118 @@
|
|||||||
|
import enum
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
from contextlib import suppress
|
||||||
|
from itertools import repeat
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import attr
|
||||||
|
|
||||||
|
from .substitute_equivalences import substitute_equivalences
|
||||||
|
|
||||||
|
|
||||||
|
class Normalization(enum.Enum):
    """
    Text normalizations selectable in this module.

    The members are plain integer-valued enum members; normalize() decides
    what each of them does to a string.
    """

    # Plain Unicode NFC.
    NFC = 1

    # Not implemented yet; normalize() raises NotImplementedError for it.
    NFC_MUFI = 2  # TODO

    # Handled by substitute_equivalences() in normalize().
    NFC_SBB = 3
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(text, normalization):
    """
    Normalize a string with the given normalization.

    :param text: the string to normalize
    :param normalization: a member of Normalization
    :return: the normalized string
    :raises NotImplementedError: for Normalization.NFC_MUFI, which is not
        implemented yet
    :raises ValueError: if normalization is not a known Normalization member
    """
    if normalization == Normalization.NFC:
        return unicodedata.normalize('NFC', text)
    if normalization == Normalization.NFC_MUFI:
        raise NotImplementedError('NFC_MUFI normalization is not implemented yet')
    if normalization == Normalization.NFC_SBB:
        # The SBB variant is fully delegated to substitute_equivalences().
        return substitute_equivalences(text)
    # Name the offending value so that a bad call site can be diagnosed.
    raise ValueError('Unknown normalization: {!r}'.format(normalization))
|
||||||
|
|
||||||
|
|
||||||
|
# XXX hack
def normalize_sbb(t):
    """Shortcut for normalize() with the NFC_SBB normalization."""
    sbb = Normalization.NFC_SBB
    return normalize(t, sbb)
|
||||||
|
|
||||||
|
|
||||||
|
@attr.s(frozen=True)
class ExtractedText:
    """
    Extracted text

    Objects of this class are guaranteed to be a. always in their normalization
    and b. in NFC.

    An object contains either
      a. _text itself (a "leaf"),
      b. or segments (ExtractedText objects) and a joiner, the string that is
         put between the texts of the segments.
    """
    segment_id = attr.ib(type=Optional[str])

    @segment_id.validator
    def _check_segment_id(self, _, value):
        """Reject segment ids that do not start with an allowed character."""
        if value is None:
            return
        # NOTE: re.match() only anchors at the start of the string, so this
        # checks the beginning of the id, not the id as a whole.
        if not re.match(r'[\w\d_-]+', value):
            raise ValueError('Malformed segment id "{}"'.format(value))

    segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
    joiner = attr.ib(type=Optional[str])
    _text = attr.ib(type=Optional[str])

    @segments.validator
    def _check_segments(self, _, value):
        """Make sure segments and text are not both given."""
        if value is not None and self._text is not None:
            raise ValueError("Can't have both segments and text")

    @_text.validator
    def _check_text(self, _, value):
        """Make sure text excludes segments and is properly normalized."""
        if value is None:
            return
        if self.segments is not None:
            raise ValueError("Can't have both segments and text")
        if unicodedata.normalize('NFC', value) != value:
            raise ValueError('String "{}" is not in NFC.'.format(value))
        if normalize(value, self.normalization) != value:
            raise ValueError('String "{}" is not normalized.'.format(value))

    normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)

    @property
    def text(self):
        """
        The text of this object.

        For a leaf this is its own text, or None if that text is empty. For an
        object made of segments it is the segments' texts joined by the joiner.
        """
        if self._text is not None:
            if self._text == '':
                return None
            return self._text
        # A leaf segment reports empty text as None, which str.join() would
        # choke on, so map it back to the empty string here.
        return self.joiner.join(s.text or '' for s in self.segments)

    # Lazily filled cache: one segment id (or None) per character of the text.
    _segment_id_for_pos = None

    def segment_id_for_pos(self, pos):
        """
        Return the id of the segment that the character at pos belongs to.

        Characters that belong to a joiner map to None. For a leaf, every
        position maps to the object's own segment_id.

        :param pos: index into the text of this object
        :raises IndexError: if pos is outside of the text
        """
        # Calculate segment ids once, on the first call
        if self._segment_id_for_pos is None:
            if self._text is not None:
                ids = list(repeat(self.segment_id, len(self._text)))
            else:
                ids = []
                for i, s in enumerate(self.segments):
                    # Joiners sit *between* segments; emitting them in front
                    # of every segment but the first avoids trimming the list
                    # afterwards, which went wrong for an empty joiner.
                    if i > 0:
                        ids.extend(repeat(None, len(self.joiner)))
                    ids.extend(repeat(s.segment_id, len(s.text or '')))
            # This is frozen, so we have to jump through the hoop:
            object.__setattr__(self, '_segment_id_for_pos', ids)

        return self._segment_id_for_pos[pos]

    @classmethod
    def from_text_segment(cls, text_segment, nsmap):
        """
        Build an ExtractedText from a PAGE content text element

        :param text_segment: element with an 'id' attribute; its text is read
            from the child found at ./page:TextEquiv/page:Unicode
        :param nsmap: namespace map used to resolve the 'page' prefix
        :raises KeyError: if the element has no 'id' attribute
        """
        segment_id = text_segment.attrib['id']
        unicode_el = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap)
        # Both a missing element and an element without text count as ''.
        segment_text = unicode_el.text if unicode_el is not None else None
        segment_text = normalize_sbb(segment_text or '')  # FIXME hardcoded SBB normalization
        return cls(segment_id, None, None, segment_text or '')

    @classmethod
    def from_str(cls, text, normalization=Normalization.NFC_SBB):
        """
        Build a leaf ExtractedText without segment id from a plain string

        The string is normalized with the given normalization first.
        """
        normalized_text = normalize(text, normalization)
        return cls(None, None, None, normalized_text, normalization=normalization)
|
@ -1,14 +1,15 @@
|
|||||||
function find_diff_class(classes) {
|
function find_diff_class(classes) {
|
||||||
return classes.split(/\s+/).find(x => x.match(/.diff\d.*/));
|
return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
|
||||||
}
|
}
|
||||||
|
|
||||||
$(document).ready(function() {
|
$(document).ready(function() {
|
||||||
|
/* Enable Bootstrap tooltips */
|
||||||
|
$('[data-toggle="tooltip"]').tooltip();
|
||||||
|
|
||||||
$('.diff').mouseover(function() {
|
$('.diff').mouseover(function() {
|
||||||
let c = find_diff_class($(this).attr('class'))
|
find_diff_class($(this).attr('class')).addClass('diff-highlight');
|
||||||
$('.' + c).addClass('diff-highlight')
|
|
||||||
});
|
});
|
||||||
$('.diff').mouseout(function() {
|
$('.diff').mouseout(function() {
|
||||||
let c = find_diff_class($(this).attr('class'))
|
find_diff_class($(this).attr('class')).removeClass('diff-highlight');
|
||||||
$('.' + c).removeClass('diff-highlight')
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -0,0 +1,68 @@
|
|||||||
|
import unicodedata
|
||||||
|
import pytest
|
||||||
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
from .. import seq_align, ExtractedText
|
||||||
|
|
||||||
|
|
||||||
|
def test_text():
    """Check the joined text and the segment ids of a segmented ExtractedText."""
    words_by_id = (('s0', 'foo'), ('s1', 'bar'), ('s2', 'bazinga'))
    segments = [
        ExtractedText(seg_id, None, None, word)
        for seg_id, word in words_by_id
    ]
    test1 = ExtractedText(None, segments, ' ', None)

    assert test1.text == 'foo bar bazinga'

    # Position 3 is the joiner between 'foo' and 'bar'.
    assert test1.segment_id_for_pos(0) == 's0'
    assert test1.segment_id_for_pos(3) is None
    assert test1.segment_id_for_pos(10) == 's2'
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalization_check():
    """Text that is not in NFC must be rejected, NFC text must be accepted."""
    word = 'Schlyñ'
    decomposed = unicodedata.normalize('NFD', word)
    composed = unicodedata.normalize('NFC', word)

    with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
        ExtractedText('foo', None, None, decomposed)

    assert ExtractedText('foo', None, None, composed)
|
||||||
|
|
||||||
|
|
||||||
|
# One aligned pair: the two aligned items and the segment ids they came from.
AlignmentElement = namedtuple(
    'AlignmentElement',
    ['left', 'right', 'left_id', 'right_id'],
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_align():
    """
    Test aligning by character while retaining segment id info

    The difficulty here is that aligning should work on grapheme clusters,
    not Python characters.
    """

    def build(id_prefix, words):
        # Segments get the ids <prefix>0, <prefix>1, ... and are joined by ' '.
        segments = [
            ExtractedText('{}{}'.format(id_prefix, i), None, None, word)
            for i, word in enumerate(words)
        ]
        return ExtractedText(None, segments, ' ', None)

    test1 = build('s', ['foo', 'bar', 'batzinga'])
    test2 = build('x', [
        'foo',
        'bar',
        '.',  # extra .
        'bazim̃ga',  # deletion + different grapheme cluster, m̃ also is two Python characters
    ])

    alignment = []
    left_pos = 0
    right_pos = 0
    aligned_pairs = seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text))
    for left, right in aligned_pairs:
        left_id = None
        right_id = None
        # Positions advance by Python characters, a grapheme cluster may
        # consist of several of them.
        if left is not None:
            left_id = test1.segment_id_for_pos(left_pos)
            left_pos += len(left)
        if right is not None:
            right_id = test2.segment_id_for_pos(right_pos)
            right_pos += len(right)
        alignment.append(AlignmentElement(left, right, left_id, right_id))

    print('test1: {}'.format(test1.text))
    print('test2: {}'.format(test2.text))

    assert alignment[0] == ('f', 'f', 's0', 'x0')
    assert alignment[8] == (None, '.', None, 'x2')
    assert alignment[12] == ('t', None, 's2', None)
    assert alignment[15] == ('n', 'm̃', 's2', 'x3')
|
Loading…
Reference in New Issue