diff --git a/.vimrc b/.vimrc
new file mode 100644
index 0000000..3b935a0
--- /dev/null
+++ b/.vimrc
@@ -0,0 +1,2 @@
+" project-specific .vimrc (needs set exrc + set secure)
+set textwidth=90
diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index ab44760..87febb7 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -28,16 +28,16 @@ def seq_align(s1, s2):
if o:
if o[0] == 'insert':
- yield (None, s2[j])
+ yield None, s2[j]
j += 1
elif o[0] == 'delete':
- yield (s1[i], None)
+ yield s1[i], None
i += 1
elif o[0] == 'replace':
- yield (s1[i], s2[j])
+ yield s1[i], s2[j]
i += 1
j += 1
else:
- yield (s1[i], s2[j])
+ yield s1[i], s2[j]
i += 1
j += 1
diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 05cc931..e99f391 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -15,6 +15,10 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
:return: character error rate and length of the reference
"""
d = distance(reference, compared)
+ # XXX Local import to avoid a circular import; unwrapping ExtractedText here is a temporary hack
+ from .cli import ExtractedText
+ if isinstance(reference, ExtractedText):
+ reference = reference.text
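+ # For example, 'AAAAA' vs 'AAAAB' gives d == 1 and n == 5, i.e. a CER of
+ # 0.2 (this pair is exercised in test_integ_cli_valid_json.py).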
n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
if d == 0:
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 63bfd92..9c963c1 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -8,11 +8,11 @@ from markupsafe import escape
from qurator.dinglehopper import *
-def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
gtx = ''
ocrx = ''
- def format_thing(t, css_classes=None):
+ def format_thing(t, css_classes=None, id_=None):
if t is None:
html_t = none
css_classes += ' ellipsis'
@@ -21,19 +21,51 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
else:
html_t = escape(t)
+ html_custom_attrs = ""
+
+ # Set Bootstrap tooltip to the segment id
+ if id_:
+ html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
+
if css_classes:
- return '<span class="{css_classes}">{html_t}</span>'.format(css_classes=css_classes, html_t=html_t)
+ return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
else:
return '{html_t}'.format(html_t=html_t)
- for k, (g, o) in enumerate(align(gt_things, ocr_things)):
- if g == o:
- css_classes = None
- else:
+ if isinstance(gt_in, ExtractedText):
+ if not isinstance(ocr_in, ExtractedText):
+ raise TypeError('if gt_in is ExtractedText, ocr_in must be, too')
+ # XXX splitting should be done in ExtractedText
+ gt_things = list(grapheme_clusters(gt_in.text))
+ ocr_things = list(grapheme_clusters(ocr_in.text))
+ else:
+ gt_things = gt_in
+ ocr_things = ocr_in
+
+ g_pos = 0
+ o_pos = 0
+ for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
+ css_classes = None
+ gt_id = None
+ ocr_id = None
+ if g != o:
css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+ if isinstance(gt_in, ExtractedText):
+ gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
+ ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
+ # Deletions and inserts produce only one id (the other is None); the UI
+ # must support this, i.e. display the tooltip for the id that is present.
+
+ gtx += joiner + format_thing(g, css_classes, gt_id)
+ ocrx += joiner + format_thing(o, css_classes, ocr_id)
+
+ if g is not None:
+ g_pos += len(g)
+ if o is not None:
+ o_pos += len(o)
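+ # Note that g_pos/o_pos count Python characters, not grapheme clusters:
+ # a cluster like 'm̃' advances the position by 2. This matches
+ # segment_id_for_pos(), which also indexes by Python character.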
- gtx += joiner + format_thing(g, css_classes)
- ocrx += joiner + format_thing(o, css_classes)
return \
'''
@@ -51,20 +83,17 @@ def process(gt, ocr, report_prefix, *, metrics=True):
Click on a wrapper.
"""
- gt_text = text(gt)
- ocr_text = text(ocr)
-
- gt_text = substitute_equivalences(gt_text)
- ocr_text = substitute_equivalences(ocr_text)
+ gt_text = extract(gt)
+ ocr_text = extract(ocr)
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text)
- char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
+ char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text)
- word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
+ word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
def json_float(value):
"""Convert a float value to an JSON float.
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index 8ca24d3..284b676 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -75,6 +75,12 @@ def distance(s1, s2):
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
clusters. This should be the correct way to compare two Unicode strings.
"""
+ # XXX Local import to avoid a circular import (see character_error_rate.py)
+ from .cli import ExtractedText
+ if isinstance(s1, ExtractedText):
+ s1 = s1.text
+ if isinstance(s2, ExtractedText):
+ s2 = s2.text
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
return levenshtein(s1, s2)
@@ -116,7 +122,11 @@ def seq_editops(seq1, seq2):
def editops(word1, word2):
- # XXX Note that this returns indices to the _grapheme clusters_, not characters!
+ """
+ Return sequence of edit operations transforming one string to another.
+
+ Note that this returns indices to the _grapheme clusters_, not characters!
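+
+ Illustrative example: editops('aüb', 'aub') returns [('replace', 1, 1)];
+ index 1 refers to the grapheme cluster 'ü', even if the input encodes it
+ as 'u' plus COMBINING DIAERESIS.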
+ """
word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
return seq_editops(word1, word2)
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index b57a047..a048b1e 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -1,11 +1,97 @@
from __future__ import division, print_function
+from typing import Optional
from warnings import warn
from lxml import etree as ET
+from lxml.etree import XMLSyntaxError
+from contextlib import suppress
+from itertools import repeat
+from .substitute_equivalences import substitute_equivalences
import sys
+import attr
+import enum
+import unicodedata
+import re
+
+
+@attr.s(frozen=True)
+class ExtractedText:
+ segments = attr.ib(converter=list)
+ joiner = attr.ib(type=str)
+ # TODO Types are not validated (attr does not do this yet)
+
+ @property
+ def text(self):
+ return self.joiner.join(s.text for s in self.segments)
+
+ _segment_id_for_pos = None
+
+ def segment_id_for_pos(self, pos):
+ # Calculate segment ids once, on the first call
+ if not self._segment_id_for_pos:
+ segment_id_for_pos = []
+ for s in self.segments:
+ segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
+ segment_id_for_pos.extend(repeat(None, len(self.joiner)))
+ # This attrs class is frozen, so we have to jump through a hoop to cache the result:
+ object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
+ assert self._segment_id_for_pos
+
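+ # Example: for segments ('s0', 'foo') and ('s1', 'bar') joined by ' ',
+ # positions 0-2 map to 's0', position 3 (the joiner) to None and
+ # positions 4-6 to 's1' (cf. test_text in extracted_text_test.py).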
+ return self._segment_id_for_pos[pos]
+
+
+class Normalization(enum.Enum):
+ NFC = 1
+ NFC_MUFI = 2 # TODO
+ NFC_SBB = 3
+
+
+def normalize(text, normalization):
+ if normalization == Normalization.NFC:
+ return unicodedata.normalize('NFC', text)
+ if normalization == Normalization.NFC_MUFI:
+ raise NotImplementedError()
+ if normalization == Normalization.NFC_SBB:
+ return substitute_equivalences(text)
+ else:
+ raise ValueError()
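+
+# For example, normalize('Schlyñ', Normalization.NFC) merely recomposes
+# decomposed combining characters, while Normalization.NFC_SBB additionally
+# applies the equivalence substitutions from substitute_equivalences().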
-from lxml.etree import XMLSyntaxError
+
+# XXX Hack: shorthand until normalization is configurable
+def normalize_sbb(t):
+ return normalize(t, Normalization.NFC_SBB)
+
+
+@attr.s(frozen=True)
+class ExtractedTextSegment:
+ segment_id = attr.ib(type=Optional[str])
+
+ @segment_id.validator
+ def check(self, _, value):
+ if value is None:
+ return
+ # fullmatch, so that e.g. 'r0"><script>' cannot sneak past the check
+ if not re.fullmatch(r'[\w\d_-]+', value):
+ raise ValueError('Malformed segment id "{}"'.format(value))
+ text = attr.ib(type=str)
+
+ @text.validator
+ def check(self, _, value):
+ if value is not None and normalize(value, self.normalization) != value:
+ raise ValueError('String "{}" is not normalized.'.format(value))
+ normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
+
+ @classmethod
+ def from_text_segment(cls, text_segment, nsmap):
+ """Build an ExtractedTextSegment from a PAGE content text element"""
+
+ segment_id = text_segment.attrib['id']
+ segment_text = None
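+ # find() returns None if there is no TextEquiv/Unicode child, and
+ # None.text then raises AttributeError -- hence the suppress() below.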
+ with suppress(AttributeError):
+ segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
+ segment_text = segment_text or ''
+ segment_text = normalize_sbb(segment_text)
+ return cls(segment_id, segment_text)
def alto_namespace(tree):
@@ -21,7 +107,7 @@ def alto_namespace(tree):
raise ValueError('Not an ALTO tree')
-def alto_text(tree):
+def alto_extract(tree):
"""Extract text from the given ALTO ElementTree."""
nsmap = {'alto': alto_namespace(tree)}
@@ -29,9 +115,18 @@ def alto_text(tree):
lines = (
' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
- text_ = '\n'.join(lines)
- return text_
+ # TODO This currently does not extract any segment id, because we are
+ # clueless about the ALTO format.
+ # FIXME needs to handle normalization
+ return ExtractedText(
+ (ExtractedTextSegment(None, normalize_sbb(line_text)) for line_text in lines),
+ '\n'
+ )
+
+
+def alto_text(tree):
+ return alto_extract(tree).text
def page_namespace(tree):
@@ -47,18 +142,12 @@ def page_namespace(tree):
raise ValueError('Not a PAGE tree')
-def page_text(tree):
+def page_extract(tree):
"""Extract text from the given PAGE content ElementTree."""
nsmap = {'page': page_namespace(tree)}
- def region_text(region):
- try:
- return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
- except AttributeError:
- return None
-
- region_texts = []
+ regions = []
reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
if reading_order is not None:
for group in reading_order.iterfind('./*', namespaces=nsmap):
@@ -68,39 +157,55 @@ def page_text(tree):
region_id = region_ref_indexed.attrib['regionRef']
region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
if region is not None:
- region_texts.append(region_text(region))
+ regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
else:
warn('Not a TextRegion: "%s"' % region_id)
else:
raise NotImplementedError
else:
for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
- region_texts.append(region_text(region))
+ regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
- # XXX Does a file have to have regions etc.? region vs lines etc.
# Filter empty region texts
- region_texts = (t for t in region_texts if t)
+ regions = (r for r in regions if r.text)
- text_ = '\n'.join(region_texts)
+ # FIXME needs to handle normalization
+ return ExtractedText(regions, '\n')
- return text_
+def page_text(tree):
+ return page_extract(tree).text
-def text(filename):
- """Read the text from the given file.
+
+def plain_extract(filename):
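+ # Sketch of the result (assuming the hyphenated line ids below): a file
+ # containing 'foo\nbar\n' yields the segments ('line-0', 'foo') and
+ # ('line-1', 'bar'), so plain_text() returns 'foo\nbar'.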
+ with open(filename, 'r') as f:
+ return ExtractedText(
+ # Strip the newline that readlines() keeps (segments are joined with '\n')
+ # and use a hyphenated id so it passes the segment_id validator.
+ (ExtractedTextSegment('line-%d' % no, line.rstrip('\n')) for no, line in enumerate(f.readlines())),
+ '\n'
+ )
+
+
+def plain_text(filename):
+ return plain_extract(filename).text
+
+
+def extract(filename):
+ """Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text.
"""
-
try:
tree = ET.parse(filename)
except XMLSyntaxError:
- with open(filename, 'r') as f:
- return f.read()
+ return plain_extract(filename)
try:
- return page_text(tree)
+ return page_extract(tree)
except ValueError:
- return alto_text(tree)
+ return alto_extract(tree)
+
+
+def text(filename):
+ return extract(filename).text
if __name__ == '__main__':
diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py
index 1b7e0cf..39be276 100644
--- a/qurator/dinglehopper/substitute_equivalences.py
+++ b/qurator/dinglehopper/substitute_equivalences.py
@@ -1,21 +1,15 @@
import unicodedata
-def substitute_equivalences(s):
+def unjoin_ligatures(s):
+ """Unjoin ligatures, i.e. ff becomes ff."""
- # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
- # It might make sense to use different rules for GT and for the different OCR
equivalences = {
- '': 'ü',
'': 'ſſ',
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
- '': 'ä',
'': 'ch',
- '==': '–', # → en-dash
- '—': '–', # em-dash → en-dash
'': 'ck',
'': 'll',
- '': 'ö',
'': 'ſi',
'': 'ſt',
'ﬁ': 'fi',
@@ -23,12 +17,7 @@ def substitute_equivalences(s):
'ﬂ': 'fl',
'ﬃ': 'ffi',
'': 'ct',
- '’': '\'',
- '⸗': '-',
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
- 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
- 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
- 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uf532': 'as', # eMOP: Latin small ligature as
'\uf533': 'is', # eMOP: Latin small ligature is
'\uf534': 'us', # eMOP: Latin small ligature us
@@ -37,10 +26,32 @@ def substitute_equivalences(s):
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
'ﬆ': 'st', # U+FB06 LATIN SMALL LIGATURE ST
+ }
+ s = unicodedata.normalize('NFC', s)
+ for fr, to in equivalences.items():
+ s = s.replace(fr, to)
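+ # e.g. unjoin_ligatures('ﬁnden') == 'finden': the U+FB01 ligature is
+ # expanded while the surrounding characters are left untouched.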
+ return s
+
+
+def substitute_equivalences(s):
+ # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
+ # It might make sense to use different rules for GT and for the different OCR
+ equivalences = {
+ '': 'ü',
+ '': 'ä',
+ '==': '–', # → en-dash
+ '—': '–', # em-dash → en-dash
+ '': 'ö',
+ '’': '\'',
+ '⸗': '-',
+ 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
+ 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
+ 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
}
s = unicodedata.normalize('NFC', s)
+ s = unjoin_ligatures(s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s
diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js
index ac43676..4c2ba28 100644
--- a/qurator/dinglehopper/templates/report.html.js
+++ b/qurator/dinglehopper/templates/report.html.js
@@ -1,14 +1,15 @@
function find_diff_class(classes) {
- return classes.split(/\s+/).find(x => x.match(/.diff\d.*/));
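+ /* Return the jQuery set of every element sharing this element's
+ diffN class, so highlighting toggles on all of them at once. */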
+ return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
}
$(document).ready(function() {
+ /* Enable Bootstrap tooltips */
+ $('[data-toggle="tooltip"]').tooltip();
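+ /* format_thing() in cli.py emits e.g. (hypothetical id)
+ <span class="cdiff0 diff" data-toggle="tooltip" title="r0">…</span>;
+ Bootstrap turns the title attribute into a hover tooltip here. */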
+
$('.diff').mouseover(function() {
- let c = find_diff_class($(this).attr('class'))
- $('.' + c).addClass('diff-highlight')
+ find_diff_class($(this).attr('class')).addClass('diff-highlight');
});
$('.diff').mouseout(function() {
- let c = find_diff_class($(this).attr('class'))
- $('.' + c).removeClass('diff-highlight')
+ find_diff_class($(this).attr('class')).removeClass('diff-highlight');
});
});
diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py
new file mode 100644
index 0000000..2e6a9e6
--- /dev/null
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@@ -0,0 +1,68 @@
+import unicodedata
+import pytest
+from qurator.dinglehopper import ExtractedText, ExtractedTextSegment
+from uniseg.graphemecluster import grapheme_clusters
+from qurator.dinglehopper import seq_align
+from collections import namedtuple
+
+
+def test_text():
+ test1 = ExtractedText([
+ ExtractedTextSegment('s0', 'foo'),
+ ExtractedTextSegment('s1', 'bar'),
+ ExtractedTextSegment('s2', 'bazinga')
+ ], ' ')
+
+ assert test1.text == 'foo bar bazinga'
+ assert test1.segment_id_for_pos(0) == 's0'
+ assert test1.segment_id_for_pos(3) is None
+ assert test1.segment_id_for_pos(10) == 's2'
+
+
+def test_normalization_check():
+ with pytest.raises(ValueError, match=r'.*is not normalized.*'):
+ ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ'))
+ assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ'))
+
+
+AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
+
+
+def test_align():
+ """
+ Test aligning by character while retaining segment id info
+
+ The difficulty here is that aligning should work on grapheme clusters,
+ not Python characters.
+ """
+
+ test1 = ExtractedText([
+ ExtractedTextSegment('s0', 'foo'),
+ ExtractedTextSegment('s1', 'bar'),
+ ExtractedTextSegment('s2', 'batzinga')
+ ], ' ')
+ test2 = ExtractedText([
+ ExtractedTextSegment('x0', 'foo'),
+ ExtractedTextSegment('x1', 'bar'),
+ ExtractedTextSegment('x2', '.'), # extra .
+ ExtractedTextSegment('x3', 'bazim̃ga'), # deletion + different grapheme cluster; m̃ is also two Python characters
+ ], ' ')
+
+ left_pos = 0
+ right_pos = 0
+ alignment = []
+ for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)):
+ left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
+ right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
+ el = AlignmentElement(left, right, left_id, right_id)
+ alignment.append(el)
+ if left is not None:
+ left_pos += len(left)
+ if right is not None:
+ right_pos += len(right)
+
+ print('test1: {}'.format(test1.text))
+ print('test2: {}'.format(test2.text))
+
+ assert alignment[0] == ('f', 'f', 's0', 'x0')
+ assert alignment[8] == (None, '.', None, 'x2')
+ assert alignment[12] == ('t', None, 's2', None)
+ assert alignment[15] == ('n', 'm̃', 's2', 'x3')
diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py
index cc5cb43..23483f8 100644
--- a/qurator/dinglehopper/tests/test_align.py
+++ b/qurator/dinglehopper/tests/test_align.py
@@ -78,7 +78,8 @@ def test_lines():
def test_lines_similar():
- """Test comparing list of lines while using a "weaker equivalence".
+ """
+ Test comparing list of lines while using a "weaker equivalence".
This mainly serves as documentation.
"""
@@ -88,7 +89,14 @@ def test_lines_similar():
self._string = string
def __eq__(self, other):
- return distance(self._string, other._string) < 2 # XXX NOT the final version
+ # Just an example!
+ min_len = min(len(self._string), len(other._string))
+ if min_len > 0:
+ normalized_distance = distance(self._string, other._string)/min_len
+ similar = normalized_distance < 0.1
+ else:
+ similar = False
+ return similar
def __ne__(self, other):
return not self.__eq__(other)
@@ -106,3 +114,6 @@ def test_lines_similar():
left, right = unzip(result)
assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')]
assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
+
+ # Test __eq__ (i.e. is it a substitution or a similar string?)
+ assert list(left)[0] == list(right)[0]
diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py
index df1e230..b35974b 100644
--- a/qurator/dinglehopper/tests/test_integ_align.py
+++ b/qurator/dinglehopper/tests/test_integ_align.py
@@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
def test_align_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
- # → 4 elements in the alignment should be different.
+ # → 2 elements in the alignment should be different, the ligature is
+ # (currently) not counted due to normalization.
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
result = list(align(gt, ocr))
- assert sum(left != right for left, right in result) == 4
+ for left, right in result:
+ if left != right:
+ print(left, right)
+ assert sum(left != right for left, right in result) == 2
diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
index c27cd31..1c3bf52 100644
--- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
@@ -4,6 +4,7 @@ import os
import pytest
from lxml import etree as ET
+from uniseg.graphemecluster import grapheme_clusters
from .. import character_error_rate, page_text, alto_text
@@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
def test_character_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
+ # The fi ligature does not count due to normalization.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
- assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n
+
+ gt_len = len(list(grapheme_clusters(gt)))
+ expected_cer = 2/gt_len
+
+ assert character_error_rate(gt, ocr) == expected_cer
@pytest.mark.integration
diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
index 5699700..d71bc14 100644
--- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
@@ -1,4 +1,3 @@
-import os
import json
import pytest
@@ -10,14 +9,17 @@ from ..cli import process
def test_cli_json(tmp_path):
"""Test that the cli/process() yields a loadable JSON report"""
- # XXX Path.__str__() is necessary for Python 3.5
with working_directory(str(tmp_path)):
with open('gt.txt', 'w') as gtf:
gtf.write('AAAAA')
with open('ocr.txt', 'w') as ocrf:
ocrf.write('AAAAB')
+ with open('gt.txt', 'r') as gtf:
+ print(gtf.read())
process('gt.txt', 'ocr.txt', 'report')
+ with open('report.json', 'r') as jsonf:
+ print(jsonf.read())
with open('report.json', 'r') as jsonf:
j = json.load(jsonf)
assert j['cer'] == pytest.approx(0.2)
@@ -26,7 +28,6 @@ def test_cli_json(tmp_path):
def test_cli_json_cer_is_infinity(tmp_path):
"""Test that the cli/process() yields a loadable JSON report when CER == inf"""
- # XXX Path.__str__() is necessary for Python 3.5
with working_directory(str(tmp_path)):
with open('gt.txt', 'w') as gtf:
gtf.write('') # Empty to yield CER == inf
diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
index 2857d56..cbe12f8 100644
--- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
@@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
def test_distance_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
+ # Due to normalization, we don't count the ligature.
+ # → 2 differences
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
- assert distance(gt, ocr) == 4
+ assert distance(gt, ocr) == 2
@pytest.mark.integration
diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
index 41da748..75bb816 100644
--- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
+++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
@@ -1,11 +1,9 @@
import os
-import re
import shutil
import json
from pathlib import Path
from click.testing import CliRunner
-import pytest
from .util import working_directory
@@ -17,8 +15,6 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
def test_ocrd_cli(tmp_path):
"""Test OCR-D interface"""
- # XXX Path.str() is necessary for Python 3.5
-
# Copy test workspace
test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162'
test_workspace_dir = tmp_path / 'test_ocrd_cli'
diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
index 1d2dead..f5c922b 100644
--- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
@@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
def test_word_error_rate_between_page_files():
- # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words
+ # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. That makes 3 changed
+ # words, but the ligature does not count due to normalization → 2 errors.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
- assert word_error_rate(gt, ocr) == 3/gt_word_count
+ assert word_error_rate(gt, ocr) == 2/gt_word_count
@pytest.mark.integration
diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py
index dd9377a..3291152 100644
--- a/qurator/dinglehopper/tests/test_ocr_files.py
+++ b/qurator/dinglehopper/tests/test_ocr_files.py
@@ -6,7 +6,8 @@ import textwrap
import pytest
-from .. import alto_namespace, alto_text, page_namespace, page_text, text
+from .util import working_directory
+from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@@ -49,27 +50,51 @@ def test_page_namespace():
def test_page_test():
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
result = page_text(tree)
+
+ # We are currently normalizing on extraction, so the text is normalized.
+ #
+ # expected = textwrap.dedent("""\
+ # ber die vielen Sorgen wegen deelben vergaß
+ # Hartkopf, der Frau Amtmnnin das ver⸗
+ # ſproene zu berliefern. — Ein Erpreer
+ # wurde an ihn abgeſit, um ihn ums Him⸗
+ # melswien zu ſagen, daß er das Verſproene
+ # glei den Augenbli berbringen mte, die
+ # Frau Amtmnnin htte auf ihn verlaen,
+ # und nun wßte e nit, was e anfangen
+ # ſote. Den Augenbli ſote er kommen,
+ # ſon vergieng e in ihrer Ang. — Die
+ # Ge wren ſon angekommen, und es fehlte
+ # ihr do no an aem. —
+ # Hartkopf mußte er bennen, und
+ # endli na langem Nadenken fiel es ihm er
+ # wieder ein. — Er langte den Zettel aus dem
+ # Accisbue heraus, und ſagte ſeiner Frau, daß
+ # e das, was da wre, herbeyſaffen mte.
+ # Jndeß mangelten do einige Generalia, die
+ # alſo wegfielen. — Hartkopf gieng ſelb
+ # mit und berbrate es. —""")
expected = textwrap.dedent("""\
- ber die vielen Sorgen wegen deelben vergaß
- Hartkopf, der Frau Amtmnnin das ver⸗
- ſproene zu berliefern. — Ein Erpreer
- wurde an ihn abgeſit, um ihn ums Him⸗
- melswien zu ſagen, daß er das Verſproene
- glei den Augenbli berbringen mte, die
- Frau Amtmnnin htte auf ihn verlaen,
- und nun wßte e nit, was e anfangen
- ſote. Den Augenbli ſote er kommen,
- ſon vergieng e in ihrer Ang. — Die
- Ge wren ſon angekommen, und es fehlte
- ihr do no an aem. —
- Hartkopf mußte er bennen, und
- endli na langem Nadenken fiel es ihm er
- wieder ein. — Er langte den Zettel aus dem
- Accisbue heraus, und ſagte ſeiner Frau, daß
- e das, was da wre, herbeyſaffen mte.
- Jndeß mangelten do einige Generalia, die
- alſo wegfielen. — Hartkopf gieng ſelb
- mit und berbrate es. —""")
+ über die vielen Sorgen wegen deſſelben vergaß
+ Hartkopf, der Frau Amtmännin das ver-
+ ſprochene zu überliefern. – Ein Erpreſſer
+ wurde an ihn abgeſchickt, um ihn ums Him-
+ melswillen zu ſagen, daß er das Verſprochene
+ gleich den Augenblick überbringen möchte, die
+ Frau Amtmännin hätte ſich auf ihn verlaſſen,
+ und nun wüßte ſie nicht, was ſie anfangen
+ ſollte. Den Augenblick ſollte er kommen,
+ ſonſt vergieng ſie in ihrer Angſt. – Die
+ Gäſte wären ſchon angekommen, und es fehlte
+ ihr doch noch an allem. –
+ Hartkopf mußte ſich erſt beſinnen, und
+ endlich nach langem Nachdenken fiel es ihm erſt
+ wieder ein. – Er langte den Zettel aus dem
+ Accisbuche heraus, und ſagte ſeiner Frau, daß
+ ſie das, was da wäre, herbeyſchaffen möchte.
+ Jndeß mangelten doch einige Generalia, die
+ alſo wegfielen. – Hartkopf gieng ſelbſt
+ mit und überbrachte es. –""")
assert result == expected
@@ -92,7 +117,8 @@ def test_page_order():
tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
result = page_text(tree)
- assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
+ print(result)
+ assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)
def test_page_mixed_regions():
@@ -106,5 +132,15 @@ def test_page_mixed_regions():
def test_text():
assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
- assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
+ assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
+
+
+def test_plain(tmp_path):
+ with working_directory(str(tmp_path)):
+ with open('ocr.txt', 'w') as ocrf:
+ ocrf.write('AAAAB')
+
+ result = plain_text('ocr.txt')
+ expected = 'AAAAB'
+ assert result == expected
diff --git a/qurator/dinglehopper/tests/util.py b/qurator/dinglehopper/tests/util.py
index 52b7506..1f224e5 100644
--- a/qurator/dinglehopper/tests/util.py
+++ b/qurator/dinglehopper/tests/util.py
@@ -21,8 +21,8 @@ def diffprint(x, y):
_diffprint(x, y)
-def unzip(l):
- return zip(*l)
+def unzip(an_iterable_of_tuples):
+ return zip(*an_iterable_of_tuples)
class working_directory:
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index 7ed56e4..64eba0a 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -32,6 +32,11 @@ def words(s):
cat = subcat[0]
return cat in unwanted_categories or subcat in unwanted_subcategories
+ # XXX Local import to avoid a circular import; unwrap ExtractedText for now
+ from .cli import ExtractedText
+ if isinstance(s, ExtractedText):
+ s = s.text
+
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
# uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
for word in uniseg.wordbreak.words(s):
@@ -42,10 +47,20 @@ def words(s):
def words_normalized(s):
+ # XXX Local import to avoid a circular import, as in words() above
+ from .cli import ExtractedText
+ if isinstance(s, ExtractedText):
+ s = s.text
return words(unicodedata.normalize('NFC', s))
def word_error_rate_n(reference, compared) -> Tuple[float, int]:
+ # XXX Local import to avoid a circular import, as in words() above
+ from .cli import ExtractedText
+ if isinstance(reference, ExtractedText):
+ reference = reference.text
+ if isinstance(compared, ExtractedText):
+ compared = compared.text
if isinstance(reference, str):
reference_seq = list(words_normalized(reference))
compared_seq = list(words_normalized(compared))
diff --git a/requirements.txt b/requirements.txt
index de6547b..846990b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,6 @@ lxml
uniseg
numpy
colorama
+MarkupSafe
ocrd >= 1.0.0b15
attrs
diff --git a/setup.cfg b/setup.cfg
index 6deafc2..43d7a3a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,2 @@
[flake8]
-max-line-length = 120
+max-line-length = 90