🐛 dinglehopper: Fix tests to deal with new normalization logic

pull/38/head
Gerber, Mike 4 years ago
parent c010a7f05e
commit 079be203bd

@ -15,7 +15,7 @@ import unicodedata
@attr.s(frozen=True)
class ExtractedText:
segments = attr.ib()
segments = attr.ib(converter=list)
joiner = attr.ib(type=str)
# TODO Types are not validated (attr does not do this yet)
@ -80,6 +80,7 @@ class ExtractedTextSegment:
segment_text = None
with suppress(AttributeError):
segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
segment_text = segment_text or ''
segment_text = normalize_sbb(segment_text)
return cls(segment_id, segment_text)
@ -157,7 +158,7 @@ def page_extract(tree):
regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
# Filter empty region texts
regions = [r for r in regions if r.text is not None]
regions = (r for r in regions if r.text is not None)
return ExtractedText(regions, '\n')
# FIXME needs to handle normalization

@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
def test_align_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# → 4 elements in the alignment should be different.
# → 2 elements in the alignment should be different, the ligature is
# (currently) not counted due to normalization.
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
result = list(align(gt, ocr))
assert sum(left != right for left, right in result) == 4
for left, right in result:
if left != right:
print(left, right)
assert sum(left != right for left, right in result) == 2

@ -4,6 +4,7 @@ import os
import pytest
from lxml import etree as ET
from uniseg.graphemecluster import grapheme_clusters
from .. import character_error_rate, page_text, alto_text
@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
def test_character_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# The fi ligature does not count.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n
gt_len = len(list(grapheme_clusters(gt)))
expected_cer = 2/gt_len
assert character_error_rate(gt, ocr) == expected_cer
@pytest.mark.integration

@ -1,4 +1,3 @@
import os
import json
import pytest
@ -16,7 +15,11 @@ def test_cli_json(tmp_path):
with open('ocr.txt', 'w') as ocrf:
ocrf.write('AAAAB')
with open('gt.txt', 'r') as gtf:
print(gtf.read())
process('gt.txt', 'ocr.txt', 'report')
with open('report.json', 'r') as jsonf:
print(jsonf.read())
with open('report.json', 'r') as jsonf:
j = json.load(jsonf)
assert j['cer'] == pytest.approx(0.2)

@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
def test_distance_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# Due to normalization, we don't count the ligature.
# → 2 differences
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
assert distance(gt, ocr) == 4
assert distance(gt, ocr) == 2
@pytest.mark.integration

@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
def test_word_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
# the ligature does not count → 2 errors
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
assert word_error_rate(gt, ocr) == 3/gt_word_count
assert word_error_rate(gt, ocr) == 2/gt_word_count
@pytest.mark.integration

@ -6,7 +6,8 @@ import textwrap
import pytest
from .. import alto_namespace, alto_text, page_namespace, page_text, text
from .util import working_directory
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@ -49,27 +50,51 @@ def test_page_namespace():
def test_page_test():
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
result = page_text(tree)
# We are currently normalizing on extraction, so the text is normalized.
#
# expected = textwrap.dedent("""\
# ber die vielen Sorgen wegen deelben vergaß
# Hartkopf, der Frau Amtmnnin das ver⸗
# ſproene zu berliefern. — Ein Erpreer
# wurde an ihn abgeſit, um ihn ums Him⸗
# melswien zu ſagen, daß er das Verſproene
# glei den Augenbli berbringen mte, die
# Frau Amtmnnin htte  auf ihn verlaen,
# und nun wßte e nit, was e anfangen
# ſote. Den Augenbli ſote er kommen,
# ſon vergieng e in ihrer Ang. — Die
# Ge wren ſon angekommen, und es fehlte
# ihr do no an aem. —
# Hartkopf mußte  er bennen, und
# endli na langem Nadenken fiel es ihm er
# wieder ein. — Er langte den Zettel aus dem
# Accisbue heraus, und ſagte ſeiner Frau, daß
# e das, was da wre, herbeyſaffen mte.
# Jndeß mangelten do einige Generalia, die
# alſo wegfielen. — Hartkopf gieng ſelb
# mit und berbrate es. —""")
expected = textwrap.dedent("""\
ber die vielen Sorgen wegen deelben vergaß
Hartkopf, der Frau Amtmnnin das ver
ſproene zu berliefern. Ein Erpreer
wurde an ihn abgeſit, um ihn ums Him
melswien zu ſagen, daß er das Verſproene
glei den Augenbli berbringen mte, die
Frau Amtmnnin htte auf ihn verlaen,
und nun wßte e nit, was e anfangen
ſote. Den Augenbli ſote er kommen,
ſon vergieng e in ihrer Ang. Die
Ge wren ſon angekommen, und es fehlte
ihr do no an aem.
Hartkopf mußte er bennen, und
endli na langem Nadenken fiel es ihm er
wieder ein. Er langte den Zettel aus dem
Accisbue heraus, und ſagte ſeiner Frau, daß
e das, was da wre, herbeyſaffen mte.
Jndeß mangelten do einige Generalia, die
alſo wegfielen. Hartkopf gieng ſelb
mit und berbrate es. """)
über die vielen Sorgen wegen deſſelben vergaß
Hartkopf, der Frau Amtmännin das ver-
ſprochene zu überliefern. Ein Erpreſſer
wurde an ihn abgeſchickt, um ihn ums Him-
melswillen zu ſagen, daß er das Verſprochene
gleich den Augenblick überbringen möchte, die
Frau Amtmännin hätte ſich auf ihn verlaſſen,
und nun wüßte ſie nicht, was ſie anfangen
ſollte. Den Augenblick ſollte er kommen,
ſonſt vergieng ſie in ihrer Angſt. Die
Gäſte wären ſchon angekommen, und es fehlte
ihr doch noch an allem.
Hartkopf mußte ſich erſt beſinnen, und
endlich nach langem Nachdenken fiel es ihm erſt
wieder ein. Er langte den Zettel aus dem
Accisbuche heraus, und ſagte ſeiner Frau, daß
ſie das, was da wäre, herbeyſchaffen möchte.
Jndeß mangelten doch einige Generalia, die
alſo wegfielen. Hartkopf gieng ſelbſt
mit und überbrachte es. """)
assert result == expected
@ -92,7 +117,8 @@ def test_page_order():
tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
result = page_text(tree)
assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
print(result)
assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)
def test_page_mixed_regions():
@ -106,5 +132,15 @@ def test_page_mixed_regions():
def test_text():
assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
assert "wieder ein. Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
assert "wieder ein. Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
def test_plain(tmp_path):
    """Check that plain_text() returns the content of a plain text file.

    Writes 'AAAAB' to ocr.txt while working inside the tmp_path directory
    and expects plain_text() to return exactly that string.
    """
    with working_directory(str(tmp_path)):
        # Create the input file; it is closed again before it is read back.
        with open('ocr.txt', 'w') as handle:
            handle.write('AAAAB')

        assert plain_text('ocr.txt') == 'AAAAB'

Loading…
Cancel
Save