🐛 dinglehopper: Fix tests to deal with new normalization logic

pull/38/head
Gerber, Mike 5 years ago
parent c3ae73d576
commit e3e7938162

@ -15,7 +15,7 @@ import unicodedata
@attr.s(frozen=True) @attr.s(frozen=True)
class ExtractedText: class ExtractedText:
segments = attr.ib() segments = attr.ib(converter=list)
joiner = attr.ib(type=str) joiner = attr.ib(type=str)
# TODO Types are not validated (attr does not do this yet) # TODO Types are not validated (attr does not do this yet)
@ -80,6 +80,7 @@ class ExtractedTextSegment:
segment_text = None segment_text = None
with suppress(AttributeError): with suppress(AttributeError):
segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
segment_text = segment_text or ''
segment_text = normalize_sbb(segment_text) segment_text = normalize_sbb(segment_text)
return cls(segment_id, segment_text) return cls(segment_id, segment_text)
@ -157,7 +158,7 @@ def page_extract(tree):
regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
# Filter empty region texts # Filter empty region texts
regions = [r for r in regions if r.text is not None] regions = (r for r in regions if r.text is not None)
return ExtractedText(regions, '\n') return ExtractedText(regions, '\n')
# FIXME needs to handle normalization # FIXME needs to handle normalization

@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration @pytest.mark.integration
def test_align_page_files(): def test_align_page_files():
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
# → 4 elements in the alignment should be different. # → 2 elements in the alignment should be different, the ligature is
# (currently) not counted due to normalization.
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
result = list(align(gt, ocr)) result = list(align(gt, ocr))
assert sum(left != right for left, right in result) == 4 for left, right in result:
if left != right:
print(left, right)
assert sum(left != right for left, right in result) == 2

@ -4,6 +4,7 @@ import os
import pytest import pytest
from lxml import etree as ET from lxml import etree as ET
from uniseg.graphemecluster import grapheme_clusters
from .. import character_error_rate, page_text, alto_text from .. import character_error_rate, page_text, alto_text
@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration @pytest.mark.integration
def test_character_error_rate_between_page_files(): def test_character_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
# The fi ligature does not count.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n
gt_len = len(list(grapheme_clusters(gt)))
expected_cer = 2/gt_len
assert character_error_rate(gt, ocr) == expected_cer
@pytest.mark.integration @pytest.mark.integration

@ -1,4 +1,3 @@
import os
import json import json
import pytest import pytest
@ -16,7 +15,11 @@ def test_cli_json(tmp_path):
with open('ocr.txt', 'w') as ocrf: with open('ocr.txt', 'w') as ocrf:
ocrf.write('AAAAB') ocrf.write('AAAAB')
with open('gt.txt', 'r') as gtf:
print(gtf.read())
process('gt.txt', 'ocr.txt', 'report') process('gt.txt', 'ocr.txt', 'report')
with open('report.json', 'r') as jsonf:
print(jsonf.read())
with open('report.json', 'r') as jsonf: with open('report.json', 'r') as jsonf:
j = json.load(jsonf) j = json.load(jsonf)
assert j['cer'] == pytest.approx(0.2) assert j['cer'] == pytest.approx(0.2)

@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration @pytest.mark.integration
def test_distance_between_page_files(): def test_distance_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
# Due to normalization, we don't count the ligature.
# → 2 differences
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
assert distance(gt, ocr) == 4 assert distance(gt, ocr) == 2
@pytest.mark.integration @pytest.mark.integration

@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration @pytest.mark.integration
def test_word_error_rate_between_page_files(): def test_word_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. → 3 changed words # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. So we have 3 changed words,
# the ligature does not count → 2 errors
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count assert len(list(words(gt))) == gt_word_count
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
assert word_error_rate(gt, ocr) == 3/gt_word_count assert word_error_rate(gt, ocr) == 2/gt_word_count
@pytest.mark.integration @pytest.mark.integration

@ -6,7 +6,8 @@ import textwrap
import pytest import pytest
from .. import alto_namespace, alto_text, page_namespace, page_text, text from .util import working_directory
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@ -49,27 +50,51 @@ def test_page_namespace():
def test_page_test(): def test_page_test():
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
result = page_text(tree) result = page_text(tree)
# We are currently normalizing on extraction, so the text is normalized.
#
# expected = textwrap.dedent("""\
# ber die vielen Sorgen wegen deelben vergaß
# Hartkopf, der Frau Amtmnnin das ver⸗
# ſproene zu berliefern. — Ein Erpreer
# wurde an ihn abgeſit, um ihn ums Him⸗
# melswien zu ſagen, daß er das Verſproene
# glei den Augenbli berbringen mte, die
# Frau Amtmnnin htte  auf ihn verlaen,
# und nun wßte e nit, was e anfangen
# ſote. Den Augenbli ſote er kommen,
# ſon vergieng e in ihrer Ang. — Die
# Ge wren ſon angekommen, und es fehlte
# ihr do no an aem. —
# Hartkopf mußte  er bennen, und
# endli na langem Nadenken fiel es ihm er
# wieder ein. — Er langte den Zettel aus dem
# Accisbue heraus, und ſagte ſeiner Frau, daß
# e das, was da wre, herbeyſaffen mte.
# Jndeß mangelten do einige Generalia, die
# alſo wegfielen. — Hartkopf gieng ſelb
# mit und berbrate es. —""")
expected = textwrap.dedent("""\ expected = textwrap.dedent("""\
ber die vielen Sorgen wegen deelben vergaß über die vielen Sorgen wegen deſſelben vergaß
Hartkopf, der Frau Amtmnnin das ver Hartkopf, der Frau Amtmännin das ver-
ſproene zu berliefern. Ein Erpreer ſprochene zu überliefern. Ein Erpreſſer
wurde an ihn abgeſit, um ihn ums Him wurde an ihn abgeſchickt, um ihn ums Him-
melswien zu ſagen, daß er das Verſproene melswillen zu ſagen, daß er das Verſprochene
glei den Augenbli berbringen mte, die gleich den Augenblick überbringen möchte, die
Frau Amtmnnin htte auf ihn verlaen, Frau Amtmännin hätte ſich auf ihn verlaſſen,
und nun wßte e nit, was e anfangen und nun wüßte ſie nicht, was ſie anfangen
ſote. Den Augenbli ſote er kommen, ſollte. Den Augenblick ſollte er kommen,
ſon vergieng e in ihrer Ang. Die ſonſt vergieng ſie in ihrer Angſt. Die
Ge wren ſon angekommen, und es fehlte Gäſte wären ſchon angekommen, und es fehlte
ihr do no an aem. ihr doch noch an allem.
Hartkopf mußte er bennen, und Hartkopf mußte ſich erſt beſinnen, und
endli na langem Nadenken fiel es ihm er endlich nach langem Nachdenken fiel es ihm erſt
wieder ein. Er langte den Zettel aus dem wieder ein. Er langte den Zettel aus dem
Accisbue heraus, und ſagte ſeiner Frau, daß Accisbuche heraus, und ſagte ſeiner Frau, daß
e das, was da wre, herbeyſaffen mte. ſie das, was da wäre, herbeyſchaffen möchte.
Jndeß mangelten do einige Generalia, die Jndeß mangelten doch einige Generalia, die
alſo wegfielen. Hartkopf gieng ſelb alſo wegfielen. Hartkopf gieng ſelbſt
mit und berbrate es. """) mit und überbrachte es. """)
assert result == expected assert result == expected
@ -92,7 +117,8 @@ def test_page_order():
tree = ET.parse(os.path.join(data_dir, 'order.page.xml')) tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
result = page_text(tree) result = page_text(tree)
assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL) print(result)
assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)
def test_page_mixed_regions(): def test_page_mixed_regions():
@ -106,5 +132,15 @@ def test_page_mixed_regions():
def test_text(): def test_text():
assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
assert "wieder ein. Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) assert "wieder ein. Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt')) assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
def test_plain(tmp_path):
with working_directory(str(tmp_path)):
with open('ocr.txt', 'w') as ocrf:
ocrf.write('AAAAB')
result = plain_text('ocr.txt')
expected = 'AAAAB'
assert result == expected

Loading…
Cancel
Save