Mirror of https://github.com/qurator-spk/dinglehopper.git

🐛 dinglehopper: Fix tests to deal with new normalization logic

parent c3ae73d576
commit e3e7938162

7 changed files with 85 additions and 32 deletions

@@ -15,7 +15,7 @@ import unicodedata

 @attr.s(frozen=True)
 class ExtractedText:
-    segments = attr.ib()
+    segments = attr.ib(converter=list)
     joiner = attr.ib(type=str)
     # TODO Types are not validated (attr does not do this yet)

@@ -80,6 +80,7 @@ class ExtractedTextSegment:
         segment_text = None
         with suppress(AttributeError):
             segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
+            segment_text = segment_text or ''
             segment_text = normalize_sbb(segment_text)
         return cls(segment_id, segment_text)

@@ -157,7 +158,7 @@ def page_extract(tree):
             regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))

     # Filter empty region texts
-    regions = [r for r in regions if r.text is not None]
+    regions = (r for r in regions if r.text is not None)

     return ExtractedText(regions, '\n')
     # FIXME needs to handle normalization
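Why these two hunks go together: page_extract now hands ExtractedText a generator expression instead of a list, which only works because segments gained converter=list. A minimal sketch of that attrs behavior (a standalone toy class, not the project's real one):

import attr

@attr.s(frozen=True)
class Toy:
    # converter=list materializes any iterable at construction time,
    # so passing a one-shot generator is safe even on a frozen class.
    segments = attr.ib(converter=list)
    joiner = attr.ib(type=str)

toy = Toy((r for r in ['a', None, 'b'] if r is not None), '\n')
assert toy.segments == ['a', 'b']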
@@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_align_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
-    # → 4 elements in the alignment should be different.
+    # → 2 elements in the alignment should be different, the ligature is
+    # (currently) not counted due to normalization.
     # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.

     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))

     result = list(align(gt, ocr))
-    assert sum(left != right for left, right in result) == 4
+    for left, right in result:
+        if left != right:
+            print(left, right)
+    assert sum(left != right for left, right in result) == 2
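For context on the changed expectation: Unicode compatibility normalization folds the single-codepoint fi ligature into plain "fi", so after extraction-time normalization both sides of the alignment agree at that spot. The project's normalize_sbb has its own rules; NFKC is shown here only as a standard analogue:

import unicodedata

# U+FB01 is the single-codepoint "fi" ligature.
assert unicodedata.normalize('NFKC', '\ufb01') == 'fi'
assert unicodedata.normalize('NFC', '\ufb01') == '\ufb01'  # NFC alone keeps it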
@@ -4,6 +4,7 @@ import os

 import pytest
 from lxml import etree as ET
+from uniseg.graphemecluster import grapheme_clusters

 from .. import character_error_rate, page_text, alto_text

@@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_character_error_rate_between_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
+    # The fi ligature does not count.
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311)  # 2 TextRegions, 1 \n
+
+    gt_len = len(list(grapheme_clusters(gt)))
+    expected_cer = 2/gt_len
+
+    assert character_error_rate(gt, ocr) == expected_cer


 @pytest.mark.integration
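The rewritten assertion defines the CER relative to grapheme clusters rather than a hard-coded character count. A small illustration of why grapheme clusters (via uniseg, imported above) can differ from len():

from uniseg.graphemecluster import grapheme_clusters

s = 'Scho\u0308n'  # "Schön" written with a combining diaeresis
assert len(s) == 6                           # six code points
assert len(list(grapheme_clusters(s))) == 5  # five perceived characters
# expected_cer = n_character_errors / len(list(grapheme_clusters(gt)))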
@@ -1,4 +1,3 @@
-import os
 import json

 import pytest
@@ -16,7 +15,11 @@ def test_cli_json(tmp_path):
         with open('ocr.txt', 'w') as ocrf:
             ocrf.write('AAAAB')

+        with open('gt.txt', 'r') as gtf:
+            print(gtf.read())
         process('gt.txt', 'ocr.txt', 'report')
+        with open('report.json', 'r') as jsonf:
+            print(jsonf.read())
         with open('report.json', 'r') as jsonf:
             j = json.load(jsonf)
         assert j['cer'] == pytest.approx(0.2)
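Sanity check of the asserted value, assuming the ground-truth fixture written earlier in this test is 'AAAAA' (gt.txt's content is not visible in this hunk): one substituted character out of five gives the expected CER.

assert 1 / 5 == 0.2  # 'AAAAB' vs. presumed 'AAAAA': 1 error over 5 characters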
@@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_distance_between_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
+    # Due to normalization, we don't count the ligature.
+    # → 2 differences
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert distance(gt, ocr) == 4
+    assert distance(gt, ocr) == 2


 @pytest.mark.integration
@@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

 @pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words
+    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
+    # the ligature does not count → 2 errors
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))

     gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4  # Manually verified word count per line
     assert len(list(words(gt))) == gt_word_count

     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert word_error_rate(gt, ocr) == 3/gt_word_count
+    assert word_error_rate(gt, ocr) == 2/gt_word_count


 @pytest.mark.integration
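Word-level accounting behind the new figure: the two character errors fall in two distinct words, and the ligature word matches after normalization. A toy illustration with hypothetical tokens (not the fixture's actual words):

gt_tokens  = ['Die', 'Gäſte', 'wären', 'ſchon']  # hypothetical GT words
ocr_tokens = ['Dle', 'Gäſte', 'wåren', 'ſchon']  # two substituted words
assert sum(g != o for g, o in zip(gt_tokens, ocr_tokens)) == 2
# word_error_rate = 2 / gt_word_count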
@@ -6,7 +6,8 @@ import textwrap

 import pytest

-from .. import alto_namespace, alto_text, page_namespace, page_text, text
+from .util import working_directory
+from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@@ -49,27 +50,51 @@ def test_page_namespace():
 def test_page_test():
     tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
     result = page_text(tree)

+    # We are currently normalizing on extraction, so the text is normalized.
+    #
+    # expected = textwrap.dedent("""\
+    # ber die vielen Sorgen wegen deelben vergaß
+    # Hartkopf, der Frau Amtmnnin das ver⸗
+    # ſproene zu berliefern. — Ein Erpreer
+    # wurde an ihn abgeſit, um ihn ums Him⸗
+    # melswien zu ſagen, daß er das Verſproene
+    # glei den Augenbli berbringen mte, die
+    # Frau Amtmnnin htte auf ihn verlaen,
+    # und nun wßte e nit, was e anfangen
+    # ſote. Den Augenbli ſote er kommen,
+    # ſon vergieng e in ihrer Ang. — Die
+    # Ge wren ſon angekommen, und es fehlte
+    # ihr do no an aem. —
+    # Hartkopf mußte er bennen, und
+    # endli na langem Nadenken fiel es ihm er
+    # wieder ein. — Er langte den Zettel aus dem
+    # Accisbue heraus, und ſagte ſeiner Frau, daß
+    # e das, was da wre, herbeyſaffen mte.
+    # Jndeß mangelten do einige Generalia, die
+    # alſo wegfielen. — Hartkopf gieng ſelb
+    # mit und berbrate es. —""")
     expected = textwrap.dedent("""\
-        ber die vielen Sorgen wegen deelben vergaß
-        Hartkopf, der Frau Amtmnnin das ver⸗
-        ſproene zu berliefern. — Ein Erpreer
-        wurde an ihn abgeſit, um ihn ums Him⸗
-        melswien zu ſagen, daß er das Verſproene
-        glei den Augenbli berbringen mte, die
-        Frau Amtmnnin htte auf ihn verlaen,
-        und nun wßte e nit, was e anfangen
-        ſote. Den Augenbli ſote er kommen,
-        ſon vergieng e in ihrer Ang. — Die
-        Ge wren ſon angekommen, und es fehlte
-        ihr do no an aem. —
-        Hartkopf mußte er bennen, und
-        endli na langem Nadenken fiel es ihm er
-        wieder ein. — Er langte den Zettel aus dem
-        Accisbue heraus, und ſagte ſeiner Frau, daß
-        e das, was da wre, herbeyſaffen mte.
-        Jndeß mangelten do einige Generalia, die
-        alſo wegfielen. — Hartkopf gieng ſelb
-        mit und berbrate es. —""")
+        über die vielen Sorgen wegen deſſelben vergaß
+        Hartkopf, der Frau Amtmännin das ver-
+        ſprochene zu überliefern. – Ein Erpreſſer
+        wurde an ihn abgeſchickt, um ihn ums Him-
+        melswillen zu ſagen, daß er das Verſprochene
+        gleich den Augenblick überbringen möchte, die
+        Frau Amtmännin hätte ſich auf ihn verlaſſen,
+        und nun wüßte ſie nicht, was ſie anfangen
+        ſollte. Den Augenblick ſollte er kommen,
+        ſonſt vergieng ſie in ihrer Angſt. – Die
+        Gäſte wären ſchon angekommen, und es fehlte
+        ihr doch noch an allem. –
+        Hartkopf mußte ſich erſt beſinnen, und
+        endlich nach langem Nachdenken fiel es ihm erſt
+        wieder ein. – Er langte den Zettel aus dem
+        Accisbuche heraus, und ſagte ſeiner Frau, daß
+        ſie das, was da wäre, herbeyſchaffen möchte.
+        Jndeß mangelten doch einige Generalia, die
+        alſo wegfielen. – Hartkopf gieng ſelbſt
+        mit und überbrachte es. –""")
     assert result == expected

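The new expected text reflects the extraction-time normalization: historical glyphs in the unnormalized transcription (some of them private-use/MUFI characters that do not survive this page's rendering, hence the gaps in the commented-out block above) are folded to modern equivalents, for example the double oblique hyphen '⸗' to '-' and the em dash '—' to '–'. NFC alone does not do this, so the substitutions presumably live in normalize_sbb:

import unicodedata

# u + U+0364 (combining small e) has no precomposed form, so NFC keeps it:
assert unicodedata.normalize('NFC', 'u\u0364') == 'u\u0364'
# the 'ü', '-', '–' forms in the new expected text therefore come from
# project-specific substitutions, not from standard normalization alone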
@@ -92,7 +117,8 @@ def test_page_order():
     tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
     result = page_text(tree)

-    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
+    print(result)
+    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)


 def test_page_mixed_regions():
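The loosened pattern in this hunk seems meant to tolerate spellings that differ in length after normalization, for example 'gewißer' versus 'gewiſſer' (an assumption; the fixture text is not shown here):

import re

# '.{1,2}' matches both a one- and a two-character spelling at that spot
assert re.search(r'Ein gewi.{1,2}er Lord', 'Ein gewiſſer Lord')
assert re.search(r'Ein gewi.{1,2}er Lord', 'Ein gewißer Lord')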
@@ -106,5 +132,15 @@ def test_page_mixed_regions():

 def test_text():
     assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
-    assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
+    assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
     assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
+
+
+def test_plain(tmp_path):
+    with working_directory(str(tmp_path)):
+        with open('ocr.txt', 'w') as ocrf:
+            ocrf.write('AAAAB')
+
+        result = plain_text('ocr.txt')
+        expected = 'AAAAB'
+        assert result == expected
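The new test_plain relies on working_directory from .util (added to the imports above). The diff does not show that helper; a plausible sketch, assuming it is a simple chdir context manager (the project's actual implementation may differ):

import os
from contextlib import contextmanager

@contextmanager
def working_directory(path):
    """Temporarily chdir into path, restoring the old cwd afterwards."""
    old_cwd = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(old_cwd)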