You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
dinglehopper/qurator/dinglehopper/tests/test_integ_word_error_rate_...

44 lines
1.6 KiB
Python

from __future__ import division, print_function
import os
import pytest
from lxml import etree as ET
from .. import word_error_rate, words, page_text, alto_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
def test_word_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
assert word_error_rate(gt, ocr) == 3/gt_word_count
@pytest.mark.integration
def test_word_error_rate_between_page_alto():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
assert gt == ocr
assert word_error_rate(gt, ocr) == 0
@pytest.mark.integration
def test_word_error_rate_between_page_alto_2():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors)