mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-14 21:09:56 +02:00
36 lines
1.3 KiB
Python
36 lines
1.3 KiB
Python
|
from __future__ import division, print_function
|
||
|
|
||
|
import os
|
||
|
|
||
|
import pytest
|
||
|
from lxml import etree as ET
|
||
|
|
||
|
from .. import character_error_rate, page_text, alto_text
|
||
|
|
||
|
# Fixture directory, resolved relative to this module so the tests do not
# depend on the current working directory.
data_dir = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'data',
)
|
||
|
|
||
|
|
||
|
@pytest.mark.integration
def test_character_error_rate_between_page_files():
    """Check the CER between a PAGE ground truth and a hand-edited PAGE file.

    In the fake OCR file, 2 characters were changed and a fi ligature was
    replaced with the two letters "fi"; the expected error count is 4.
    """
    gt_path = os.path.join(data_dir, 'test-gt.page2018.xml')
    ocr_path = os.path.join(data_dir, 'test-fake-ocr.page2018.xml')

    gt = page_text(ET.parse(gt_path))
    ocr = page_text(ET.parse(ocr_path))

    # Denominator per the original note: 2 TextRegions joined by 1 "\n".
    expected_cer = 4/(470 + 1 + 311)
    assert character_error_rate(gt, ocr) == expected_cer
|
||
|
|
||
|
|
||
|
@pytest.mark.integration
def test_character_error_rate_between_page_alto():
    """Check that the PAGE ground truth and the ALTO OCR output match exactly.

    Both extracted texts must be equal, and the character error rate
    between them must therefore be 0.
    """
    lorem_dir = os.path.join(data_dir, 'lorem-ipsum')
    gt_tree = ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan.gt.page.xml'))
    gt = page_text(gt_tree)
    ocr_tree = ET.parse(
        os.path.join(lorem_dir, 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))
    ocr = alto_text(ocr_tree)

    assert gt == ocr
    assert character_error_rate(gt, ocr) == 0
|
||
|
|
||
|
|
||
|
@pytest.mark.integration
def test_character_error_rate_between_page_alto_2():
    """Check the CER between the "bad" PAGE ground truth and ALTO OCR output.

    The expected value of 8/591 was verified manually.
    """
    lorem_dir = os.path.join(data_dir, 'lorem-ipsum')
    gt_tree = ET.parse(
        os.path.join(lorem_dir, 'lorem-ipsum-scan-bad.gt.page.xml'))
    gt = page_text(gt_tree)
    ocr_tree = ET.parse(
        os.path.join(lorem_dir, 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))
    ocr = alto_text(ocr_tree)

    expected_cer = 8/591  # Manually verified
    assert character_error_rate(gt, ocr) == expected_cer
|