dinglehopper/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py

from __future__ import division, print_function

import os

import pytest
from lxml import etree as ET

from .. import character_error_rate, page_text, alto_text

# Absolute path of the 'data' directory that sits next to this test module;
# every fixture file used below is looked up relative to it.
data_dir = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'data',
)


@pytest.mark.integration
def test_character_error_rate_between_page_files():
    """Check the CER between a PAGE ground-truth file and a hand-altered copy.

    Both files are read with ``page_text``; the second one acts as fake OCR
    output in which 2 characters were changed and a ligature was replaced by
    its plain two-letter spelling.
    """
    gt_path = os.path.join(data_dir, 'test-gt.page2018.xml')
    fake_ocr_path = os.path.join(data_dir, 'test-fake-ocr.page2018.xml')

    reference_text = page_text(ET.parse(gt_path))
    candidate_text = page_text(ET.parse(fake_ocr_path))
    measured = character_error_rate(reference_text, candidate_text)

    # Denominator: 2 TextRegions (470 and 311 characters) joined by 1 "\n".
    # Numerator: 4 edits in total -- presumably 2 for the changed characters
    # and 2 for the ligature replacement (NOTE(review): confirm the split).
    expected = 4/(470 + 1 + 311)
    assert measured == expected


@pytest.mark.integration
def test_character_error_rate_between_page_alto():
    """Check that matching PAGE and ALTO lorem-ipsum fixtures give a CER of 0.

    The ground truth is read with ``page_text`` and the OCR result with
    ``alto_text``; the two extracted texts are expected to be equal.
    """
    fixture_dir = os.path.join(data_dir, 'lorem-ipsum')
    gt_path = os.path.join(fixture_dir, 'lorem-ipsum-scan.gt.page.xml')
    ocr_path = os.path.join(fixture_dir, 'lorem-ipsum-scan.ocr.tesseract.alto.xml')

    reference_text = page_text(ET.parse(gt_path))
    candidate_text = alto_text(ET.parse(ocr_path))

    # Equality of the extracted texts is asserted first, so a mismatch shows
    # up as a text diff before the error rate is even computed.
    assert reference_text == candidate_text
    assert character_error_rate(reference_text, candidate_text) == 0


@pytest.mark.integration
def test_character_error_rate_between_page_alto_2():
    """Check the CER between the "bad" lorem-ipsum PAGE and ALTO fixtures.

    The ground truth is read with ``page_text`` and the OCR result with
    ``alto_text``; the expected rate of 8/591 was verified manually.
    """
    fixture_dir = os.path.join(data_dir, 'lorem-ipsum')
    gt_path = os.path.join(fixture_dir, 'lorem-ipsum-scan-bad.gt.page.xml')
    ocr_path = os.path.join(fixture_dir, 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')

    reference_text = page_text(ET.parse(gt_path))
    candidate_text = alto_text(ET.parse(ocr_path))
    measured = character_error_rate(reference_text, candidate_text)

    expected = 8/591  # Manually verified
    assert measured == expected
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit a3c1eee8f31349edcfb1e36920763bcecceb1129, reversing changes made to dc76213ffc1fbabc2c45f0e52ced55449bdf2e83. 2019-12-09 12:44:05 +01:00			`from __future__ import division, print_function`

			`import os`

			`import pytest`
			`from lxml import etree as ET`

			`from .. import character_error_rate, page_text, alto_text`

			`data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')`


			`@pytest.mark.integration`
			`def test_character_error_rate_between_page_files():`
			`# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.`
			`gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))`
			`ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))`
			`assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n`


			`@pytest.mark.integration`
			`def test_character_error_rate_between_page_alto():`
			`gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))`
			`ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))`

			`assert gt == ocr`
			`assert character_error_rate(gt, ocr) == 0`


			`@pytest.mark.integration`
			`def test_character_error_rate_between_page_alto_2():`
			`gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))`
			`ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))`

			`assert character_error_rate(gt, ocr) == 8/591 # Manually verified`