1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-07-06 17:09:59 +02:00
dinglehopper/qurator/dinglehopper/tests/test_integ_align.py
Gerber, Mike f94e8b9b1c Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector"
This reverts commit a3c1eee8f31349edcfb1e36920763bcecceb1129, reversing
changes made to dc76213ffc1fbabc2c45f0e52ced55449bdf2e83.
2019-12-09 12:44:05 +01:00

23 lines
776 B
Python

from __future__ import division, print_function
import os
import pytest
from lxml import etree as ET
from .. import align, page_text
# Absolute path of the ``data`` directory located next to this test module;
# the test below loads its PAGE XML fixtures from here.
_tests_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(_tests_dir, 'data')
@pytest.mark.integration
def test_align_page_files():
    """Align the ground-truth PAGE file against the fake OCR PAGE file.

    In the fake OCR file, 2 characters were changed and one "fi" ligature
    was replaced with the plain letters "fi", so 4 elements of the
    alignment are expected to differ.

    NOTE: In this example it does not matter that the alignment works with
    "characters" rather than grapheme clusters.
    """
    gt_path = os.path.join(data_dir, 'test-gt.page2018.xml')
    ocr_path = os.path.join(data_dir, 'test-fake-ocr.page2018.xml')

    # Extract the text of both documents (ground truth first, then OCR).
    gt_text = page_text(ET.parse(gt_path))
    ocr_text = page_text(ET.parse(ocr_path))

    # Count the aligned pairs whose two sides are not equal.
    mismatch_count = sum(
        1 for gt_elem, ocr_elem in align(gt_text, ocr_text)
        if gt_elem != ocr_elem
    )
    assert mismatch_count == 4