You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
28 lines
932 B
Python
28 lines
932 B
Python
5 years ago
|
from __future__ import division, print_function
|
||
|
|
||
|
import os
|
||
|
|
||
|
import pytest
|
||
|
from lxml import etree as ET
|
||
|
|
||
|
from .. import align, page_text
|
||
|
|
||
4 years ago
|
# Absolute path of the "data" directory that sits next to this test module;
# resolved from __file__ so it does not depend on the current working directory.
data_dir = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "data",
)
|
||
5 years ago
|
|
||
|
|
||
|
@pytest.mark.integration
def test_align_page_files():
    """Align the ground truth PAGE text with the fake OCR PAGE text and count the differences."""
    # The fake OCR file has 2 changed characters and a ﬁ ligature replaced with fi.
    # → Exactly 2 aligned elements should differ; the ligature is (currently)
    #   not counted due to normalization.
    # NOTE: In this example, it doesn't matter that we work with "characters",
    #       not grapheme clusters.
    gt_path = os.path.join(data_dir, "test-gt.page2018.xml")
    ocr_path = os.path.join(data_dir, "test-fake-ocr.page2018.xml")

    gt = page_text(ET.parse(gt_path))
    ocr = page_text(ET.parse(ocr_path))

    result = list(align(gt, ocr))

    # Gather the differing pairs once; they drive both the diagnostic output
    # and the final count check.
    mismatches = [(left, right) for left, right in result if left != right]
    for left, right in mismatches:
        # Shown by pytest on failure, to make the differing elements visible.
        print(left, right)

    assert len(mismatches) == 2
|