1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-06-09 11:50:00 +02:00

✔ Add @cneud's former 40 GB problem files to the test suite

This commit is contained in:
Gerber, Mike 2023-03-02 16:24:08 +01:00
parent 0f0819512e
commit 0fd4ea1973
3 changed files with 40052 additions and 0 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,29 @@
from __future__ import division, print_function
import os
import pytest
from lxml import etree as ET
from uniseg.graphemecluster import grapheme_clusters
from .. import character_error_rate, page_text, alto_text
# Fixture directory: the "data" folder that sits next to this test module.
_module_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(_module_dir, "data")
@pytest.mark.integration
def test_bigger_texts():
    """Check that the character error rate of a large GT/OCR pair is computed.

    The fixture pair lives under ``data/bigger-texts/00008228``. The actual
    error rate is not checked; the test only requires that a non-negative
    result comes back.
    """
    fixture_dir = os.path.join(data_dir, "bigger-texts", "00008228")
    gt_path = os.path.join(fixture_dir, "00008228.gt.xml")
    ocr_path = os.path.join(fixture_dir, "00008228-00236534.gt4hist.xml")

    # Ground truth goes through page_text(), the OCR result through alto_text().
    gt = page_text(ET.parse(gt_path))
    ocr = alto_text(ET.parse(ocr_path))

    # Getting any result at all is the point: earlier versions would have
    # needed tens of GB of RAM for this input, which should no longer be
    # a problem.
    assert character_error_rate(gt, ocr) >= 0.0