✔ Add @cneud's former 40 GB problem files to the test suite
parent 0f0819512e
commit 0fd4ea1973
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,29 @@
from __future__ import division, print_function

import os

import pytest
from lxml import etree as ET
from uniseg.graphemecluster import grapheme_clusters

from .. import character_error_rate, page_text, alto_text

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
def test_bigger_texts():
    gt = page_text(
        ET.parse(os.path.join(data_dir, "bigger-texts", "00008228", "00008228.gt.xml"))
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "bigger-texts", "00008228", "00008228-00236534.gt4hist.xml"
            )
        )
    )

    # Only interested in a result here: In earlier versions this would have used
    # tens of GB of RAM and should now not break a sweat.
    assert character_error_rate(gt, ocr) >= 0.0
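
An aside on the RAM comment in the test, not part of this commit: a common way to keep an edit-distance-based character error rate memory-bounded is to compute the Levenshtein distance over grapheme clusters with only two rows of the dynamic-programming matrix, so memory grows with one text's length rather than the product of both. The sketch below is a generic illustration of that idea under these assumptions; the function name is made up and this is not the implementation the commit relies on.

from __future__ import division

from uniseg.graphemecluster import grapheme_clusters


def character_error_rate_sketch(gt, ocr):
    # Split both texts into grapheme clusters (user-perceived characters).
    gt_g = list(grapheme_clusters(gt))
    ocr_g = list(grapheme_clusters(ocr))
    # Levenshtein distance with two rows instead of a full (m+1) x (n+1)
    # matrix, so memory stays O(len(ocr_g)) instead of O(m * n).
    prev = list(range(len(ocr_g) + 1))
    for i, g in enumerate(gt_g, start=1):
        curr = [i] + [0] * len(ocr_g)
        for j, o in enumerate(ocr_g, start=1):
            cost = 0 if g == o else 1
            curr[j] = min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost)
        prev = curr
    distance = prev[-1]
    # Express the result as edits per ground-truth grapheme cluster.
    return distance / len(gt_g) if gt_g else float(len(ocr_g) > 0)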
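
A further aside on the @pytest.mark.integration marker used above, also not part of this commit: custom pytest markers are normally registered in a conftest.py or an ini-style config, otherwise pytest emits a warning for unknown marks. The following is a minimal, hypothetical conftest.py sketch; the marker description text is assumed, not taken from the project.

def pytest_configure(config):
    # Registering the marker avoids PytestUnknownMarkWarning and lets slow
    # tests be selected with "pytest -m integration" or excluded with
    # "pytest -m 'not integration'".
    config.addinivalue_line(
        "markers",
        "integration: slow tests that exercise larger input files",
    )

With the marker registered, "pytest -m integration" runs only these tests, while "pytest -m 'not integration'" skips them in a regular test run.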