mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
✔ Add @cneud's former 40 GB problem files to the test suite
This commit is contained in:
parent
0f0819512e
commit
0fd4ea1973
3 changed files with 40052 additions and 0 deletions
File diff suppressed because it is too large
Load diff
22865
qurator/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
Normal file
22865
qurator/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
Normal file
File diff suppressed because it is too large
Load diff
29
qurator/dinglehopper/tests/test_integ_bigger_texts.py
Normal file
29
qurator/dinglehopper/tests/test_integ_bigger_texts.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .. import character_error_rate, page_text, alto_text
|
||||
|
||||
# Test fixtures live in the "data" directory that sits next to this module;
# resolve it from this file's absolute location so the tests do not depend on
# the current working directory.
_tests_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(_tests_dir, "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_bigger_texts():
    """Check that the character error rate of a large document pair can be computed.

    The ground-truth text is extracted with ``page_text`` and the OCR text with
    ``alto_text`` from the two XML files of the ``bigger-texts/00008228``
    sample. The exact error rate is not checked; the test only requires that a
    result is produced at all.
    """
    sample_dir = os.path.join(data_dir, "bigger-texts", "00008228")

    gt_tree = ET.parse(os.path.join(sample_dir, "00008228.gt.xml"))
    gt = page_text(gt_tree)

    ocr_tree = ET.parse(
        os.path.join(sample_dir, "00008228-00236534.gt4hist.xml")
    )
    ocr = alto_text(ocr_tree)

    # We only care that a result comes back: earlier versions would have
    # needed tens of GB of RAM for this input, whereas now it should be
    # handled without any trouble.
    cer = character_error_rate(gt, ocr)
    assert cer >= 0.0
|
Loading…
Add table
Add a link
Reference in a new issue