1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-06-30 22:19:57 +02:00
dinglehopper/qurator/dinglehopper/tests/test_ocr_files.py
Gerber, Mike f94e8b9b1c Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector"
This reverts commit a3c1eee8f31349edcfb1e36920763bcecceb1129, reversing
changes made to dc76213ffc1fbabc2c45f0e52ced55449bdf2e83.
2019-12-09 12:44:05 +01:00

110 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import re
import lxml.etree as ET
import textwrap
import pytest
from .. import alto_namespace, alto_text, page_namespace, page_text, text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
def test_alto_namespace():
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
assert alto_namespace(tree) == 'http://www.loc.gov/standards/alto/ns-v3#'
def test_alto_text():
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
result = alto_text(tree)
expected = textwrap.dedent("""\
über die vielen Sorgen wegen deſſelben vergaß
Hartkopf, der Frau Amtmännin das ver-
ſprochene zu überliefern.""")
assert result == expected
def test_alto_text_ALTO1():
tree = ET.parse(os.path.join(data_dir, 'test.alto1.xml'))
assert "being erected at the Broadway stock" in alto_text(tree)
def test_alto_text_ALTO2():
tree = ET.parse(os.path.join(data_dir, 'test.alto2.xml'))
assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in alto_text(tree)
def test_alto_text_ALTO3():
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree)
def test_page_namespace():
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
assert page_namespace(tree) == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15'
def test_page_test():
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
result = page_text(tree)
expected = textwrap.dedent("""\
ber die vielen Sorgen wegen deelben vergaß
Hartkopf, der Frau Amtmnnin das ver⸗
ſproene zu berliefern. — Ein Erpreer
wurde an ihn abgeſit, um ihn ums Him⸗
melswien zu ſagen, daß er das Verſproene
glei den Augenbli berbringen mte, die
Frau Amtmnnin htte  auf ihn verlaen,
und nun wßte e nit, was e anfangen
ſote. Den Augenbli ſote er kommen,
ſon vergieng e in ihrer Ang. — Die
Ge wren ſon angekommen, und es fehlte
ihr do no an aem. —
Hartkopf mußte  er bennen, und
endli na langem Nadenken fiel es ihm er
wieder ein. — Er langte den Zettel aus dem
Accisbue heraus, und ſagte ſeiner Frau, daß
e das, was da wre, herbeyſaffen mte.
Jndeß mangelten do einige Generalia, die
alſo wegfielen. — Hartkopf gieng ſelb
mit und berbrate es. —""")
assert result == expected
def test_page_with_empty_region():
# This file contains an empty TextRegion:
#
# <TextRegion id="region0000">
# <Coords points="488,133 1197,133 1197,193 488,193"/>
# <TextEquiv>
# <Unicode></Unicode>
# </TextEquiv>
# </TextRegion>
tree = ET.parse(os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml'))
result = page_text(tree)
assert result
def test_page_order():
# This file contains TextRegions where file order is not the same as reading order.
tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
result = page_text(tree)
assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
def test_page_mixed_regions():
# This file contains ImageRegions and TextRegions in the ReadingOrder
tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml'))
with pytest.warns(UserWarning, match=r'Not a TextRegion'):
result = page_text(tree)
assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result
def test_text():
assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))