mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-30 22:19:57 +02:00
This reverts commit a3c1eee8f31349edcfb1e36920763bcecceb1129, reversing changes made to dc76213ffc1fbabc2c45f0e52ced55449bdf2e83.
110 lines
4.1 KiB
Python
110 lines
4.1 KiB
Python
import os
|
||
import re
|
||
|
||
import lxml.etree as ET
|
||
import textwrap
|
||
|
||
import pytest
|
||
|
||
from .. import alto_namespace, alto_text, page_namespace, page_text, text
|
||
|
||
# Absolute path of the 'data' directory that sits next to this test module;
# the tests below load their fixture files from it.
_this_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(_this_dir, 'data')
|
||
|
||
|
||
def test_alto_namespace():
    """Check the namespace URI that alto_namespace() reports for test.alto3.xml."""
    alto3_path = os.path.join(data_dir, 'test.alto3.xml')
    namespace = alto_namespace(ET.parse(alto3_path))
    assert namespace == 'http://www.loc.gov/standards/alto/ns-v3#'
|
||
|
||
|
||
def test_alto_text():
    """Check that alto_text() returns exactly the expected lines for test.alto3.xml."""
    # dedent() strips the common indentation, so the literal can be indented
    # along with the code.
    expected = textwrap.dedent("""\
        über die vielen Sorgen wegen deſſelben vergaß
        Hartkopf, der Frau Amtmännin das ver-
        ſprochene zu überliefern.""")

    alto3_path = os.path.join(data_dir, 'test.alto3.xml')
    result = alto_text(ET.parse(alto3_path))

    assert result == expected
|
||
|
||
|
||
def test_alto_text_ALTO1():
    """Check that a known phrase appears in the text extracted from test.alto1.xml."""
    alto1_path = os.path.join(data_dir, 'test.alto1.xml')
    extracted = alto_text(ET.parse(alto1_path))
    assert "being erected at the Broadway stock" in extracted
|
||
|
||
|
||
def test_alto_text_ALTO2():
    """Check that a known two-line passage appears in the text from test.alto2.xml."""
    alto2_path = os.path.join(data_dir, 'test.alto2.xml')
    extracted = alto_text(ET.parse(alto2_path))
    # The expected passage spans a line break, hence the embedded "\n".
    assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in extracted
|
||
|
||
|
||
def test_alto_text_ALTO3():
    """Check that a known phrase appears in the text extracted from test.alto3.xml."""
    alto3_path = os.path.join(data_dir, 'test.alto3.xml')
    extracted = alto_text(ET.parse(alto3_path))
    assert "über die vielen Sorgen wegen deſſelben vergaß" in extracted
|
||
|
||
|
||
def test_page_namespace():
    """Check the namespace URI that page_namespace() reports for test.page2018.xml."""
    page_path = os.path.join(data_dir, 'test.page2018.xml')
    namespace = page_namespace(ET.parse(page_path))
    assert namespace == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15'
|
||
|
||
|
||
def test_page_test():
    """Check that page_text() returns exactly the expected text for test.page2018.xml."""
    # dedent() strips the common indentation, so the literal can be indented
    # along with the code.
    expected = textwrap.dedent("""\
        ber die vielen Sorgen wegen deelben vergaß
        Hartkopf, der Frau Amtmnnin das ver⸗
        ſproene zu berliefern. — Ein Erpreer
        wurde an ihn abgeſit, um ihn ums Him⸗
        melswien zu ſagen, daß er das Verſproene
        glei den Augenbli berbringen mte, die
        Frau Amtmnnin htte auf ihn verlaen,
        und nun wßte e nit, was e anfangen
        ſote. Den Augenbli ſote er kommen,
        ſon vergieng e in ihrer Ang. — Die
        Ge wren ſon angekommen, und es fehlte
        ihr do no an aem. —
        Hartkopf mußte er bennen, und
        endli na langem Nadenken fiel es ihm er
        wieder ein. — Er langte den Zettel aus dem
        Accisbue heraus, und ſagte ſeiner Frau, daß
        e das, was da wre, herbeyſaffen mte.
        Jndeß mangelten do einige Generalia, die
        alſo wegfielen. — Hartkopf gieng ſelb
        mit und berbrate es. —""")

    page_path = os.path.join(data_dir, 'test.page2018.xml')
    result = page_text(ET.parse(page_path))

    assert result == expected
|
||
|
||
|
||
def test_page_with_empty_region():
    """Check that page_text() yields a non-empty result for a file with an empty TextRegion."""
    # The fixture contains this empty TextRegion:
    #
    #   <TextRegion id="region0000">
    #     <Coords points="488,133 1197,133 1197,193 488,193"/>
    #     <TextEquiv>
    #       <Unicode></Unicode>
    #     </TextEquiv>
    #   </TextRegion>
    fixture_path = os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml')
    extracted = page_text(ET.parse(fixture_path))
    assert extracted
|
||
|
||
|
||
def test_page_order():
    """Check that known snippets occur in the expected sequence in page_text() output."""
    # In this fixture the TextRegions' file order differs from the reading order.
    # DOTALL lets ".*" span the line breaks between the snippets.
    expected_sequence = r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die'

    fixture_path = os.path.join(data_dir, 'order.page.xml')
    extracted = page_text(ET.parse(fixture_path))

    assert re.search(expected_sequence, extracted, re.DOTALL)
|
||
|
||
|
||
def test_page_mixed_regions():
    """Check that page_text() warns about non-text regions and still returns the text."""
    # The ReadingOrder of this fixture references ImageRegions as well as TextRegions.
    fixture_path = os.path.join(data_dir, 'mixed-regions.page.xml')
    page_tree = ET.parse(fixture_path)

    # Only the extraction call is expected to emit the warning.
    with pytest.warns(UserWarning, match=r'Not a TextRegion'):
        extracted = page_text(page_tree)

    assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in extracted
|
||
|
||
|
||
def test_text():
    """Check that text() finds a known snippet in each fixture when given a file path."""
    # (fixture file name, snippet expected in the extracted text)
    cases = [
        ('test.alto1.xml', "being erected at the Broadway stock"),
        ('test.page2018.xml', "wieder ein. — Er langte den Zettel aus dem"),
        ('test.txt', "Lorem ipsum"),
    ]
    for filename, snippet in cases:
        assert snippet in text(os.path.join(data_dir, filename))
|