You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
page2tsv/tests/test_imexport.py

47 lines
1.7 KiB
Python

from pathlib import Path
from shutil import copytree
from pytest import fixture
from ocrd_utils import pushd_popd
from ocrd_models.ocrd_page import parse
from ocrd import Resolver
from qurator.tsvtools.ocrd_processors import OcrdNeatExportProcessor, OcrdNeatImportProcessor
@fixture
def testws(tmpdir):
    """Return a workspace opened on a scratch copy of ``tests/testws``.

    The sample directory is copied below pytest's ``tmpdir`` first, so files
    written by a test land in the copy rather than in the checked-in data.
    ``tests/testws`` is a relative path and is therefore resolved against the
    current working directory.
    """
    scratch_ws = f'{tmpdir}/ws'
    copytree('tests/testws', scratch_ws)
    resolver = Resolver()
    return resolver.workspace_from_url(f'{tmpdir}/ws/mets.xml')
def test_imexport(testws):
    """Round-trip one page: export to TSV, fix a typo there, import it back.

    Steps:
      1. Run the export processor from file group ``TESS`` into ``OUT`` and
         check the resulting TSV for an expected row and header comment.
      2. Replace the misspelling ``Stantenbund`` with ``Staatenbund`` in the
         TSV on disk.
      3. Run the import processor on ``TESS,OUT`` into ``TESS-CORRECTED`` and
         check that only the corrected PAGE file carries the fixed spelling.
    """
    ws_root = Path(testws.directory)

    def second_line_text(pcgts):
        # Text of the first TextEquiv of the second TextLine in the first
        # TextRegion of the page.
        first_region = pcgts.get_Page().get_TextRegion()[0]
        return first_region.get_TextLine()[1].get_TextEquiv()[0].Unicode

    # --- export -----------------------------------------------------------
    OcrdNeatExportProcessor(
        workspace=testws,
        input_file_grp='TESS',
        output_file_grp='OUT',
    ).process()

    tsv_path = ws_root / 'OUT/FILE_0005_OUT.tsv'
    assert tsv_path.exists()
    tsv_text = tsv_path.read_text()
    # NOTE(review): TSV columns are usually tab-separated; confirm that the
    # whitespace inside this expected-row literal is what the exporter emits.
    assert 'Ein Welt-Stantenbund 0 174 1116 169 280 region0000_line0001' in tsv_text
    # The second line of the TSV is expected to be the image URL comment.
    assert tsv_text.splitlines()[1] == '# https://content.staatsbibliothek-berlin.de/dc/PPN680203753-0005/left,top,width,height/full/0/default.jpg'

    # --- manual correction of the exported TSV ----------------------------
    tsv_path.write_text(tsv_text.replace('Stantenbund', 'Staatenbund'))

    # --- import -----------------------------------------------------------
    OcrdNeatImportProcessor(
        workspace=testws,
        input_file_grp='TESS,OUT',
        output_file_grp='TESS-CORRECTED',
    ).process()

    page_before = ws_root / 'TESS/FILE_0005_TESS.xml'
    page_after = ws_root / 'TESS-CORRECTED/FILE_0005_TESS-CORRECTED.xml'
    assert page_before.exists()
    assert page_after.exists()

    pcgts_before = parse(page_before)
    pcgts_after = parse(page_after)
    text_before = second_line_text(pcgts_before)
    text_after = second_line_text(pcgts_after)

    # The input page keeps the typo; only the imported page has the fix.
    assert 'Stantenbund' in text_before
    assert 'Stantenbund' not in text_after
    assert 'Staatenbund' not in text_before
    assert 'Staatenbund' in text_after