# page2tsv/tests/test_imexport.py

from pathlib import Path
from shutil import copytree
from pytest import fixture

from ocrd_utils import pushd_popd
from ocrd_models.ocrd_page import parse
from ocrd import Resolver

from qurator.tsvtools.ocrd_processors import OcrdNeatExportProcessor, OcrdNeatImportProcessor

@fixture
def testws(tmpdir):
    """Provide a disposable copy of the sample workspace.

    The ``testws`` directory located next to this test module is copied to
    ``<tmpdir>/ws`` so that a test can modify the workspace without touching
    the checked-in sample data.  The source directory is resolved relative to
    this file rather than the current working directory, so the fixture works
    no matter where pytest is invoked from.

    Returns the workspace object that ``Resolver().workspace_from_url`` builds
    from the copied ``mets.xml``.
    """
    sample_ws = Path(__file__).resolve().parent / 'testws'
    copytree(sample_ws, f'{tmpdir}/ws')
    return Resolver().workspace_from_url(f'{tmpdir}/ws/mets.xml')

def test_imexport(testws):
    """Round-trip check: export to TSV, correct the TSV by hand, import it back.

    Steps:
      1. Run the export processor on file group ``TESS`` into ``OUT`` and
         check the produced TSV for the expected text row and URL comment.
      2. Replace the misrecognised word ``Stantenbund`` with ``Staatenbund``
         in the TSV.
      3. Run the import processor on ``TESS,OUT`` into ``TESS-CORRECTED``.
      4. Compare the second text line of the first text region in the
         original and the corrected PAGE file: only the corrected one may
         carry the fixed spelling.
    """
    workspace_dir = testws.directory

    # --- 1. export -------------------------------------------------------
    OcrdNeatExportProcessor(
        workspace=testws,
        input_file_grp='TESS',
        output_file_grp='OUT',
    ).process()

    tsv_path = Path(workspace_dir) / 'OUT/FILE_0005_OUT.tsv'
    assert tsv_path.exists()

    # The file is not modified until step 2, so one read serves all checks.
    exported_tsv = tsv_path.read_text()
    assert 'Ein Welt-Stantenbund	0	174	1116	169	280		region0000_line0001' in exported_tsv
    assert exported_tsv.splitlines()[1] == '# https://content.staatsbibliothek-berlin.de/dc/PPN680203753-0005/left,top,width,height/full/0/default.jpg'

    # --- 2. simulate a manual correction ---------------------------------
    tsv_path.write_text(exported_tsv.replace('Stantenbund', 'Staatenbund'))

    # --- 3. import -------------------------------------------------------
    OcrdNeatImportProcessor(
        workspace=testws,
        input_file_grp='TESS,OUT',
        output_file_grp='TESS-CORRECTED',
    ).process()

    # --- 4. compare original and corrected PAGE --------------------------
    original_xml = Path(workspace_dir) / 'TESS/FILE_0005_TESS.xml'
    corrected_xml = Path(workspace_dir) / 'TESS-CORRECTED/FILE_0005_TESS-CORRECTED.xml'

    assert original_xml.exists()
    assert corrected_xml.exists()

    original_pcgts = parse(original_xml)
    corrected_pcgts = parse(corrected_xml)

    def second_line_text(pcgts):
        # Text of the first TextEquiv of line index 1 in region index 0.
        first_region = pcgts.get_Page().get_TextRegion()[0]
        return first_region.get_TextLine()[1].get_TextEquiv()[0].Unicode

    original_text = second_line_text(original_pcgts)
    corrected_text = second_line_text(corrected_pcgts)

    # The misspelling must survive in the original and be gone after import.
    assert 'Stantenbund' in original_text
    assert 'Stantenbund' not in corrected_text

    # The fix must appear only in the corrected output.
    assert 'Staatenbund' not in original_text
    assert 'Staatenbund' in corrected_text