|
|
@ -3,6 +3,7 @@ import shutil
|
|
|
|
import subprocess
|
|
|
|
import subprocess
|
|
|
|
import urllib.request
|
|
|
|
import urllib.request
|
|
|
|
from lxml import etree
|
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
from glob import glob
|
|
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
import pytest
|
|
|
|
from ocrd.resolver import Resolver
|
|
|
|
from ocrd.resolver import Resolver
|
|
|
@ -10,10 +11,15 @@ from ocrd.resolver import Resolver
|
|
|
|
from ocrd_calamari import CalamariRecognize
|
|
|
|
from ocrd_calamari import CalamariRecognize
|
|
|
|
from .base import assets
|
|
|
|
from .base import assets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
|
|
|
|
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
|
|
|
|
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
|
|
|
|
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
|
|
|
|
CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
|
|
|
|
CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Because XML namespace versions are so much fun, we not only use one, we use TWO!
|
|
|
|
|
|
|
|
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
|
|
|
|
|
|
|
|
NSMAP_GT = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
|
|
|
|
@pytest.fixture
|
|
|
|
def workspace():
|
|
|
|
def workspace():
|
|
|
@ -44,12 +50,20 @@ def workspace():
|
|
|
|
ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
|
|
|
|
ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
|
|
|
|
subprocess.call(['convert', ff, '-threshold', '50%', ff])
|
|
|
|
subprocess.call(['convert', ff, '-threshold', '50%', ff])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text
|
|
|
|
|
|
|
|
for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
|
|
|
|
|
|
|
|
workspace.download_file(of)
|
|
|
|
|
|
|
|
for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
|
|
|
|
|
|
|
|
for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
|
|
|
|
|
|
|
|
tree = etree.parse(ff)
|
|
|
|
|
|
|
|
for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
|
|
|
|
|
|
|
|
e.getparent().remove(e)
|
|
|
|
|
|
|
|
tree.write(ff, xml_declaration=True, encoding="utf-8")
|
|
|
|
|
|
|
|
|
|
|
|
return workspace
|
|
|
|
return workspace
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_recognize(workspace):
|
|
|
|
def test_recognize(workspace):
|
|
|
|
# XXX Should remove GT text to really test this
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CalamariRecognize(
|
|
|
|
CalamariRecognize(
|
|
|
|
workspace,
|
|
|
|
workspace,
|
|
|
|
input_file_grp="OCR-D-GT-SEG-LINE",
|
|
|
|
input_file_grp="OCR-D-GT-SEG-LINE",
|
|
|
@ -81,8 +95,6 @@ def test_word_segmentation(workspace):
|
|
|
|
assert os.path.exists(page1)
|
|
|
|
assert os.path.exists(page1)
|
|
|
|
tree = etree.parse(page1)
|
|
|
|
tree = etree.parse(page1)
|
|
|
|
|
|
|
|
|
|
|
|
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# The result should contain a TextLine that contains the text "December"
|
|
|
|
# The result should contain a TextLine that contains the text "December"
|
|
|
|
line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0]
|
|
|
|
line = tree.xpath(".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]", namespaces=NSMAP)[0]
|
|
|
|
assert line
|
|
|
|
assert line
|
|
|
|