mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-11-09 22:24:13 +01:00
wip
This commit is contained in:
parent
874cfc247f
commit
ec1fd93dad
6 changed files with 349 additions and 144 deletions
6
.github/workflows/test-eynollah.yml
vendored
6
.github/workflows/test-eynollah.yml
vendored
|
|
@ -31,7 +31,7 @@ jobs:
|
||||||
src: "./src"
|
src: "./src"
|
||||||
|
|
||||||
- name: Try to restore models_eynollah
|
- name: Try to restore models_eynollah
|
||||||
uses: actions/cache/restore@v4
|
uses: actions/cache/restore@v4
|
||||||
id: all_model_cache
|
id: all_model_cache
|
||||||
with:
|
with:
|
||||||
path: models_eynollah
|
path: models_eynollah
|
||||||
|
|
@ -40,8 +40,8 @@ jobs:
|
||||||
- name: Download models
|
- name: Download models
|
||||||
if: steps.all_model_cache.outputs.cache-hit != 'true'
|
if: steps.all_model_cache.outputs.cache-hit != 'true'
|
||||||
run: |
|
run: |
|
||||||
make models
|
make models
|
||||||
ls -la models_eynollah
|
ls -la models_eynollah
|
||||||
|
|
||||||
- uses: actions/cache/save@v4
|
- uses: actions/cache/save@v4
|
||||||
if: steps.all_model_cache.outputs.cache-hit != 'true'
|
if: steps.all_model_cache.outputs.cache-hit != 'true'
|
||||||
|
|
|
||||||
45
Makefile
45
Makefile
|
|
@ -14,17 +14,9 @@ WGET = wget -O
|
||||||
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
|
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
|
||||||
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz
|
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz
|
||||||
#SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1
|
#SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1
|
||||||
SEG_MODEL := https://zenodo.org/records/17295988/files/models_layout_v0_6_0.tar.gz?download=1
|
EYNOLLAH_MODELS_URL := https://zenodo.org/records/17295988/files/models_all_v0_7_0.zip
|
||||||
SEG_MODELFILE = $(notdir $(patsubst %?download=1,%,$(SEG_MODEL)))
|
EYNOLLAH_MODELS_ZIP = $(notdir $(SEG_MODEL))
|
||||||
SEG_MODELNAME = $(SEG_MODELFILE:%.tar.gz=%)
|
EYNOLLAH_MODELS_DIR = $(SEG_MODELFILE:%.zip=%)
|
||||||
|
|
||||||
BIN_MODEL := https://zenodo.org/records/17295988/files/models_binarization_v0_6_0.tar.gz?download=1
|
|
||||||
BIN_MODELFILE = $(notdir $(BIN_MODEL))
|
|
||||||
BIN_MODELNAME := default-2021-03-09
|
|
||||||
|
|
||||||
OCR_MODEL := https://zenodo.org/records/17295988/files/models_ocr_v0_6_0.tar.gz?download=1
|
|
||||||
OCR_MODELFILE = $(notdir $(patsubst %?download=1,%,$(OCR_MODEL)))
|
|
||||||
OCR_MODELNAME = $(OCR_MODELFILE:%.tar.gz=%)
|
|
||||||
|
|
||||||
PYTEST_ARGS ?= -vv --isolate
|
PYTEST_ARGS ?= -vv --isolate
|
||||||
|
|
||||||
|
|
@ -49,33 +41,23 @@ help:
|
||||||
@echo " EXTRAS comma-separated list of features (like 'OCR,plotting') for 'install' [$(EXTRAS)]"
|
@echo " EXTRAS comma-separated list of features (like 'OCR,plotting') for 'install' [$(EXTRAS)]"
|
||||||
@echo " DOCKER_TAG Docker image tag for 'docker' [$(DOCKER_TAG)]"
|
@echo " DOCKER_TAG Docker image tag for 'docker' [$(DOCKER_TAG)]"
|
||||||
@echo " PYTEST_ARGS pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. [$(PYTEST_ARGS)]"
|
@echo " PYTEST_ARGS pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. [$(PYTEST_ARGS)]"
|
||||||
@echo " SEG_MODEL URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]"
|
@echo " ALL_MODELS URL of archive of all models [$(ALL_MODELS)]"
|
||||||
@echo " BIN_MODEL URL of 'models' archive to download for binarization 'test' [$(BIN_MODEL)]"
|
|
||||||
@echo " OCR_MODEL URL of 'models' archive to download for binarization 'test' [$(OCR_MODEL)]"
|
|
||||||
@echo ""
|
@echo ""
|
||||||
|
|
||||||
# END-EVAL
|
# END-EVAL
|
||||||
|
|
||||||
|
|
||||||
# Download and extract models to $(PWD)/models_layout_v0_6_0
|
# Download and extract models to $(PWD)/models_layout_v0_6_0
|
||||||
models: $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)
|
models: $(EYNOLLAH_MODELS_DIR)
|
||||||
|
|
||||||
# do not download these files if we already have the directories
|
# do not download these files if we already have the directories
|
||||||
.INTERMEDIATE: $(BIN_MODELFILE) $(SEG_MODELFILE) $(OCR_MODELFILE)
|
.INTERMEDIATE: $(EYNOLLAH_MODELS_ZIP)
|
||||||
|
|
||||||
$(BIN_MODELFILE):
|
$(EYNOLLAH_MODELS_ZIP):
|
||||||
$(WGET) $@ $(BIN_MODEL)
|
$(WGET) $@ $(EYNOLLAH_MODELS_URL)
|
||||||
$(SEG_MODELFILE):
|
|
||||||
$(WGET) $@ $(SEG_MODEL)
|
|
||||||
$(OCR_MODELFILE):
|
|
||||||
$(WGET) $@ $(OCR_MODEL)
|
|
||||||
|
|
||||||
$(BIN_MODELNAME): $(BIN_MODELFILE)
|
$(EYNOLLAH_MODELS_DIR): $(EYNOLLAH_MODELS_ZIP)
|
||||||
tar zxf $<
|
unzip $<
|
||||||
$(SEG_MODELNAME): $(SEG_MODELFILE)
|
|
||||||
tar zxf $<
|
|
||||||
$(OCR_MODELNAME): $(OCR_MODELFILE)
|
|
||||||
tar zxf $<
|
|
||||||
|
|
||||||
build:
|
build:
|
||||||
$(PIP) install build
|
$(PIP) install build
|
||||||
|
|
@ -89,13 +71,8 @@ install:
|
||||||
install-dev:
|
install-dev:
|
||||||
$(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)])
|
$(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)])
|
||||||
|
|
||||||
ifeq (OCR,$(findstring OCR, $(EXTRAS)))
|
deps-test: $(EYNOLLAH_MODELS_ZIP)
|
||||||
deps-test: $(OCR_MODELNAME)
|
|
||||||
endif
|
|
||||||
deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME)
|
|
||||||
$(PIP) install -r requirements-test.txt
|
$(PIP) install -r requirements-test.txt
|
||||||
ifeq (OCR,$(findstring OCR, $(EXTRAS)))
|
|
||||||
ln -rs $(OCR_MODELNAME)/* $(SEG_MODELNAME)/
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
smoke-test: TMPDIR != mktemp -d
|
smoke-test: TMPDIR != mktemp -d
|
||||||
|
|
|
||||||
|
|
@ -85,9 +85,9 @@ def package(
|
||||||
copies.add((src, dist_dir))
|
copies.add((src, dist_dir))
|
||||||
mkdirs.add(dist_dir)
|
mkdirs.add(dist_dir)
|
||||||
for dir in mkdirs:
|
for dir in mkdirs:
|
||||||
print(f"mkdir -p {dir}")
|
print(f"mkdir -vp {dir}")
|
||||||
for (src, dst) in copies:
|
for (src, dst) in copies:
|
||||||
print(f"cp -r {src} {dst}")
|
print(f"cp -vr {src} {dst}")
|
||||||
for dir in mkdirs:
|
for dir in mkdirs:
|
||||||
zip_path = Path(f'../{dir.parent.name}.zip')
|
zip_path = Path(f'../{dir.parent.name}.zip')
|
||||||
print(f"(cd {dir}/..; zip -r {zip_path} models_eynollah)")
|
print(f"(cd {dir}/..; zip -vr {zip_path} models_eynollah)")
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
|
||||||
category="enhancement",
|
category="enhancement",
|
||||||
variant='',
|
variant='',
|
||||||
filename="models_eynollah/eynollah-enhancement_20210425",
|
filename="models_eynollah/eynollah-enhancement_20210425",
|
||||||
dists=['enhancement', 'layout'],
|
dists=['enhancement', 'layout', 'ci'],
|
||||||
dist_url=dist_url("enhancement"),
|
dist_url=dist_url("enhancement"),
|
||||||
type=KerasModel,
|
type=KerasModel,
|
||||||
),
|
),
|
||||||
|
|
@ -23,7 +23,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
|
||||||
category="binarization",
|
category="binarization",
|
||||||
variant='',
|
variant='',
|
||||||
filename="models_eynollah/eynollah-binarization-hybrid_20230504",
|
filename="models_eynollah/eynollah-binarization-hybrid_20230504",
|
||||||
dists=['layout', 'binarization'],
|
dists=['layout', 'binarization', ],
|
||||||
dist_url=dist_url("binarization"),
|
dist_url=dist_url("binarization"),
|
||||||
type=KerasModel,
|
type=KerasModel,
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -16,114 +16,12 @@ from ocrd_models.constants import NAMESPACES as NS
|
||||||
|
|
||||||
testdir = Path(__file__).parent.resolve()
|
testdir = Path(__file__).parent.resolve()
|
||||||
|
|
||||||
MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_6_0').resolve()))
|
|
||||||
MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_6_0').resolve()))
|
MODELS_OCR = environ.get('MODELS_OCR', str(testdir.joinpath('..', 'models_ocr_v0_6_0').resolve()))
|
||||||
MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve()))
|
MODELS_BIN = environ.get('MODELS_BIN', str(testdir.joinpath('..', 'default-2021-03-09').resolve()))
|
||||||
|
|
||||||
def only_eynollah(logrec):
|
def only_eynollah(logrec):
|
||||||
return logrec.name.startswith('eynollah')
|
return logrec.name.startswith('eynollah')
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"options",
|
|
||||||
[
|
|
||||||
[], # defaults
|
|
||||||
#["--allow_scaling", "--curved-line"],
|
|
||||||
["--allow_scaling", "--curved-line", "--full-layout"],
|
|
||||||
["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
|
|
||||||
["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
|
|
||||||
"--textline_light", "--light_version"],
|
|
||||||
# -ep ...
|
|
||||||
# -eoi ...
|
|
||||||
# FIXME: find out whether OCR extra was installed, otherwise skip these
|
|
||||||
["--do_ocr"],
|
|
||||||
["--do_ocr", "--light_version", "--textline_light"],
|
|
||||||
["--do_ocr", "--transformer_ocr"],
|
|
||||||
#["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
|
|
||||||
["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
|
|
||||||
# --skip_layout_and_reading_order
|
|
||||||
], ids=str)
|
|
||||||
def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options):
|
|
||||||
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
|
|
||||||
outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
|
|
||||||
args = [
|
|
||||||
'-m', MODELS_LAYOUT,
|
|
||||||
'-i', str(infile),
|
|
||||||
'-o', str(outfile.parent),
|
|
||||||
]
|
|
||||||
if pytestconfig.getoption('verbose') > 0:
|
|
||||||
args.extend(['-l', 'DEBUG'])
|
|
||||||
caplog.set_level(logging.INFO)
|
|
||||||
runner = CliRunner()
|
|
||||||
with caplog.filtering(only_eynollah):
|
|
||||||
result = runner.invoke(layout_cli, args + options, catch_exceptions=False)
|
|
||||||
assert result.exit_code == 0, result.stdout
|
|
||||||
logmsgs = [logrec.message for logrec in caplog.records]
|
|
||||||
assert str(infile) in logmsgs
|
|
||||||
assert outfile.exists()
|
|
||||||
tree = page_from_file(str(outfile)).etree
|
|
||||||
regions = tree.xpath("//page:TextRegion", namespaces=NS)
|
|
||||||
assert len(regions) >= 2, "result is inaccurate"
|
|
||||||
regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
|
|
||||||
assert len(regions) >= 2, "result is inaccurate"
|
|
||||||
lines = tree.xpath("//page:TextLine", namespaces=NS)
|
|
||||||
assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"options",
|
|
||||||
[
|
|
||||||
["--tables"],
|
|
||||||
["--tables", "--full-layout"],
|
|
||||||
["--tables", "--full-layout", "--textline_light", "--light_version"],
|
|
||||||
], ids=str)
|
|
||||||
def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options):
|
|
||||||
infile = testdir.joinpath('resources/euler_rechenkunst01_1738_0025.tif')
|
|
||||||
outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
|
|
||||||
args = [
|
|
||||||
'-m', MODELS_LAYOUT,
|
|
||||||
'-i', str(infile),
|
|
||||||
'-o', str(outfile.parent),
|
|
||||||
]
|
|
||||||
if pytestconfig.getoption('verbose') > 0:
|
|
||||||
args.extend(['-l', 'DEBUG'])
|
|
||||||
caplog.set_level(logging.INFO)
|
|
||||||
runner = CliRunner()
|
|
||||||
with caplog.filtering(only_eynollah):
|
|
||||||
result = runner.invoke(layout_cli, args + options, catch_exceptions=False)
|
|
||||||
assert result.exit_code == 0, result.stdout
|
|
||||||
logmsgs = [logrec.message for logrec in caplog.records]
|
|
||||||
assert str(infile) in logmsgs
|
|
||||||
assert outfile.exists()
|
|
||||||
tree = page_from_file(str(outfile)).etree
|
|
||||||
regions = tree.xpath("//page:TextRegion", namespaces=NS)
|
|
||||||
assert len(regions) >= 2, "result is inaccurate"
|
|
||||||
regions = tree.xpath("//page:TableRegion", namespaces=NS)
|
|
||||||
# model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
|
|
||||||
assert len(regions) >= 1, "result is inaccurate"
|
|
||||||
regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
|
|
||||||
assert len(regions) >= 2, "result is inaccurate"
|
|
||||||
lines = tree.xpath("//page:TextLine", namespaces=NS)
|
|
||||||
assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line
|
|
||||||
|
|
||||||
def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog):
|
|
||||||
indir = testdir.joinpath('resources')
|
|
||||||
outdir = tmp_path
|
|
||||||
args = [
|
|
||||||
'-m', MODELS_LAYOUT,
|
|
||||||
'-di', str(indir),
|
|
||||||
'-o', str(outdir),
|
|
||||||
]
|
|
||||||
if pytestconfig.getoption('verbose') > 0:
|
|
||||||
args.extend(['-l', 'DEBUG'])
|
|
||||||
caplog.set_level(logging.INFO)
|
|
||||||
runner = CliRunner()
|
|
||||||
with caplog.filtering(only_eynollah):
|
|
||||||
result = runner.invoke(layout_cli, args, catch_exceptions=False)
|
|
||||||
assert result.exit_code == 0, result.stdout
|
|
||||||
logmsgs = [logrec.message for logrec in caplog.records]
|
|
||||||
assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2
|
|
||||||
assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in'))
|
|
||||||
assert len(list(outdir.iterdir())) == 2
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"options",
|
"options",
|
||||||
[
|
[
|
||||||
|
|
|
||||||
330
tests/test_run_layout.py
Normal file
330
tests/test_run_layout.py
Normal file
|
|
@ -0,0 +1,330 @@
|
||||||
|
from os import environ
|
||||||
|
from pathlib import Path
|
||||||
|
import pytest
|
||||||
|
import logging
|
||||||
|
from PIL import Image
|
||||||
|
from eynollah.cli import (
|
||||||
|
layout as layout_cli,
|
||||||
|
binarization as binarization_cli,
|
||||||
|
enhancement as enhancement_cli,
|
||||||
|
machine_based_reading_order as mbreorder_cli,
|
||||||
|
ocr as ocr_cli,
|
||||||
|
)
|
||||||
|
from click.testing import CliRunner
|
||||||
|
from ocrd_modelfactory import page_from_file
|
||||||
|
from ocrd_models.constants import NAMESPACES as NS
|
||||||
|
|
||||||
|
testdir = Path(__file__).parent.resolve()
|
||||||
|
|
||||||
|
MODELS_LAYOUT = environ.get('MODELS_LAYOUT', str(testdir.joinpath('..', 'models_layout_v0_6_0').resolve()))
|
||||||
|
|
||||||
|
def only_eynollah(logrec):
|
||||||
|
return logrec.name.startswith('eynollah')
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"options",
|
||||||
|
[
|
||||||
|
[], # defaults
|
||||||
|
#["--allow_scaling", "--curved-line"],
|
||||||
|
["--allow_scaling", "--curved-line", "--full-layout"],
|
||||||
|
["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"],
|
||||||
|
["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based",
|
||||||
|
"--textline_light", "--light_version"],
|
||||||
|
# -ep ...
|
||||||
|
# -eoi ...
|
||||||
|
# FIXME: find out whether OCR extra was installed, otherwise skip these
|
||||||
|
["--do_ocr"],
|
||||||
|
["--do_ocr", "--light_version", "--textline_light"],
|
||||||
|
["--do_ocr", "--transformer_ocr"],
|
||||||
|
#["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light"],
|
||||||
|
["--do_ocr", "--transformer_ocr", "--light_version", "--textline_light", "--full-layout"],
|
||||||
|
# --skip_layout_and_reading_order
|
||||||
|
], ids=str)
|
||||||
|
def test_run_eynollah_layout_filename(tmp_path, pytestconfig, caplog, options):
|
||||||
|
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
|
||||||
|
outfile = tmp_path / 'kant_aufklaerung_1784_0020.xml'
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_LAYOUT,
|
||||||
|
'-i', str(infile),
|
||||||
|
'-o', str(outfile.parent),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
runner = CliRunner()
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(layout_cli, args + options, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
assert str(infile) in logmsgs
|
||||||
|
assert outfile.exists()
|
||||||
|
tree = page_from_file(str(outfile)).etree
|
||||||
|
regions = tree.xpath("//page:TextRegion", namespaces=NS)
|
||||||
|
assert len(regions) >= 2, "result is inaccurate"
|
||||||
|
regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
|
||||||
|
assert len(regions) >= 2, "result is inaccurate"
|
||||||
|
lines = tree.xpath("//page:TextLine", namespaces=NS)
|
||||||
|
assert len(lines) == 31, "result is inaccurate" # 29 paragraph lines, 1 page and 1 catch-word line
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"options",
|
||||||
|
[
|
||||||
|
["--tables"],
|
||||||
|
["--tables", "--full-layout"],
|
||||||
|
["--tables", "--full-layout", "--textline_light", "--light_version"],
|
||||||
|
], ids=str)
|
||||||
|
def test_run_eynollah_layout_filename2(tmp_path, pytestconfig, caplog, options):
|
||||||
|
infile = testdir.joinpath('resources/euler_rechenkunst01_1738_0025.tif')
|
||||||
|
outfile = tmp_path / 'euler_rechenkunst01_1738_0025.xml'
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_LAYOUT,
|
||||||
|
'-i', str(infile),
|
||||||
|
'-o', str(outfile.parent),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
runner = CliRunner()
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(layout_cli, args + options, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
assert str(infile) in logmsgs
|
||||||
|
assert outfile.exists()
|
||||||
|
tree = page_from_file(str(outfile)).etree
|
||||||
|
regions = tree.xpath("//page:TextRegion", namespaces=NS)
|
||||||
|
assert len(regions) >= 2, "result is inaccurate"
|
||||||
|
regions = tree.xpath("//page:TableRegion", namespaces=NS)
|
||||||
|
# model/decoding is not very precise, so (depending on mode) we can get fractures/splits/FP
|
||||||
|
assert len(regions) >= 1, "result is inaccurate"
|
||||||
|
regions = tree.xpath("//page:SeparatorRegion", namespaces=NS)
|
||||||
|
assert len(regions) >= 2, "result is inaccurate"
|
||||||
|
lines = tree.xpath("//page:TextLine", namespaces=NS)
|
||||||
|
assert len(lines) >= 2, "result is inaccurate" # mostly table (if detected correctly), but 1 page and 1 catch-word line
|
||||||
|
|
||||||
|
def test_run_eynollah_layout_directory(tmp_path, pytestconfig, caplog):
|
||||||
|
indir = testdir.joinpath('resources')
|
||||||
|
outdir = tmp_path
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_LAYOUT,
|
||||||
|
'-di', str(indir),
|
||||||
|
'-o', str(outdir),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
runner = CliRunner()
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(layout_cli, args, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Job done in')]) == 2
|
||||||
|
assert any(logmsg for logmsg in logmsgs if logmsg.startswith('All jobs done in'))
|
||||||
|
assert len(list(outdir.iterdir())) == 2
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"options",
|
||||||
|
[
|
||||||
|
[], # defaults
|
||||||
|
["--no-patches"],
|
||||||
|
], ids=str)
|
||||||
|
def test_run_eynollah_binarization_filename(tmp_path, pytestconfig, caplog, options):
|
||||||
|
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
|
||||||
|
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_BIN,
|
||||||
|
'-i', str(infile),
|
||||||
|
'-o', str(outfile),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
runner = CliRunner()
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(binarization_cli, args + options, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
assert any(True for logmsg in logmsgs if logmsg.startswith('Predicting'))
|
||||||
|
assert outfile.exists()
|
||||||
|
with Image.open(infile) as original_img:
|
||||||
|
original_size = original_img.size
|
||||||
|
with Image.open(outfile) as binarized_img:
|
||||||
|
binarized_size = binarized_img.size
|
||||||
|
assert original_size == binarized_size
|
||||||
|
|
||||||
|
def test_run_eynollah_binarization_directory(tmp_path, pytestconfig, caplog):
|
||||||
|
indir = testdir.joinpath('resources')
|
||||||
|
outdir = tmp_path
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_BIN,
|
||||||
|
'-di', str(indir),
|
||||||
|
'-o', str(outdir),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
runner = CliRunner()
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(binarization_cli, args, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Predicting')]) == 2
|
||||||
|
assert len(list(outdir.iterdir())) == 2
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"options",
|
||||||
|
[
|
||||||
|
[], # defaults
|
||||||
|
["-sos"],
|
||||||
|
], ids=str)
|
||||||
|
def test_run_eynollah_enhancement_filename(tmp_path, pytestconfig, caplog, options):
|
||||||
|
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
|
||||||
|
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_LAYOUT,
|
||||||
|
'-i', str(infile),
|
||||||
|
'-o', str(outfile.parent),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
runner = CliRunner()
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(enhancement_cli, args + options, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
assert any(True for logmsg in logmsgs if logmsg.startswith('Image was enhanced')), logmsgs
|
||||||
|
assert outfile.exists()
|
||||||
|
with Image.open(infile) as original_img:
|
||||||
|
original_size = original_img.size
|
||||||
|
with Image.open(outfile) as enhanced_img:
|
||||||
|
enhanced_size = enhanced_img.size
|
||||||
|
assert (original_size == enhanced_size) == ("-sos" in options)
|
||||||
|
|
||||||
|
def test_run_eynollah_enhancement_directory(tmp_path, pytestconfig, caplog):
|
||||||
|
indir = testdir.joinpath('resources')
|
||||||
|
outdir = tmp_path
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_LAYOUT,
|
||||||
|
'-di', str(indir),
|
||||||
|
'-o', str(outdir),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
runner = CliRunner()
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(enhancement_cli, args, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
assert len([logmsg for logmsg in logmsgs if logmsg.startswith('Image was enhanced')]) == 2
|
||||||
|
assert len(list(outdir.iterdir())) == 2
|
||||||
|
|
||||||
|
def test_run_eynollah_mbreorder_filename(tmp_path, pytestconfig, caplog):
|
||||||
|
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.xml')
|
||||||
|
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_LAYOUT,
|
||||||
|
'-i', str(infile),
|
||||||
|
'-o', str(outfile.parent),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
runner = CliRunner()
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(mbreorder_cli, args, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
# FIXME: mbreorder has no logging!
|
||||||
|
#assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
|
||||||
|
assert outfile.exists()
|
||||||
|
#in_tree = page_from_file(str(infile)).etree
|
||||||
|
#in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
|
||||||
|
out_tree = page_from_file(str(outfile)).etree
|
||||||
|
out_order = out_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
|
||||||
|
#assert len(out_order) >= 2, "result is inaccurate"
|
||||||
|
#assert in_order != out_order
|
||||||
|
assert out_order == ['r_1_1', 'r_2_1', 'r_2_2', 'r_2_3']
|
||||||
|
|
||||||
|
def test_run_eynollah_mbreorder_directory(tmp_path, pytestconfig, caplog):
|
||||||
|
indir = testdir.joinpath('resources')
|
||||||
|
outdir = tmp_path
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_LAYOUT,
|
||||||
|
'-di', str(indir),
|
||||||
|
'-o', str(outdir),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
runner = CliRunner()
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(mbreorder_cli, args, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
# FIXME: mbreorder has no logging!
|
||||||
|
#assert len([logmsg for logmsg in logmsgs if logmsg.startswith('???')]) == 2
|
||||||
|
assert len(list(outdir.iterdir())) == 2
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"options",
|
||||||
|
[
|
||||||
|
[], # defaults
|
||||||
|
["-doit", #str(outrenderfile.parent)],
|
||||||
|
],
|
||||||
|
["-trocr"],
|
||||||
|
], ids=str)
|
||||||
|
def test_run_eynollah_ocr_filename(tmp_path, pytestconfig, caplog, options):
|
||||||
|
infile = testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')
|
||||||
|
outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.xml')
|
||||||
|
outrenderfile = tmp_path.joinpath('render').joinpath('kant_aufklaerung_1784_0020.png')
|
||||||
|
outrenderfile.parent.mkdir()
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_OCR,
|
||||||
|
'-i', str(infile),
|
||||||
|
'-dx', str(infile.parent),
|
||||||
|
'-o', str(outfile.parent),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.DEBUG)
|
||||||
|
runner = CliRunner()
|
||||||
|
if "-doit" in options:
|
||||||
|
options.insert(options.index("-doit") + 1, str(outrenderfile.parent))
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(ocr_cli, args + options, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
# FIXME: ocr has no logging!
|
||||||
|
#assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
|
||||||
|
assert outfile.exists()
|
||||||
|
if "-doit" in options:
|
||||||
|
assert outrenderfile.exists()
|
||||||
|
#in_tree = page_from_file(str(infile)).etree
|
||||||
|
#in_order = in_tree.xpath("//page:OrderedGroup//@regionRef", namespaces=NS)
|
||||||
|
out_tree = page_from_file(str(outfile)).etree
|
||||||
|
out_texts = out_tree.xpath("//page:TextLine/page:TextEquiv[last()]/page:Unicode/text()", namespaces=NS)
|
||||||
|
assert len(out_texts) >= 2, ("result is inaccurate", out_texts)
|
||||||
|
assert sum(map(len, out_texts)) > 100, ("result is inaccurate", out_texts)
|
||||||
|
|
||||||
|
def test_run_eynollah_ocr_directory(tmp_path, pytestconfig, caplog):
|
||||||
|
indir = testdir.joinpath('resources')
|
||||||
|
outdir = tmp_path
|
||||||
|
args = [
|
||||||
|
'-m', MODELS_OCR,
|
||||||
|
'-di', str(indir),
|
||||||
|
'-dx', str(indir),
|
||||||
|
'-o', str(outdir),
|
||||||
|
]
|
||||||
|
if pytestconfig.getoption('verbose') > 0:
|
||||||
|
args.extend(['-l', 'DEBUG'])
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
runner = CliRunner()
|
||||||
|
with caplog.filtering(only_eynollah):
|
||||||
|
result = runner.invoke(ocr_cli, args, catch_exceptions=False)
|
||||||
|
assert result.exit_code == 0, result.stdout
|
||||||
|
logmsgs = [logrec.message for logrec in caplog.records]
|
||||||
|
# FIXME: ocr has no logging!
|
||||||
|
#assert any(True for logmsg in logmsgs if logmsg.startswith('???')), logmsgs
|
||||||
|
assert len(list(outdir.iterdir())) == 2
|
||||||
Loading…
Add table
Add a link
Reference in a new issue