Compare commits
167 Commits
@@ -1,20 +0,0 @@
version: 2.1

jobs:
  black:
    parameters:
      python-version:
        type: string
    docker:
      - image: cimg/python:<< parameters.python-version >>
    steps:
      - checkout
      - run: pip3 install --upgrade pip
      - run: pip3 install black
      - run: black .

workflows:
  black:
    jobs:
      - black:
          python-version: "3.11"
@@ -0,0 +1,5 @@
src/dinglehopper/tests
dist
build
*.egg-info
.git
@@ -0,0 +1,20 @@
name: 'Test Report'
on:
  workflow_run:
    workflows: ['test']
    types:
      - completed
permissions:
  contents: read
  actions: read
  checks: write
jobs:
  report:
    runs-on: ubuntu-latest
    steps:
      - uses: dorny/test-reporter@v1
        with:
          artifact: /test-results-(.*)/
          name: 'Tests Results - $1'
          path: '*junit.xml'
          reporter: java-junit
@@ -0,0 +1,16 @@
variables:
  http_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
  https_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
  HTTP_PROXY: "http://http-proxy.sbb.spk-berlin.de:3128/"
  HTTPS_PROXY: "http://http-proxy.sbb.spk-berlin.de:3128/"

stages:
  - triggers

mirror:
  stage: triggers
  trigger:
    include: .gitlab/mirror.yml
    strategy: depend
  rules:
    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
@@ -0,0 +1,47 @@
stages:
  - check
  - pull
  - push

default:
  image: debian


check:
  stage: check

  script:
    - whoami; env
    - if [ -z "$CI_COMMIT_BRANCH" ]; then echo "Not on a branch" >&2; exit 3; fi


pull-gitlab:
  stage: pull
  script:
    - echo "This is redundant"

pull-github:
  stage: pull
  before_script:
    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
  script:
    - git remote remove github 2>/dev/null || true
    - git remote add github https://github.com/qurator-spk/dinglehopper.git
    - git remote -v

    - git pull github "$CI_COMMIT_BRANCH"


push-gitlab:
  stage: push
  before_script:
    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
  script:
    - git push origin "$CI_COMMIT_SHA":"$CI_COMMIT_BRANCH"

push-github:
  stage: push
  before_script:
    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
  script:
    - git push github "$CI_COMMIT_SHA":"$CI_COMMIT_BRANCH"
@@ -0,0 +1,38 @@
ARG DOCKER_BASE_IMAGE
FROM $DOCKER_BASE_IMAGE
ARG VCS_REF
ARG BUILD_DATE
LABEL \
    maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
    org.label-schema.vcs-ref=$VCS_REF \
    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
    org.label-schema.build-date=$BUILD_DATE \
    org.opencontainers.image.vendor="qurator" \
    org.opencontainers.image.title="dinglehopper" \
    org.opencontainers.image.description="An OCR evaluation tool" \
    org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
    org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
    org.opencontainers.image.revision=$VCS_REF \
    org.opencontainers.image.created=$BUILD_DATE \
    org.opencontainers.image.base.name=ocrd/core

ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8

# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share
# avoid the need for an extra volume for persistent resource user db
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources

WORKDIR /build/dinglehopper
COPY . .
COPY ocrd-tool.json .
# prepackage ocrd-tool.json as ocrd-all-tool.json
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
RUN make install && rm -rf /build/dinglehopper

WORKDIR /data
VOLUME /data
@@ -0,0 +1,33 @@
PYTHON = python3
PIP = pip3
PYTHONIOENCODING=utf8
PYTEST_ARGS = -vv

DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
DOCKER_TAG = ocrd/dinglehopper

help:
	@echo
	@echo " Targets"
	@echo
	@echo " install      Install full Python package via pip"
	@echo " docker       Build the ocrd/dinglehopper docker image"

# Install Python package via pip
install:
	$(PIP) install .

install-dev:
	$(PIP) install -e .

test:
	pytest $(PYTEST_ARGS)

docker:
	docker build \
	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
	--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
	-t $(DOCKER_TAG) .

.PHONY: help install install-dev test docker
@@ -1,8 +1,14 @@
pytest
pytest-cov
pytest-mypy
black
pre-commit

ruff ; python_version >= "3.7"
pytest-ruff ; python_version >= "3.7"
ruff
pytest-ruff

mypy
types-lxml
types-setuptools
pytest-mypy

liccheck
@@ -1,14 +1,14 @@
click
jinja2
lxml
uniseg
uniseg >= 0.9.1
numpy
colorama
MarkupSafe
ocrd >= 2.20.1
ocrd >= 3.3.0
attrs
multimethod == 1.3 # latest version to officially support Python 3.5
multimethod >= 1.3
tqdm
rapidfuzz >= 2.4.2
six # XXX workaround OCR-D/core#730
rapidfuzz >= 2.7.0
chardet
importlib_resources
@@ -1,78 +1,76 @@
import json
from functools import cached_property
import os
from typing import Optional

import click
from ocrd_models import OcrdFileType
from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
from pkg_resources import resource_string
from ocrd_utils import make_file_id

from .cli import process as cli_process

OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))


@click.command()
@ocrd_cli_options
def ocrd_dinglehopper(*args, **kwargs):
    return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)


class OcrdDinglehopperEvaluate(Processor):
    def __init__(self, *args, **kwargs):
        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
        super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)

    def process(self):
        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
        assert_file_grp_cardinality(self.output_file_grp, 1)
    @cached_property
    def executable(self):
        return 'ocrd-dinglehopper'

        log = getLogger("processor.OcrdDinglehopperEvaluate")
    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:

        assert self.parameter
        metrics = self.parameter["metrics"]
        textequiv_level = self.parameter["textequiv_level"]
        gt_grp, ocr_grp = self.input_file_grp.split(",")

        input_file_tuples = self.zip_input_files(on_error="abort")
        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
            if not gt_file or not ocr_file:
                # file/page was not found in this group
                continue
            gt_file = self.workspace.download_file(gt_file)
            ocr_file = self.workspace.download_file(ocr_file)
            page_id = gt_file.pageId

            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)

            file_id = make_file_id(ocr_file, self.output_file_grp)
            report_prefix = os.path.join(self.output_file_grp, file_id)

            # Process the files
            try:
                os.mkdir(self.output_file_grp)
            except FileExistsError:
                pass
            cli_process(
                gt_file.local_filename,
                ocr_file.local_filename,
                report_prefix,
                metrics=metrics,
                textequiv_level=textequiv_level,
        # wrong number of inputs: let fail
        gt_file, ocr_file = input_files
        # missing on either side: skip (zip_input_files already warned)
        if not gt_file or not ocr_file:
            return
        # missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
        if not gt_file.local_filename:
            if config.OCRD_MISSING_INPUT == 'ABORT':
                raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
            return
        if not ocr_file.local_filename:
            if config.OCRD_MISSING_INPUT == 'ABORT':
                raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
            return

        page_id = gt_file.pageId

        file_id = make_file_id(ocr_file, self.output_file_grp)
        cli_process(
            gt_file.local_filename,
            ocr_file.local_filename,
            file_id,
            self.output_file_grp,
            metrics=metrics,
            textequiv_level=textequiv_level,
        )

        # Add reports to the workspace
        for report_suffix, mimetype in [
            [".html", "text/html"],
            [".json", "application/json"],
        ]:
            output_file_id = file_id + report_suffix
            output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
            if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
                raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set")
            self.workspace.add_file(
                file_id=output_file_id,
                file_grp=self.output_file_grp,
                page_id=page_id,
                mimetype=mimetype,
                local_filename=file_id + report_suffix,
            )

            # Add reports to the workspace
            for report_suffix, mimetype in [
                [".html", "text/html"],
                [".json", "application/json"],
            ]:
                self.workspace.add_file(
                    file_id=file_id + report_suffix,
                    file_grp=self.output_file_grp,
                    page_id=page_id,
                    mimetype=mimetype,
                    local_filename=report_prefix + report_suffix,
                )


if __name__ == "__main__":
    ocrd_dinglehopper()
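Note on the new cli_process call above: the report file prefix and the output directory are now passed as two separate positional arguments, whereas the old code pre-joined them into report_prefix and created the directory itself (the os.mkdir call is gone). A minimal stand-alone sketch of the same call outside the processor; the file names, the "reports" directory and the keyword values are illustrative assumptions, not taken from this diff:

    # Hedged sketch: paths and parameter values below are made up for illustration.
    from dinglehopper.cli import process as cli_process

    cli_process(
        "GT/PAGE_0001.xml",   # ground-truth input
        "OCR/PAGE_0001.xml",  # OCR input to evaluate
        "PAGE_0001",          # report file prefix (file_id in the processor above)
        "reports",            # output directory (self.output_file_grp above)
        metrics=True,              # assumed values for the keyword parameters
        textequiv_level="region",
    )

The add_file loop that follows in the processor then registers the resulting .html and .json reports in the METS under the output file group.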
@@ -1 +1 @@
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
@@ -0,0 +1,35 @@
from __future__ import division, print_function

import math

import pytest

from .. import character_error_rate, plain_text
from .util import working_directory


@pytest.mark.integration
@pytest.mark.parametrize(
    "gt_file_content,ocr_file_content,cer_expected",
    [
        ("", "Lorem ipsum", math.inf),
        ("Lorem ipsum", "", 1.0),
        ("\ufeff", "Lorem ipsum", math.inf),
        ("Lorem ipsum", "\ufeff", 1.0),
        ("", "", 0.0),
        ("\ufeff", "", 0.0),
        ("", "\ufeff", 0.0),
    ],
)
def test_empty_files(tmp_path, gt_file_content, ocr_file_content, cer_expected):
    with working_directory(tmp_path):

        with open("gt.txt", "w") as gtf:
            gtf.write(gt_file_content)
        with open("ocr.txt", "w") as ocrf:
            ocrf.write(ocr_file_content)

        gt_text = plain_text("gt.txt")
        ocr_text = plain_text("ocr.txt")

        assert character_error_rate(gt_text, ocr_text) == cer_expected
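The parametrization above pins down the character_error_rate edge cases: a non-empty OCR result against an empty ground truth yields math.inf (there are no reference characters to divide by), an empty OCR result against non-empty ground truth yields 1.0, and two empty inputs yield 0.0; the "\ufeff" cases show that a file containing only a byte order mark is treated like an empty file by plain_text. The working_directory helper imported from .util is not part of this diff; a minimal sketch of what such a context manager typically looks like, offered as an assumption rather than the project's actual implementation:

    import contextlib
    import os


    @contextlib.contextmanager
    def working_directory(path):
        """Temporarily chdir into `path`, restoring the previous directory on exit."""
        previous = os.getcwd()
        os.chdir(path)
        try:
            yield
        finally:
            os.chdir(previous)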