mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-06-08 22:29:56 +02:00
Merge branch 'master' of code.dev.sbb.berlin:qurator/ocrd-galley
This commit is contained in:
commit
82d3d71ed4
14 changed files with 49 additions and 42 deletions
16
.github/workflows/test.yml
vendored
Normal file
16
.github/workflows/test.yml
vendored
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
name: Test
|
||||||
|
|
||||||
|
on: push
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
name: Setup, Build, Publish, and Deploy
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
run: |-
|
||||||
|
FORCE_DOWNLOAD=y ./build
|
|
@ -1,7 +1,7 @@
|
||||||
FROM ubuntu:18.04
|
FROM ubuntu:18.04
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir --use-feature=2020-resolver"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_VERSION_MINIMUM="2.18.1"
|
ARG OCRD_VERSION_MINIMUM="2.21.0"
|
||||||
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
||||||
ENV PIP_DEFAULT_TIMEOUT=120
|
ENV PIP_DEFAULT_TIMEOUT=120
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
FROM my_ocrd_workflow-core
|
FROM my_ocrd_workflow-core
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir --use-feature=2020-resolver"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG DINGLEHOPPER_COMMIT="6e47acd"
|
ARG DINGLEHOPPER_COMMIT="6e47acd"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@ FROM my_ocrd_workflow-core-cuda10.1
|
||||||
|
|
||||||
|
|
||||||
# XXX https://github.com/OCR-D/core/issues/642
|
# XXX https://github.com/OCR-D/core/issues/642
|
||||||
#ARG PIP_INSTALL="pip install --no-cache-dir --use-feature=2020-resolver"
|
#ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_CALAMARI_VERSION="1.0.0"
|
ARG OCRD_CALAMARI_VERSION="1.0.0"
|
||||||
|
|
||||||
|
@ -20,8 +20,7 @@ COPY data/mirror/github.com/Calamari-OCR/calamari_models/gt4histocr
|
||||||
|
|
||||||
|
|
||||||
# Check pip dependencies
|
# Check pip dependencies
|
||||||
# XXX https://github.com/OCR-D/core/issues/642
|
RUN pip check
|
||||||
#RUN pip check
|
|
||||||
|
|
||||||
|
|
||||||
# Default command
|
# Default command
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
FROM my_ocrd_workflow-core
|
FROM my_ocrd_workflow-core
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir --use-feature=2020-resolver"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
|
|
||||||
|
|
||||||
# Build pip installable stuff
|
# Build pip installable stuff
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
FROM my_ocrd_workflow-core
|
FROM my_ocrd_workflow-core
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir --use-feature=2020-resolver"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_CIS_VERSION="0.1.5"
|
ARG OCRD_CIS_VERSION="0.1.5"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
FROM my_ocrd_workflow-core
|
FROM my_ocrd_workflow-core
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir --use-feature=2020-resolver"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_FILEFORMAT_VERSION="0.1.1"
|
ARG OCRD_FILEFORMAT_VERSION="0.1.1"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
FROM my_ocrd_workflow-core
|
FROM my_ocrd_workflow-core
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir --use-feature=2020-resolver"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_OLENA_VERSION="1.2.0"
|
ARG OCRD_OLENA_VERSION="1.2.0"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
FROM my_ocrd_workflow-core
|
FROM my_ocrd_workflow-core
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir --use-feature=2020-resolver"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG TESSDATA_BEST_VERSION="4.0.0"
|
ARG TESSDATA_BEST_VERSION="4.0.0"
|
||||||
ENV TESSDATA_PREFIX /usr/local/share/tessdata
|
ENV TESSDATA_PREFIX /usr/local/share/tessdata
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
FROM my_ocrd_workflow-core
|
FROM my_ocrd_workflow-core
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir --use-feature=2020-resolver"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG SBB_BINARIZATION_COMMIT="4d145cc"
|
ARG SBB_BINARIZATION_COMMIT="4d145cc"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
FROM my_ocrd_workflow-core-cuda10.0
|
FROM my_ocrd_workflow-core-cuda10.0
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir --use-feature=2020-resolver"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG SBB_TEXTLINE_DETECTOR_COMMIT="247d5f9"
|
ARG SBB_TEXTLINE_DETECTOR_COMMIT="247d5f9"
|
||||||
|
|
||||||
|
|
||||||
|
|
37
README.md
37
README.md
|
@ -27,13 +27,25 @@ including all dependencies in Docker.
|
||||||
|
|
||||||
How to use
|
How to use
|
||||||
----------
|
----------
|
||||||
It's easiest to use it as pre-built containers. To run the containers on an
|
**Currently, due to problems with Travis CI, we do not provide pre-built
|
||||||
example workspace:
|
containers anymore.**
|
||||||
|
|
||||||
|
To build the containers yourself using Docker:
|
||||||
|
~~~
|
||||||
|
cd ~/devel/ocrd-galley/
|
||||||
|
./build
|
||||||
|
~~~
|
||||||
|
|
||||||
|
You can then install the wrappers into a Python venv:
|
||||||
|
~~~
|
||||||
|
cd ~/devel/ocrd-galley/wrapper
|
||||||
|
pip install .
|
||||||
|
~~~
|
||||||
|
|
||||||
|
You may then use the script `my_ocrd_workflow` to run your self-built
|
||||||
|
containers on an example workspace:
|
||||||
|
|
||||||
~~~
|
~~~
|
||||||
# Update to the latest stable containers
|
|
||||||
(cd ~/devel/ocrd-galley/; ./run-docker-hub-update)
|
|
||||||
|
|
||||||
# Download an example workspace
|
# Download an example workspace
|
||||||
cd /tmp
|
cd /tmp
|
||||||
wget https://qurator-data.de/examples/actevedef_718448162.first-page.zip
|
wget https://qurator-data.de/examples/actevedef_718448162.first-page.zip
|
||||||
|
@ -41,18 +53,9 @@ unzip actevedef_718448162.first-page.zip
|
||||||
|
|
||||||
# Run the workflow on it
|
# Run the workflow on it
|
||||||
cd actevedef_718448162.first-page
|
cd actevedef_718448162.first-page
|
||||||
~/devel/ocrd-galley/run-docker-hub
|
~/devel/ocrd-galley/my_ocrd_workflow
|
||||||
~~~
|
~~~
|
||||||
|
|
||||||
### Build the containers yourself
|
|
||||||
To build the containers yourself using Docker:
|
|
||||||
~~~
|
|
||||||
cd ~/devel/ocrd-galley/
|
|
||||||
./build
|
|
||||||
~~~
|
|
||||||
You may then use the script `run` to use your self-built containers, analogous to
|
|
||||||
the example above.
|
|
||||||
|
|
||||||
### Viewing results
|
### Viewing results
|
||||||
You may then examine the results using
|
You may then examine the results using
|
||||||
[PRImA's PAGE Viewer](https://www.primaresearch.org/tools/PAGEViewer):
|
[PRImA's PAGE Viewer](https://www.primaresearch.org/tools/PAGEViewer):
|
||||||
|
@ -83,7 +86,7 @@ The document must be specified by its PPN, for example:
|
||||||
~~~
|
~~~
|
||||||
~/devel/ocrd-galley/ppn2ocr PPN77164308X
|
~/devel/ocrd-galley/ppn2ocr PPN77164308X
|
||||||
cd PPN77164308X
|
cd PPN77164308X
|
||||||
~/devel/ocrd-galley/run-docker-hub -I BEST --skip-validation
|
~/devel/ocrd-galley/my_ocrd_workflow -I BEST --skip-validation
|
||||||
~~~
|
~~~
|
||||||
|
|
||||||
This produces a workspace directory `PPN77164308X` with the OCR results in it;
|
This produces a workspace directory `PPN77164308X` with the OCR results in it;
|
||||||
|
@ -101,7 +104,7 @@ for the given images.
|
||||||
~~~
|
~~~
|
||||||
~/devel/ocrd-galley/ocrd-workspace-from-images 0005.png
|
~/devel/ocrd-galley/ocrd-workspace-from-images 0005.png
|
||||||
cd workspace-xxxxx # output by the last command
|
cd workspace-xxxxx # output by the last command
|
||||||
~/devel/ocrd-galley/run-docker-hub
|
~/devel/ocrd-galley/my_ocrd_workflow
|
||||||
~~~
|
~~~
|
||||||
|
|
||||||
This produces a workspace from the files and then runs the OCR workflow on it.
|
This produces a workspace from the files and then runs the OCR workflow on it.
|
||||||
|
|
|
@ -70,7 +70,7 @@ main() {
|
||||||
do_validate
|
do_validate
|
||||||
|
|
||||||
|
|
||||||
ocrd-calamari-recognize --overwrite -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI -P checkpoint "/var/lib/calamari-models/GT4HistOCR/2019-07-22T15_49+0200/*.ckpt.json" -P textequiv_level "$TEXTEQUIV_LEVEL"
|
ocrd-calamari-recognize --overwrite -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI -P checkpoint "/var/lib/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/*.ckpt.json" -P textequiv_level "$TEXTEQUIV_LEVEL"
|
||||||
ocrd-tesserocr-recognize --overwrite -I OCR-D-SEG-LINE -O OCR-D-OCR-TESS -P model "GT4HistOCR_2000000" -P textequiv_level "$TEXTEQUIV_LEVEL"
|
ocrd-tesserocr-recognize --overwrite -I OCR-D-SEG-LINE -O OCR-D-OCR-TESS -P model "GT4HistOCR_2000000" -P textequiv_level "$TEXTEQUIV_LEVEL"
|
||||||
do_validate
|
do_validate
|
||||||
|
|
||||||
|
|
|
@ -1,11 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
# Update the my_ocrd_workflow containers
|
|
||||||
|
|
||||||
DOCKER_IMAGE_PREFIX=mikegerber/my_ocrd_workflow
|
|
||||||
DOCKER_IMAGE_TAG=stable
|
|
||||||
|
|
||||||
|
|
||||||
sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'`
|
|
||||||
for x in $sub_images; do
|
|
||||||
docker pull $DOCKER_IMAGE_PREFIX-$x:$DOCKER_IMAGE_TAG
|
|
||||||
done
|
|
Loading…
Add table
Add a link
Reference in a new issue