From 02eae7b6fad17f678188a9330071a8e179f06832 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 14 Aug 2020 14:37:20 +0200 Subject: [PATCH 01/16] =?UTF-8?q?=E2=9C=A8=20Move=20processors=20into=20th?= =?UTF-8?q?eir=20own=20Docker=20container?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 86 -------------------------- Dockerfile-boxed-base | 41 ++++++++++++ Dockerfile-boxed-dinglehopper | 18 ++++++ Dockerfile-boxed-ocrd_calamari | 24 +++++++ Dockerfile-boxed-ocrd_olena | 33 ++++++++++ Dockerfile-boxed-ocrd_tesserocr | 35 +++++++++++ Dockerfile-boxed-sbb_textline_detector | 22 +++++++ build | 7 ++- requirements.txt | 13 ---- run | 47 ++++++++------ run-docker-hub | 4 -- 11 files changed, 208 insertions(+), 122 deletions(-) delete mode 100644 Dockerfile create mode 100644 Dockerfile-boxed-base create mode 100644 Dockerfile-boxed-dinglehopper create mode 100644 Dockerfile-boxed-ocrd_calamari create mode 100644 Dockerfile-boxed-ocrd_olena create mode 100644 Dockerfile-boxed-ocrd_tesserocr create mode 100644 Dockerfile-boxed-sbb_textline_detector delete mode 100644 requirements.txt delete mode 100755 run-docker-hub diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index e786a5d..0000000 --- a/Dockerfile +++ /dev/null @@ -1,86 +0,0 @@ -FROM ubuntu:18.04 - -ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 -ENV PIP_DEFAULT_TIMEOUT=120 - -ENV OCRD_OLENA_VERSION 1.2.0 -ENV TESSDATA_BEST_VERSION 4.0.0 -ENV TESSDATA_PREFIX /usr/local/share/tessdata - - -RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ - apt-get update && \ - apt-get install -y \ - curl xz-utils \ - python3-pip \ - git \ - software-properties-common \ -# For clstm on Ubuntu 19.04: - swig libeigen3-dev libpng-dev libprotobuf-dev \ -# For cv2: - libsm6 libxrender1 \ -# For ocrd_olena: - imagemagick \ -# XML utils - libxml2-utils \ - xmlstarlet \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - - -# Install Leptonica 
and Tesseract. -RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ - apt-get update && \ - apt-get install -y \ - tesseract-ocr \ - libtesseract-dev \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - -# Set up OCR-D logging -COPY ocrd_logging.py /etc/ - - -# Build ocrd_olena -# XXX .deb needs an update -RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \ - dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \ - rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \ - apt-get update && \ - apt-get -f install -y && \ - apt-get clean && rm -rf /var/lib/apt/lists/* -RUN pip3 install --no-cache-dir --upgrade pip && \ - curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ - mkdir ocrd_olena && \ - tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ - cd ocrd_olena && \ - sed -i 's/^install: deps$/install:/' Makefile && \ - pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \ - make install PREFIX=/usr/local && \ - cd .. 
&& rm -rf ocrd_olena ocrd_olena.tar.gz - - -# Copy OCR models -RUN mkdir -p /var/lib/calamari-models -COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR -RUN mkdir -p $TESSDATA_PREFIX -ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/ -COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/ -COPY data/textline_detection /var/lib/textline_detection - - -# Install requirements -# Using pipdeptree here to get more info than from pip3 check -COPY requirements.txt /tmp/ -RUN pip3 install --no-cache-dir --upgrade pip && \ - pip3 install --no-cache-dir --use-feature=2020-resolver -r /tmp/requirements.txt && \ - pip3 install --no-cache-dir pipdeptree && \ - pipdeptree -w fail - - -COPY my_ocrd_workflow /usr/bin/ -COPY xsd/* /usr/share/xml/ - - -WORKDIR /data -ENTRYPOINT ["/usr/bin/my_ocrd_workflow"] diff --git a/Dockerfile-boxed-base b/Dockerfile-boxed-base new file mode 100644 index 0000000..3fad690 --- /dev/null +++ b/Dockerfile-boxed-base @@ -0,0 +1,41 @@ +FROM ubuntu:18.04 + +ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 +ENV PIP_DEFAULT_TIMEOUT=120 + + +RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ + apt-get update && \ + apt-get install -y \ + curl xz-utils \ + python3-pip \ +# For add-apt-repository: + software-properties-common \ +# XML utils + libxml2-utils \ + xmlstarlet \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +# Set up OCR-D logging +COPY ocrd_logging.py /etc/ + + +# Build pip installable stuff +RUN pip3 install --no-cache-dir --upgrade pip && \ + pip3 install --no-cache-dir \ +# Resolve conflicts early: + 'setuptools >= 41.0.0' \ + 'ocrd >= 2.13.1' + + +# Check pip dependencies +RUN pip3 check + + +WORKDIR /data + +# Default command (exec form is parsed as a JSON array, so double quotes are required) +CMD ["ocrd"] diff --git a/Dockerfile-boxed-dinglehopper b/Dockerfile-boxed-dinglehopper new file mode 100644 index 0000000..aa4749c --- /dev/null +++ 
b/Dockerfile-boxed-dinglehopper @@ -0,0 +1,18 @@ +FROM boxed-base + + +ENV DINGLEHOPPER_COMMIT 2b98f69 + + +# Build pip installable stuff +RUN pip3 install --no-cache-dir \ +# Now the real stuff: + https://github.com/qurator-spk/dinglehopper/archive/$DINGLEHOPPER_COMMIT.tar.gz + + +# Check pip dependencies +RUN pip3 check + + +# Default command +CMD ["ocrd-dinglehopper"] diff --git a/Dockerfile-boxed-ocrd_calamari b/Dockerfile-boxed-ocrd_calamari new file mode 100644 index 0000000..a64a0c1 --- /dev/null +++ b/Dockerfile-boxed-ocrd_calamari @@ -0,0 +1,24 @@ +FROM boxed-base + + +# Build pip installable stuff +RUN pip3 install --no-cache-dir \ +# Resolve conflicts early: + 'tensorflow-gpu == 1.15.*' \ + 'calamari-ocr == 0.3.5' \ +# Now the real stuff: + 'ocrd_calamari >= 0.0.7' + + +# Copy OCR models +RUN mkdir -p /var/lib/calamari-models +COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR + + + +# Check pip dependencies +RUN pip3 check + + +# Default command +CMD ["ocrd-calamari-recognize"] diff --git a/Dockerfile-boxed-ocrd_olena b/Dockerfile-boxed-ocrd_olena new file mode 100644 index 0000000..98f36bf --- /dev/null +++ b/Dockerfile-boxed-ocrd_olena @@ -0,0 +1,33 @@ +FROM boxed-base + +ENV OCRD_OLENA_VERSION 1.2.0 + +# Build ocrd_olena +RUN apt-get update && \ + apt-get install -y \ + imagemagick \ + && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \ + dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \ + rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \ + apt-get update && \ + apt-get -f install -y && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +RUN pip3 install --no-cache-dir --upgrade pip && \ + curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ + mkdir ocrd_olena && \ + tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz 
&& \ + cd ocrd_olena && \ + sed -i 's/^install: deps$/install:/' Makefile && \ + pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \ + make install PREFIX=/usr/local && \ + cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz + + +# Check pip dependencies +RUN pip3 check + + +# Default command (exec form is parsed as a JSON array, so double quotes are required) +CMD ["ocrd-olena-binarize"] diff --git a/Dockerfile-boxed-ocrd_tesserocr b/Dockerfile-boxed-ocrd_tesserocr new file mode 100644 index 0000000..c0ca4ff --- /dev/null +++ b/Dockerfile-boxed-ocrd_tesserocr @@ -0,0 +1,35 @@ +FROM boxed-base + + +ENV TESSDATA_BEST_VERSION 4.0.0 +ENV TESSDATA_PREFIX /usr/local/share/tessdata + + +# Install Leptonica and Tesseract. +RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ + apt-get update && \ + apt-get install -y \ + tesseract-ocr \ + libtesseract-dev \ + && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + + +# Copy OCR models +RUN mkdir -p $TESSDATA_PREFIX +ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/ +COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/ + + +# Build pip installable stuff +RUN pip3 install --no-cache-dir \ +# Now the real stuff: + 'ocrd_tesserocr >= 0.9.0' + + +# Check pip dependencies +RUN pip3 check + + +# Default command +CMD ["ocrd-tesserocr-recognize"] diff --git a/Dockerfile-boxed-sbb_textline_detector b/Dockerfile-boxed-sbb_textline_detector new file mode 100644 index 0000000..4274725 --- /dev/null +++ b/Dockerfile-boxed-sbb_textline_detector @@ -0,0 +1,22 @@ +FROM boxed-base + + +ENV SBB_TEXTLINE_DETECTOR_COMMIT 8b01d9e + + +# Build pip installable stuff +RUN pip3 install --no-cache-dir \ +# Now the real stuff: + https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz + + +# Copy OCR models +COPY data/textline_detection /var/lib/textline_detection + + +# Check pip dependencies +RUN pip3 check + + +# Default command +CMD ["ocrd-sbb-textline-detector"] 
diff --git a/build b/build index 227a914..3a50a4d 100755 --- a/build +++ b/build @@ -21,4 +21,9 @@ get_from_web() { handle_data -docker build --cache-from my_ocrd_workflow -t my_ocrd_workflow . +docker build -t boxed-base -f Dockerfile-boxed-base . +docker build -t boxed-ocrd_calamari -f Dockerfile-boxed-ocrd_calamari . +docker build -t boxed-dinglehopper -f Dockerfile-boxed-dinglehopper . +docker build -t boxed-ocrd_olena -f Dockerfile-boxed-ocrd_olena . +docker build -t boxed-ocrd_tesserocr -f Dockerfile-boxed-ocrd_tesserocr . +docker build -t boxed-sbb_textline_detector -f Dockerfile-boxed-sbb_textline_detector . diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d97256c..0000000 --- a/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -tensorflow-gpu < 2.0 # Needed for sbb_text_linedetector - -ocrd >= 2.13.1 - -# XXX See https://github.com/OCR-D/ocrd_tesserocr/issues/135 -# ocrd_tesserocr >= 0.8.XXX -https://github.com/mikegerber/ocrd_tesserocr/archive/fix/set-pcgtsid.tar.gz - -ocrd_calamari >= 0.0.7 - -https://github.com/qurator-spk/sbb_textline_detector/archive/8b01d9e.tar.gz - -https://github.com/qurator-spk/dinglehopper/archive/2b98f69.tar.gz diff --git a/run b/run index 7fbf16d..5fb76c4 100755 --- a/run +++ b/run @@ -1,31 +1,42 @@ -#!/bin/sh -# Run the my_ocrd_workflow container on the current workspace +#!/bin/bash set -e # Abort on error -DOCKER_IMAGE=${DOCKER_IMAGE:-my_ocrd_workflow:latest} # default to locally built - -if echo "$DOCKER_IMAGE" | grep -q "/"; then - docker pull "$DOCKER_IMAGE" -fi +self=`realpath $0` +self_dir=`dirname "$self"` -# XXX Work around podman vs docker uid behaviour +# Docker run options +docker_run_options="--rm -t" +docker_run_options="$docker_run_options --mount type=bind,src=\"$(pwd)\",target=/data" +# In podman, the container always runs as the real user == uid 0 in container if docker -v 2>&1 | grep -q podman; then user="0:0" else user="`id -u`:`id -g`" fi - - -# The container currently needs to 
run privileged to allow it to read from e.g. +docker_run_options="$docker_run_options --user $user" +docker_run_options="$docker_run_options -e LOG_LEVEL=$LOG_LEVEL" +# The containers currently need to run privileged to allow it to read from e.g. # /home on SELinux secured systems such as Fedora. We might want to use udica # instead in the future. +docker_run_options="$docker_run_options --privileged=true" + + +# Build aliases for the containerized ocrd processors +build_alias() { + local command=$1 + local docker_image=$2 + + alias $command="docker run $docker_run_options $docker_image $command" +} +shopt -s expand_aliases # Required for non-interactive shells +build_alias ocrd boxed-base +build_alias ocrd-olena-binarize boxed-ocrd_olena +build_alias ocrd-sbb-textline-detector boxed-sbb_textline_detector +build_alias ocrd-calamari-recognize boxed-ocrd_calamari +build_alias ocrd-tesserocr-recognize boxed-ocrd_tesserocr +build_alias ocrd-dinglehopper boxed-dinglehopper + -docker run --privileged=true --rm -t \ - \ - --user $user \ - --mount type=bind,src="$(pwd)",target=/data \ - \ - -e LOG_LEVEL=$LOG_LEVEL \ - $DOCKER_IMAGE "$@" +. 
$self_dir/my_ocrd_workflow diff --git a/run-docker-hub b/run-docker-hub deleted file mode 100755 index 9e4339e..0000000 --- a/run-docker-hub +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# Run the my_ocrd_workflow container on the current workspace - -DOCKER_IMAGE=mikegerber/my_ocrd_workflow:stable `dirname $0`/run "$@" From 92391747a76d0991a4cd0a09618804c2644bc59b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 14 Aug 2020 14:38:02 +0200 Subject: [PATCH 02/16] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20obsolete=20xsd/?= =?UTF-8?q?=20directory?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- xsd/pagecontent.2017-07-15.xsd | 2137 ------------------------- xsd/pagecontent.2018-07-15.xsd | 2496 ----------------------------- xsd/pagecontent.2019-07-15.xsd | 2674 -------------------------------- 3 files changed, 7307 deletions(-) delete mode 100644 xsd/pagecontent.2017-07-15.xsd delete mode 100644 xsd/pagecontent.2018-07-15.xsd delete mode 100644 xsd/pagecontent.2019-07-15.xsd diff --git a/xsd/pagecontent.2017-07-15.xsd b/xsd/pagecontent.2017-07-15.xsd deleted file mode 100644 index b4b2266..0000000 --- a/xsd/pagecontent.2017-07-15.xsd +++ /dev/null @@ -1,2137 +0,0 @@ - - - - - - Page Content - Ground Truth and Storage - - - - - - - - - - - - - - - The timestamp has to be in UTC (Coordinated - Universal Time) and not local time. - - - - - - - The timestamp has to be in UTC (Coordinated - Universal Time) and not local time. - - - - - - - - - - External reference of any kind - - - - - - - - Alternative document page images (e.g. - black-and-white) - - - - - - - - - - - - - - - - Unassigned regions are considered to be in the - (virtual) default layer which is to be treated - as below any other layers. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - For generic use - - - - - Page type - - - - - - The primary language used in the page (lower-level definitions override the page-level definition) - - - - - - - The secondary language used in the page (lower-level definitions override the page-level definition) - - - - - - - The primary script used in the page (lower-level definitions override the page-level definition) - - - - - - - The secondary script used in the page (lower-level definitions override the page-level definition) - - - - - - - The direction in which text in a region should be - read (within lines) (lower-level definitions override the page-level definition) - - - - - - Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters) (lower-level definitions override the page-level definition) - - - - - - - Pure text is represented as a text region. This includes - drop capitals, but practically ornate text may be - considered as a graphic. - - - - - - - - - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The nature of the text in the region - - - - - - - The degree of space in points between the lines of - text (line spacing) - - - - - - - The direction in which text in a region should be - read (within lines) - - - - - - Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters) - - - - - The angle the baseline of text withing a region has to be rotated (relative to the rectangle encapsulating the region) in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). 
-Range: -179.999,180 - - - - - - Defines whether a region of text is indented or not - - - - - - Text align - - - - - - The primary language used in the region - - - - - - - The secondary language used in the region - - - - - - - The primary script used in the region - - - - - - - The secondary script used in the region - - - - - - - - - - - Point list with format "x1,y1 x2,y2 ..." - - - - - - - - - - Multiple connected points that mark the baseline - of the glyphs - - - - - - - - - - - - - - - - Overrides primaryLanguage attribute of parent text - region - - - - - - - The primary script used in the text line - - - - - - - The secondary script used in the text line - - - - - - - The direction in which text in a text line should be read - - - - - - - Overrides the production attribute of the parent - text region - - - - - - For generic use - - - - - - - - - - - - - - - - - - - - Overrides primaryLanguage attribute of parent line - and/or text region - - - - - - - The primary script used in the word - - - - - - - The secondary script used in the word - - - - - - - The direction in which characters in a word should be read - - - - - - - Overrides the production attribute of the parent - text line and/or text region. - - - - - - For generic use - - - - - - - - - - - Container for graphemes, grapheme groups and - non-printing characters - - - - - - - - - - - - - - - - - - The script used for the glyph - - - - - - - Overrides the production attribute of the parent - word / text line / text region. - - - - - - For generic use - - - - - - - - - - Text in a "simple" form (ASCII or extended ASCII - as mostly used for typing). I.e. no use of - special characters for ligatures (should be - stored as two separate characters) etc. - - - - - - - Correct encoding of the original, always using - the corresponding Unicode code point. I.e. - ligatures have to be represented as one - character etc. - - - - - - - Used for sort order in case multiple TextEquivs are defined. 
The text content with the lowest index should be interpreted as the main text content. - - - - - - - - - - OCR confidence value (between 0 and 1) - - - - - - - - - - - Type of text content (is it free text or a number, for instance) -This is only a descriptive attribute, the text type is not checked during XML validation - - - - - Refinement for dataType attribute. Can be a regular expression, for instance. - - - - - - - - - An image is considered to be more intricate and complex - than a graphic. These can be photos or drawings. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The colour bit depth required for the region - - - - - - - The background colour of the region - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - A line drawing is a single colour illustration without - solid areas. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The pen (foreground) colour of the region - - - - - - - The background colour of the region - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - Regions containing simple graphics, such as a company - logo, should be marked as graphic regions. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The type of graphic in the region - - - - - - - An approximation of the number of colours - used in the region - - - - - - - Specifies whether the region also contains - text. - - - - - - - - - - Tabular data in any form is represented with a table - region. 
Rows and columns may or may not have separator - lines; these lines are not separator regions. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The number of rows present in the table - - - - - - - The number of columns present in the table - - - - - - - The colour of the lines used in the region - - - - - - - The background colour of the region - - - - - - - Specifies the presence of line separators - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - Regions containing charts or graphs of any type, should - be marked as chart regions. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The type of chart in the region - - - - - - - An approximation of the number of colours - used in the region - - - - - - - The background colour of the region - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - Separators are lines that lie between columns and - paragraphs and can be used to logically separate - different articles from each other. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The colour of the separator - - - - - - - - - - Regions containing equations and mathematical symbols - should be marked as maths regions. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). 
-Range: -179.999,180 - - - - - - The background colour of the region - - - - - - - - - - Regions containing chemical formulas. - - - - - - - - The angle the rectangle encapsulating a - region has to be rotated in clockwise - direction in order to correct the present - skew (negative values indicate - anti-clockwise rotation). Range: - -179.999,180 - - - - - - - - The background colour of the region - - - - - - - - - - - Regions containing musical notations. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The background colour of the region - - - - - - - - - - Regions containing advertisements. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - - The background colour of the region - - - - - - - - - - Noise regions are regions where no real data lies, only - false data created by artifacts on the document or - scanner noise. - - - - - - - - - - To be used if the region type cannot be ascertained. - - - - - - - - - - Determines the effective area on the paper of a printed page. Its size is equal for all pages of a book (exceptions: titlepage, multipage pictures). -It contains all living elements (except marginals) like body type, footnotes, headings, running titles. -It does not contain pagenumber (if not part of running title), marginals, signature mark, preview words. - - - - - - - - - - Definition of the reading order within the page. To express a reading order between elements they have to be included in an OrderedGroup. Groups may contain further groups. - - - - - - - - - - Numbered region - - - - Position (order number) of this item within the current hierarchy level. 
- - - - - - - - Indexed group containing ordered elements - - - - - - - - - - - - - - - - - - Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members. - - - - - Position (order number) of this item within the - current hierarchy level. - - - - - - - - - Is this group a continuation of another group (from - previous column or page, for example)? - - - - - - - - - - - Indexed group containing unordered elements - - - - - - - - - - - - - - - - Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members. - - - - - Position (order number) of this item within the - current hierarchy level. - - - - - - - - Is this group a continuation of another group (from previous column or page, for example)? - - - - - - - - - - - - Numbered group (contains ordered elements) - - - - - - - - - - - - - - - - - Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members. - - - - - - Is this group a continuation of another group (from previous column or page, for example)? - - - - - - - - - Numbered group (contains unordered elements) - - - - - - - - - - - - - - - - Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members. - - - - - - Is this group a continuation of another group (from previous column or page, for example)? - - - - - - - Border of the actual page (if the scanned image contains parts not belonging to the page). 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - iso15924 2016-07-14 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Can be used to express the z-index of overlapping - regions. An element with a greater z-index is always in - front of another element with lower z-index. - - - - - - - - - - - - - - - - - - - - - - - - - - Point list with format "x1,y1 x2,y2 ..." - - - - - - - - - - Container for one-to-one relations between layout - objects (for example: DropCap - paragraph, caption - - image) - - - - - - - - - - - One-to-one relation between to layout object. Use 'link' - for loose relations and 'join' for strong relations - (where something is fragmented for instance). 
- - Examples for 'link': caption - image floating - - paragraph paragraph - paragraph (when a pragraph is - split across columns and the last word of the first - paragraph DOES NOT continue in the second paragraph) - drop-cap - paragraph (when the drop-cap is a whole word) - - Examples for 'join': word - word (separated word at the - end of a line) drop-cap - paragraph (when the drop-cap - is not a whole word) paragraph - paragraph (when a - pragraph is split across columns and the last word of - the first paragraph DOES continue in the second - paragraph) - - - - - - - - - - - - - - - - For generic use - - - - - - Text production type - - - - - - - - - - - - - - - Monospace (fixed-pitch, non-proportional) or - proportional font - - - - - - For instance: Arial, Times New Roman. Add more - information if necessary (e.g. blackletter, - antiqua). - - - - - - - Serif or sans-serif typeface - - - - - - - - The size of the characters in points - - - - - - The x-height or corpus size refers to the distance between the baseline and the mean line of lower-case letters in a typeface. The unit is assumed to be pixels. - - - - - - The degree of space (in points) between the - characters in a string of text - - - - - - - Text colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value) - - - - - Background colour - - - - - Background colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value) - - - - - - Specifies whether the colour of the text appears - reversed against a background colour - - - - - - - - - - - - - - - - - - - - - - Roles the region takes (e.g. in context of a - parent region) - - - - - - - - - - - - - - - - - - - - - - - - - - - - For generic use - - - - - - Is this region a continuation of another region (in previous column or page, for example)? 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Examples: "123.456", "+1234.456", "-1234.456", "-.456", "-456" - - - - Examples: "123.456", "+1234.456", "-1.2344e56", "-.45E-6", "INF", "-INF", "NaN" - - - - Examples: "123456", "+00000012", "-1", "-456" - - - - Examples: "true", "false", "1", "0" - - - - Examples: "2001-10-26", "2001-10-26+02:00", "2001-10-26Z", "2001-10-26+00:00", "-2001-10-26", "-20000-04-01" - - - - Examples: "21:32:52", "21:32:52+02:00", "19:32:52Z", "19:32:52+00:00", "21:32:52.12679" - - - - Examples: "2001-10-26T21:32:52", "2001-10-26T21:32:52+02:00", "2001-10-26T19:32:52Z", "2001-10-26T19:32:52+00:00", "-2001-10-26T21:32:52", "2001-10-26T21:32:52.12679" - - - - Generic text string - - - - An XSD type that is not listed or a custom type (use dataTypeDetails attribute) - - - - - - - - - - Container for graphemes, grapheme groups and - non-printing characters - - - - - - - - - - - - Base type for graphemes, grapheme groups and non-printing characters - - - - - - - - - Order index of grapheme, group, or non-printing character within the parent container (graphemes or glyph or grapheme group) - - - - - - - - - - - Type of character represented by the grapheme/group/non-printing character element - - - - - - - - - - - For generic use - - - For generic use - - - - - Represents a sub-element of a glyph. Smallest graphical unit that can be assigned a Unicode code point - - - - - - - - - - - - - A glyph component without visual representation but with Unicode code point. Non-visual / non-printing / control character. Part of grapheme container (of glyph) or grapheme sub group. - - - - - - - - - - - - - - - - - - - - - Container for user-defined attributes - - - - - - - - - Structured custom data defined by name, type and value. 
- - - - - - - - - - - - - - - - - - - - Cell position in table starting with row 0 - - - - Cell position in table starting with column 0 - - - - Number of rows the cell spans (optional; default is 1) - - - - Number of columns the cell spans (optional; default is 1) - - - - - - - - Data for a region that takes on the role of a table cell within a parent table region - - - - \ No newline at end of file diff --git a/xsd/pagecontent.2018-07-15.xsd b/xsd/pagecontent.2018-07-15.xsd deleted file mode 100644 index c6b7e93..0000000 --- a/xsd/pagecontent.2018-07-15.xsd +++ /dev/null @@ -1,2496 +0,0 @@ - - - - - - Page Content - Ground Truth and Storage - - - - - - - - - - - - - - - The timestamp has to be in UTC (Coordinated - Universal Time) and not local time. - - - - - - - The timestamp has to be in UTC (Coordinated - Universal Time) and not local time. - - - - - - - - - - - - External reference of any kind - - - - - - - Semantic labels / tags - - - - - - Type of metadata (e.g. author) - - - - - - - - - - - - - - - E.g. imagePhotometricInterpretation - - - - - - E.g. RGB - - - - - - - - - - A semantic label / tag - - - - - - - - Reference to external model / ontology / schema - - - - - - E.g. an RDF resource identifier (to be used as subject or object of an RDF triple) - - - - - Prefix for all labels (e.g. first part of an URI) - - - - - - - - - Semantic label - - - - - The label / tag (e.g. 'person'). Can be an RDF resource identifier (e.g. object of an RDF triple). - - - - - - - Additional information on the label (e.g. 'YYYY-mm-dd' for a date label). Can be used as predicate of an RDF triple. - - - - - - - - - - - - Alternative document page images (e.g. - black-and-white) - - - - - - - - - - - - - - - - Unassigned regions are considered to be in the - (virtual) default layer which is to be treated - as below any other layers. 
- - - - - - - - - Semantic labels / tags - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Contains the image file name including the file extension. - - - - - - Specifies the width of the image. - - - - - Specifies the height of the image. - - - - - Specifies the image resolution in width. - - - - - Specifies the image resolution in height. - - - - - - Specifies the unit of the resolution information - referring to a standardised unit of measurement (pixels per inch, pixels per centimeter or other). - - - - - - - - - - - - - - For generic use - - - - - Page type - - - - - - The primary language used in the page (lower-level definitions override the page-level definition) - - - - - - - The secondary language used in the page (lower-level definitions override the page-level definition) - - - - - - - The primary script used in the page (lower-level definitions override the page-level definition) - - - - - - - The secondary script used in the page (lower-level definitions override the page-level definition) - - - - - - - The direction in which text in a region should be - read (within lines) (lower-level definitions override the page-level definition) - - - - - - Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters) (lower-level definitions override the page-level definition) - - - - - Confidence value for whole page (between 0 and 1) - - - - - - - - Pure text is represented as a text region. This includes - drop capitals, but practically ornate text may be - considered as a graphic. - - - - - - - - - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). 
-Range: -179.999,180 - - - - - - The nature of the text in the region - - - - - - - The degree of space in points between the lines of - text (line spacing) - - - - - - - The direction in which text in a region should be - read (within lines) - - - - - - Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters) - - - - - The angle the baseline of text withing a region has to be rotated (relative to the rectangle encapsulating the region) in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - Defines whether a region of text is indented or not - - - - - - Text align - - - - - - The primary language used in the region - - - - - - - The secondary language used in the region - - - - - - - The primary script used in the region - - - - - - - The secondary script used in the region - - - - - - - - - - - Point list with format "x1,y1 x2,y2 ..." - - - - - Confidence value (between 0 and 1) - - - - - - - - - Alternative text line images (e.g. - black-and-white) - - - - - - - - Multiple connected points that mark the baseline - of the glyphs - - - - - - - - - - - - - Semantic labels / tags - - - - - - - - Overrides primaryLanguage attribute of parent text - region - - - - - - - The primary script used in the text line - - - - - - - The secondary script used in the text line - - - - - - - The direction in which text in a text line should be read - - - - - - - Overrides the production attribute of the parent - text region - - - - - - For generic use - - - - - - - Position (order number) of this text line within the - parent text region. - - - - - - - - - - - Alternative word images (e.g. 
- black-and-white) - - - - - - - - - - - - - - Semantic labels / tags - - - - - - - - Overrides primaryLanguage attribute of parent line - and/or text region - - - - - - - The primary script used in the word - - - - - - - The secondary script used in the word - - - - - - - The direction in which characters in a word should be read - - - - - - - Overrides the production attribute of the parent - text line and/or text region. - - - - - - For generic use - - - - - - - - - - Alternative glyph images (e.g. - black-and-white) - - - - - - - - Container for graphemes, grapheme groups and - non-printing characters - - - - - - - - - - - Semantic labels / tags - - - - - - - - - - - - The script used for the glyph - - - - - - - Overrides the production attribute of the parent - word / text line / text region. - - - - - - For generic use - - - - - - - - - - Text in a "simple" form (ASCII or extended ASCII - as mostly used for typing). I.e. no use of - special characters for ligatures (should be - stored as two separate characters) etc. - - - - - - - Correct encoding of the original, always using - the corresponding Unicode code point. I.e. - ligatures have to be represented as one - character etc. - - - - - - - Used for sort order in case multiple TextEquivs are defined. The text content with the lowest index should be interpreted as the main text content. - - - - - - - - - - OCR confidence value (between 0 and 1) - - - - - Type of text content (is it free text or a number, for instance) -This is only a descriptive attribute, the text type is not checked during XML validation - - - - - Refinement for dataType attribute. Can be a regular expression, for instance. - - - - - - - - - An image is considered to be more intricate and complex - than a graphic. These can be photos or drawings. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). 
-Range: -179.999,180 - - - - - - The colour bit depth required for the region - - - - - - - The background colour of the region - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - A line drawing is a single colour illustration without - solid areas. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The pen (foreground) colour of the region - - - - - - - The background colour of the region - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - Regions containing simple graphics, such as a company - logo, should be marked as graphic regions. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The type of graphic in the region - - - - - - - An approximation of the number of colours - used in the region - - - - - - - Specifies whether the region also contains - text. - - - - - - - - - - Tabular data in any form is represented with a table - region. Rows and columns may or may not have separator - lines; these lines are not separator regions. - - - - - - - - Table grid (visible or virtual grid lines) - - - - - - The angle the rectangle encapsulating a - region has to be rotated in clockwise - direction in order to correct the present - skew (negative values indicate - anti-clockwise rotation). 
Range: - -179.999,180 - - - - - - - The number of rows present in the table - - - - - - - The number of columns present in the table - - - - - - - The colour of the lines used in the region - - - - - - - The background colour of the region - - - - - - - Specifies the presence of line separators - - - - - - - Specifies whether the region also contains - text - - - - - - - - - Matrix of grid points defining the table grid on the page - - - - - One row in the grid point matrix. Points with x,y coordinates. (note: for a table with n table rows there should be n+1 grid rows) - - - - - - Points with x,y coordinates. - - - - - The grid row index - - - - - - - - - - Regions containing charts or graphs of any type, should - be marked as chart regions. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The type of chart in the region - - - - - - - An approximation of the number of colours - used in the region - - - - - - - The background colour of the region - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - Separators are lines that lie between columns and - paragraphs and can be used to logically separate - different articles from each other. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The colour of the separator - - - - - - - - - - Regions containing equations and mathematical symbols - should be marked as maths regions. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). 
-Range: -179.999,180 - - - - - - The background colour of the region - - - - - - - - - - Regions containing chemical formulas. - - - - - - - - The angle the rectangle encapsulating a - region has to be rotated in clockwise - direction in order to correct the present - skew (negative values indicate - anti-clockwise rotation). Range: - -179.999,180 - - - - - - - - The background colour of the region - - - - - - - - - - - Regions containing maps. - - - - - - - - The angle the rectangle encapsulating a - region has to be rotated in clockwise - direction in order to correct the present - skew (negative values indicate - anti-clockwise rotation). Range: - -179.999,180 - - - - - - - - - - Regions containing musical notations. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - The background colour of the region - - - - - - - - - - Regions containing advertisements. - - - - - - - The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation). -Range: -179.999,180 - - - - - - - The background colour of the region - - - - - - - - - - Noise regions are regions where no real data lies, only - false data created by artifacts on the document or - scanner noise. - - - - - - - - - - To be used if the region type cannot be ascertained. - - - - - - - - - - Regions containing content that is not covered by the default types (text, graphic, image, line drawing, chart, table, separator, maths, map, music, chem, advert, noise, unknown) - - - - - - - - Information on the type of content represented by this region - - - - - - - - - - Determines the effective area on the paper of a printed page. Its size is equal for all pages of a book (exceptions: titlepage, multipage pictures). 
-It contains all living elements (except marginals) like body type, footnotes, headings, running titles. -It does not contain pagenumber (if not part of running title), marginals, signature mark, preview words. - - - - - - - - - - Definition of the reading order within the page. To express a reading order between elements they have to be included in an OrderedGroup. Groups may contain further groups. - - - - - - - - - - Confidence value (between 0 and 1) - - - - - - - - Numbered region - - - - Position (order number) of this item within the current hierarchy level. - - - - - - - - Indexed group containing ordered elements - - - - - - - - Semantic labels / tags - - - - - - - - - - - - - - Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members. - - - - - Position (order number) of this item within the - current hierarchy level. - - - - - - - - - Is this group a continuation of another group (from - previous column or page, for example)? - - - - - - - - - - - Indexed group containing unordered elements - - - - - - - - Semantic labels / tags - - - - - - - - - - - - - Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members. - - - - - Position (order number) of this item within the - current hierarchy level. - - - - - - - - Is this group a continuation of another group (from previous column or page, for example)? - - - - - - - - - - - - Numbered group (contains ordered elements) - - - - - - - - Semantic labels / tags - - - - - - - - - - - - - - Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members. - - - - - - Is this group a continuation of another group (from previous column or page, for example)? 
- - - - - - - - - Numbered group (contains unordered elements) - - - - - - - - Semantic labels / tags - - - - - - - - - - - - - Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members. - - - - - - Is this group a continuation of another group (from previous column or page, for example)? - - - - - - - Border of the actual page (if the scanned image contains parts not belonging to the page). - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ISO 639.x 2016-07-14 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - iso15924 2016-07-14 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Can be used to express the z-index of overlapping - regions. An element with a greater z-index is always in - front of another element with lower z-index. - - - - - - - - - - - - - - - - - - - - - - - Confidence value (between 0 and 1) - - - - - - - Point list with format "x1,y1 x2,y2 ..." 
- - - - - - - - - - Container for one-to-one relations between layout - objects (for example: DropCap - paragraph, caption - - image) - - - - - - - - - - - One-to-one relation between to layout object. Use 'link' - for loose relations and 'join' for strong relations - (where something is fragmented for instance). - - Examples for 'link': caption - image floating - - paragraph paragraph - paragraph (when a paragraph is - split across columns and the last word of the first - paragraph DOES NOT continue in the second paragraph) - drop-cap - paragraph (when the drop-cap is a whole word) - - Examples for 'join': word - word (separated word at the - end of a line) drop-cap - paragraph (when the drop-cap - is not a whole word) paragraph - paragraph (when a - pragraph is split across columns and the last word of - the first paragraph DOES continue in the second - paragraph) - - - - - - - Semantic labels / tags - - - - - - - - - - - - - - - - - For generic use - - - - - - Text production type - - - - - - - - - - - - - - - Monospace (fixed-pitch, non-proportional) or - proportional font - - - - - - For instance: Arial, Times New Roman. Add more - information if necessary (e.g. blackletter, - antiqua). - - - - - - - Serif or sans-serif typeface - - - - - - - - The size of the characters in points - - - - - - The x-height or corpus size refers to the distance between the baseline and the mean line of lower-case letters in a typeface. The unit is assumed to be pixels. - - - - - - The degree of space (in points) between the - characters in a string of text - - - - - - - Text colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value) - - - - - Background colour - - - - - Background colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value) - - - - - - Specifies whether the colour of the text appears - reversed against a background colour - - - - - - - - - - - - - - - - - - - Alternative region images (e.g. 
- black-and-white) - - - - - - - - - Semantic labels / tags - - - - - - Roles the region takes (e.g. in context of a - parent region) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - For generic use - - - - - - Is this region a continuation of another region (in previous column or page, for example)? - - - - - - - - - Confidence value (between 0 and 1) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Examples: "123.456", "+1234.456", "-1234.456", "-.456", "-456" - - - - Examples: "123.456", "+1234.456", "-1.2344e56", "-.45E-6", "INF", "-INF", "NaN" - - - - Examples: "123456", "+00000012", "-1", "-456" - - - - Examples: "true", "false", "1", "0" - - - - Examples: "2001-10-26", "2001-10-26+02:00", "2001-10-26Z", "2001-10-26+00:00", "-2001-10-26", "-20000-04-01" - - - - Examples: "21:32:52", "21:32:52+02:00", "19:32:52Z", "19:32:52+00:00", "21:32:52.12679" - - - - Examples: "2001-10-26T21:32:52", "2001-10-26T21:32:52+02:00", "2001-10-26T19:32:52Z", "2001-10-26T19:32:52+00:00", "-2001-10-26T21:32:52", "2001-10-26T21:32:52.12679" - - - - Generic text string - - - - An XSD type that is not listed or a custom type (use dataTypeDetails attribute) - - - - - - - - - - Container for graphemes, grapheme groups and - non-printing characters - - - - - - - - - - - - Base type for graphemes, grapheme groups and non-printing characters - - - - - - - - - Order index of grapheme, group, or non-printing character within the parent container (graphemes or glyph or grapheme group) - - - - - - - - - - - Type of character represented by the grapheme/group/non-printing character element - - - - - - - - - - - For generic use - - - For generic use - - - - - Represents a sub-element of a glyph. Smallest graphical unit that can be assigned a Unicode code point - - - - - - - - - - - - - A glyph component without visual representation but with Unicode code point. Non-visual / non-printing / control character. Part of grapheme container (of glyph) or grapheme sub group. 
- - - - - - - - - - - - - - - - - - - - - Container for user-defined attributes - - - - - - - - - Structured custom data defined by name, type and value. - - - - - - - - - - - - - - - - - - - - Cell position in table starting with row 0 - - - - Cell position in table starting with column 0 - - - - Number of rows the cell spans (optional; default is 1) - - - - Number of columns the cell spans (optional; default is 1) - - - - - Is the cell a column or row header? - - - - - - - - - - Data for a region that takes on the role of a table cell within a parent table region - - - - diff --git a/xsd/pagecontent.2019-07-15.xsd b/xsd/pagecontent.2019-07-15.xsd deleted file mode 100644 index 40c6f24..0000000 --- a/xsd/pagecontent.2019-07-15.xsd +++ /dev/null @@ -1,2674 +0,0 @@ - - - - - - - Page Content - Ground Truth and Storage - - - - - - - - - - - - - - - - The timestamp has to be in UTC (Coordinated - Universal Time) and not local time. - - - - - - - The timestamp has to be in UTC - (Coordinated Universal Time) - and not local time. - - - - - - - - - - - - - External reference of any kind - - - - - - - - Semantic labels / tags - - - - - - - Type of metadata (e.g. author) - - - - - - - - - - - - - - - E.g. imagePhotometricInterpretation - - - - - - E.g. RGB - - - - - - - - - - A semantic label / tag - - - - - - - - Reference to external model / ontology / schema - - - - - - - E.g. an RDF resource identifier - (to be used as subject or object of an RDF triple) - - - - - - - Prefix for all labels (e.g. first part of an URI) - - - - - - - - Semantic label - - - - - The label / tag (e.g. 'person'). - Can be an RDF resource identifier - (e.g. object of an RDF triple). - - - - - - - Additional information on the label - (e.g. 'YYYY-mm-dd' for a date label). - Can be used as predicate of an RDF triple. - - - - - - - - - - - - Alternative document page images - (e.g. black-and-white). - - - - - - - - - - Order of blocks within the page. 
- - - - - - Unassigned regions are considered to be in the - (virtual) default layer which is to be treated - as below any other layers. - - - - - - - - Default text style - - - - - - - Semantic labels / tags - - - - - - - - - - - - - - - - - - - - - - - - Contains the image file name including the file extension. - - - - - - Specifies the width of the image. - - - - - Specifies the height of the image. - - - - - Specifies the image resolution in width. - - - - - Specifies the image resolution in height. - - - - - - Specifies the unit of the resolution information - referring to a standardised unit of measurement - (pixels per inch, pixels per centimeter or other). - - - - - - - - - - - - - For generic use - - - - - - The angle the rectangle encapsulating the page - (or its Border) has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - (The rotated image can be further referenced - via “AlternativeImage”.) - Range: -179.999,180 - - - - - - - The type of the page within the document - (e.g. cover page). - - - - - - - The primary language used in the page - (lower-level definitions override the page-level definition). - - - - - - - The secondary language used in the page - (lower-level definitions override the page-level definition). - - - - - - - The primary script used in the page - (lower-level definitions override the page-level definition). - - - - - - - The secondary script used in the page - (lower-level definitions override the page-level definition). - - - - - - - The direction in which text within lines - should be read (order of words and characters), - in addition to “textLineOrder” - (lower-level definitions override the page-level definition). - - - - - - - The order of text lines within a block, - in addition to “readingDirection” - (lower-level definitions override the page-level definition). 
- - - - - - Confidence value for whole page (between 0 and 1) - - - - - - - Pure text is represented as a text region. This includes - drop capitals, but practically ornate text may be - considered as a graphic. - - - - - - - - - - - - - The angle the rectangle encapsulating the region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - (The rotated image can be further referenced - via “AlternativeImage”.) - Range: -179.999,180 - - - - - - - The nature of the text in the region - - - - - - - The degree of space in points between the lines of - text (line spacing) - - - - - - - The direction in which text within lines - should be read (order of words and characters), - in addition to “textLineOrder”. - - - - - - - The order of text lines within the block, - in addition to “readingDirection”. - - - - - - - The angle the baseline of text within the region - has to be rotated (relative to the rectangle - encapsulating the region) in clockwise direction - in order to correct the present skew, - in addition to “orientation” - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - - - - - - - Defines whether a region of text is indented or not - - - - - - Text align - - - - - - The primary language used in the region - - - - - - - The secondary language used in the region - - - - - - - The primary script used in the region - - - - - - - The secondary script used in the region - - - - - - - - - - - Point list with format "x1,y1 x2,y2 ..." - - - - - Confidence value (between 0 and 1) - - - - - - - - - Alternative text line images (e.g. 
- black-and-white) - - - - - - - - Multiple connected points that mark the baseline - of the glyphs - - - - - - - - - - - - - - Semantic labels / tags - - - - - - - - Overrides primaryLanguage attribute of parent text - region - - - - - - - The primary script used in the text line - - - - - - - The secondary script used in the text line - - - - - - - The direction in which text within the line - should be read (order of words and characters). - - - - - - - Overrides the production attribute of the parent - text region - - - - - - For generic use - - - - - - - Position (order number) of this text line within the - parent text region. - - - - - - - - - - Alternative word images (e.g. - black-and-white) - - - - - - - - - - - - - - - - Semantic labels / tags - - - - - - - - Overrides primaryLanguage attribute of parent line - and/or text region - - - - - - - The primary script used in the word - - - - - - - The secondary script used in the word - - - - - - - The direction in which text within the word - should be read (order of characters). - - - - - - - Overrides the production attribute of the parent - text line and/or text region. - - - - - - For generic use - - - - - - - - - - Alternative glyph images (e.g. - black-and-white) - - - - - - - - Container for graphemes, grapheme groups and - non-printing characters - - - - - - - - - - - - Semantic labels / tags - - - - - - - - - - The script used for the glyph - - - - - - - Overrides the production attribute of the parent - word / text line / text region. - - - - - - For generic use - - - - - - - - - - Text in a "simple" form (ASCII or extended ASCII - as mostly used for typing). I.e. no use of - special characters for ligatures (should be - stored as two separate characters) etc. - - - - - - - Correct encoding of the original, always using - the corresponding Unicode code point. I.e. - ligatures have to be represented as one - character etc. - - - - - - - - Used for sort order in case multiple TextEquivs are defined. 
- The text content with the lowest index should be interpreted - as the main text content. - - - - - - - - - - - OCR confidence value (between 0 and 1) - - - - - - Type of text content (is it free text or a number, for instance). - This is only a descriptive attribute, the text type - is not checked during XML validation. - - - - - - - Refinement for dataType attribute. Can be a regular expression, for instance. - - - - - - - - - - An image is considered to be more intricate and complex - than a graphic. These can be photos or drawings. - - - - - - - - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - - - - - - - The colour bit depth required for the region - - - - - - - The background colour of the region - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - A line drawing is a single colour illustration without - solid areas. - - - - - - - - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - - - - - - - The pen (foreground) colour of the region - - - - - - - The background colour of the region - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - Regions containing simple graphics, such as a company - logo, should be marked as graphic regions. - - - - - - - - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - - - - - - - The type of graphic in the region - - - - - - - An approximation of the number of colours - used in the region - - - - - - - Specifies whether the region also contains - text. 
- - - - - - - - - - Tabular data in any form is represented with a table - region. Rows and columns may or may not have separator - lines; these lines are not separator regions. - - - - - - - - Table grid (visible or virtual grid lines) - - - - - - - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - - - - - - - The number of rows present in the table - - - - - - - The number of columns present in the table - - - - - - - The colour of the lines used in the region - - - - - - - The background colour of the region - - - - - - - Specifies the presence of line separators - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - Matrix of grid points defining the table grid on the page. - - - - - - - One row in the grid point matrix. - Points with x,y coordinates. - (note: for a table with n table rows there should be n+1 grid rows) - - - - - - - - Points with x,y coordinates. - - - - - The grid row index - - - - - - - - - Regions containing charts or graphs of any type, should - be marked as chart regions. - - - - - - - - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - - - - - - - The type of chart in the region - - - - - - - An approximation of the number of colours - used in the region - - - - - - - The background colour of the region - - - - - - - Specifies whether the region also contains - text - - - - - - - - - - Separators are lines that lie between columns and - paragraphs and can be used to logically separate - different articles from each other. 
- - - - - - - - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - - - - - - - The colour of the separator - - - - - - - - - - Regions containing equations and mathematical symbols - should be marked as maths regions. - - - - - - - - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - - - - - - - The background colour of the region - - - - - - - - - - Regions containing chemical formulas. - - - - - - - - The angle the rectangle encapsulating a - region has to be rotated in clockwise - direction in order to correct the present - skew (negative values indicate - anti-clockwise rotation). Range: - -179.999,180 - - - - - - - The background colour of the region - - - - - - - - - - Regions containing maps. - - - - - - - - The angle the rectangle encapsulating a - region has to be rotated in clockwise - direction in order to correct the present - skew (negative values indicate - anti-clockwise rotation). Range: - -179.999,180 - - - - - - - - - - Regions containing musical notations. - - - - - - - - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - - - - - - - The background colour of the region - - - - - - - - - - Regions containing advertisements. - - - - - - - - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). 
- Range: -179.999,180 - - - - - - - The background colour of the region - - - - - - - - - - Noise regions are regions where no real data lies, only - false data created by artifacts on the document or - scanner noise. - - - - - - - - - - To be used if the region type cannot be ascertained. - - - - - - - - - - Regions containing content that is not covered - by the default types (text, graphic, image, - line drawing, chart, table, separator, maths, - map, music, chem, advert, noise, unknown). - - - - - - - - Information on the type of content represented by this region - - - - - - - - - - Determines the effective area on the paper of a printed page. - Its size is equal for all pages of a book - (exceptions: titlepage, multipage pictures). - It contains all living elements (except marginals) - like body type, footnotes, headings, running titles. - It does not contain pagenumber (if not part of running title), - marginals, signature mark, preview words. - - - - - - - - - - Definition of the reading order within the page. - To express a reading order between elements - they have to be included in an OrderedGroup. - Groups may contain further groups. - - - - - - - - - Confidence value (between 0 and 1) - - - - - - Numbered region - - - - Position (order number) of this item within the current hierarchy level. - - - - - - - - Indexed group containing ordered elements - - - - - - - Semantic labels / tags - - - - - - - - - - - - - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. - Only the nested regions should be allowed as group members. - - - - - - - Position (order number) of this item within the - current hierarchy level. - - - - - - - - - Is this group a continuation of another group (from - previous column or page, for example)? 
- - - - - - For generic use - - - - - - - - Indexed group containing unordered elements - - - - - - - - Semantic labels / tags - - - - - - - - - - - - - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. - Only the nested regions should be allowed as group members. - - - - - - - Position (order number) of this item within the - current hierarchy level. - - - - - - - - - Is this group a continuation of another group - (from previous column or page, for example)? - - - - - - For generic use - - - - - - - - - - - Numbered group (contains ordered elements) - - - - - - - - Semantic labels / tags - - - - - - - - - - - - - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. - Only the nested regions should be allowed as group members. - - - - - - - - - Is this group a continuation of another group - (from previous column or page, for example)? - - - - - - For generic use - - - - - - - - Numbered group (contains unordered elements) - - - - - - - - Semantic labels / tags - - - - - - - - - - - - - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. - Only the nested regions should be allowed as group members. - - - - - - - - - Is this group a continuation of another group - (from previous column or page, for example)? - - - - - - For generic use - - - - - - - - Border of the actual page (if the scanned image - contains parts not belonging to the page). 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ISO 639.x 2016-07-14 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - iso15924 2016-07-14 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Can be used to express the z-index of overlapping - regions. An element with a greater z-index is always in - front of another element with lower z-index. - - - - - - - - - - - - - - - - - - - - - - Confidence value (between 0 and 1) - - - - - - - Point list with format "x1,y1 x2,y2 ..." - - - - - - - - - - Container for one-to-one relations between layout - objects (for example: DropCap - paragraph, caption - - image). - - - - - - - - - - - One-to-one relation between to layout object. Use 'link' - for loose relations and 'join' for strong relations - (where something is fragmented for instance). 
- - Examples for 'link': caption - image floating - - paragraph paragraph - paragraph (when a paragraph is - split across columns and the last word of the first - paragraph DOES NOT continue in the second paragraph) - drop-cap - paragraph (when the drop-cap is a whole word) - - Examples for 'join': word - word (separated word at the - end of a line) drop-cap - paragraph (when the drop-cap - is not a whole word) paragraph - paragraph (when a - pragraph is split across columns and the last word of - the first paragraph DOES continue in the second - paragraph) - - - - - - Semantic labels / tags - - - - - - - - - - - - - - - - - - - For generic use - - - - - - - - Text production type - - - - - - - - - - - - - - - Monospace (fixed-pitch, non-proportional) or - proportional font. - - - - - - For instance: Arial, Times New Roman. - Add more information if necessary - (e.g. blackletter, antiqua). - - - - - - - Serif or sans-serif typeface. - - - - - - - - The size of the characters in points. - - - - - - - The x-height or corpus size refers to the distance - between the baseline and the mean line of - lower-case letters in a typeface. - The unit is assumed to be pixels. - - - - - - - The degree of space (in points) between - the characters in a string of text. - - - - - - - - Text colour in RGB encoded format - (red value) + (256 x green value) + (65536 x blue value). - - - - - - Background colour - - - - - - Background colour in RGB encoded format - (red value) + (256 x green value) + (65536 x blue value). - - - - - - - Specifies whether the colour of the text appears - reversed against a background colour. - - - - - - - - - Line style details if "underlined" is TRUE - - - - - - - - - - - - - - - - - Alternative region images - (e.g. black-and-white). - - - - - - - - - Semantic labels / tags - - - - - - Roles the region takes - (e.g. in context of a parent region). 
- - - - - - - - - - - - - - - - - - - - - - - - For generic use - - - - - - - Is this region a continuation of another region - (in previous column or page, for example)? - - - - - - - - - - - Confidence value (between 0 and 1) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Examples: - "123.456", "+1234.456", - "-1234.456", "-.456", "-456" - - - - - - - Examples: - "123.456", "+1234.456", "-1.2344e56", - "-.45E-6", "INF", "-INF", "NaN" - - - - - - - Examples: - "123456", "+00000012", "-1", "-456" - - - - - - - Examples: "true", "false", "1", "0" - - - - - - - Examples: - "2001-10-26", "2001-10-26+02:00", - "2001-10-26Z", "2001-10-26+00:00", - "-2001-10-26", "-20000-04-01" - - - - - - - Examples: - "21:32:52", "21:32:52+02:00", "19:32:52Z", - "19:32:52+00:00", "21:32:52.12679" - - - - - - - Examples: - "2001-10-26T21:32:52", "2001-10-26T21:32:52+02:00", - "2001-10-26T19:32:52Z", "2001-10-26T19:32:52+00:00", - "-2001-10-26T21:32:52", "2001-10-26T21:32:52.12679" - - - - - - Generic text string - - - - - - An XSD type that is not listed or a custom type - (use dataTypeDetails attribute). - - - - - - - - - - - - Container for graphemes, grapheme groups and - non-printing characters. - - - - - - - - - - - - Base type for graphemes, grapheme groups and non-printing characters. - - - - - - - - - - Order index of grapheme, group, or non-printing character - within the parent container (graphemes or glyph or grapheme group). - - - - - - - - - - - - - Type of character represented by the - grapheme, group, or non-printing character element. - - - - - - - - - - - - For generic use - - - - - For generic use - - - - - - - Represents a sub-element of a glyph. - Smallest graphical unit that can be - assigned a Unicode code point. - - - - - - - - - - - - - - A glyph component without visual representation - but with Unicode code point. - Non-visual / non-printing / control character. - Part of grapheme container (of glyph) or grapheme sub group. 
- - - - - - - - - - - - - - - - - - - - - Container for user-defined attributes - - - - - - - - - Structured custom data defined by name, type and value. - - - - - - - - - - - - - - - - - - - - Cell position in table starting with row 0 - - - - - Cell position in table starting with column 0 - - - - - Number of rows the cell spans (optional; default is 1) - - - - - Number of columns the cell spans (optional; default is 1) - - - - - - Is the cell a column or row header? - - - - - - - - - - Data for a region that takes on the role - of a table cell within a parent table region. - - - - - - - - - - - - - From 9680dd8299f213ea1b29c64ad7b72f6c05cbed1f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 14 Aug 2020 17:17:25 +0200 Subject: [PATCH 03/16] =?UTF-8?q?=E2=9A=99=EF=B8=8F=20Install=20pip=20via?= =?UTF-8?q?=20get-pip.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-boxed-base | 20 ++++++++++++++------ Dockerfile-boxed-ocrd_olena | 3 +-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/Dockerfile-boxed-base b/Dockerfile-boxed-base index 3fad690..2b3c8c6 100644 --- a/Dockerfile-boxed-base +++ b/Dockerfile-boxed-base @@ -8,7 +8,9 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ apt-get update && \ apt-get install -y \ curl xz-utils \ - python3-pip \ + build-essential python3-dev \ +# For get-pip.py: + python3-distutils \ # For add-apt-repository: software-properties-common \ # XML utils @@ -23,11 +25,17 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ COPY ocrd_logging.py /etc/ -# Build pip installable stuff -RUN pip3 install --no-cache-dir --upgrade pip && \ - pip3 install --no-cache-dir \ -# Resolve conflicts early: - 'setuptools >= 41.0.0' \ +# Install pip (and setuptools) +# We use get-pip.py here to avoid +# a. having to upgrade from Ubuntu's pip +# b. 
the dreaded "old script wrapper" error message +RUN curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3 get-pip.py && \ + rm -f get-pip.py + + +# Install pip installable-stuff +RUN pip3 install --no-cache-dir \ 'ocrd >= 2.13.1' diff --git a/Dockerfile-boxed-ocrd_olena b/Dockerfile-boxed-ocrd_olena index 98f36bf..09675a6 100644 --- a/Dockerfile-boxed-ocrd_olena +++ b/Dockerfile-boxed-ocrd_olena @@ -14,8 +14,7 @@ RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd apt-get update && \ apt-get -f install -y && \ apt-get clean && rm -rf /var/lib/apt/lists/* -RUN pip3 install --no-cache-dir --upgrade pip && \ - curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ +RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ mkdir ocrd_olena && \ tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ cd ocrd_olena && \ From a820d72526523d835c7fdedc5bdee30fd4ca4920 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 14 Aug 2020 17:39:31 +0200 Subject: [PATCH 04/16] =?UTF-8?q?=F0=9F=A7=B9=20s/base/core?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-boxed-base => Dockerfile-boxed-core | 0 Dockerfile-boxed-dinglehopper | 2 +- Dockerfile-boxed-ocrd_calamari | 2 +- Dockerfile-boxed-ocrd_olena | 2 +- Dockerfile-boxed-ocrd_tesserocr | 2 +- Dockerfile-boxed-sbb_textline_detector | 2 +- build | 2 +- run | 2 +- 8 files changed, 7 insertions(+), 7 deletions(-) rename Dockerfile-boxed-base => Dockerfile-boxed-core (100%) diff --git a/Dockerfile-boxed-base b/Dockerfile-boxed-core similarity index 100% rename from Dockerfile-boxed-base rename to Dockerfile-boxed-core diff --git a/Dockerfile-boxed-dinglehopper b/Dockerfile-boxed-dinglehopper index aa4749c..312121c 100644 --- a/Dockerfile-boxed-dinglehopper +++ 
b/Dockerfile-boxed-dinglehopper @@ -1,4 +1,4 @@ -FROM boxed-base +FROM boxed-core ENV DINGLEHOPPER_COMMIT 2b98f69 diff --git a/Dockerfile-boxed-ocrd_calamari b/Dockerfile-boxed-ocrd_calamari index a64a0c1..4fbe5d7 100644 --- a/Dockerfile-boxed-ocrd_calamari +++ b/Dockerfile-boxed-ocrd_calamari @@ -1,4 +1,4 @@ -FROM boxed-base +FROM boxed-core # Build pip installable stuff diff --git a/Dockerfile-boxed-ocrd_olena b/Dockerfile-boxed-ocrd_olena index 09675a6..0d4ced8 100644 --- a/Dockerfile-boxed-ocrd_olena +++ b/Dockerfile-boxed-ocrd_olena @@ -1,4 +1,4 @@ -FROM boxed-base +FROM boxed-core ENV OCRD_OLENA_VERSION 1.2.0 diff --git a/Dockerfile-boxed-ocrd_tesserocr b/Dockerfile-boxed-ocrd_tesserocr index c0ca4ff..125b6b0 100644 --- a/Dockerfile-boxed-ocrd_tesserocr +++ b/Dockerfile-boxed-ocrd_tesserocr @@ -1,4 +1,4 @@ -FROM boxed-base +FROM boxed-core ENV TESSDATA_BEST_VERSION 4.0.0 diff --git a/Dockerfile-boxed-sbb_textline_detector b/Dockerfile-boxed-sbb_textline_detector index 4274725..75386d1 100644 --- a/Dockerfile-boxed-sbb_textline_detector +++ b/Dockerfile-boxed-sbb_textline_detector @@ -1,4 +1,4 @@ -FROM boxed-base +FROM boxed-core ENV SBB_TEXTLINE_DETECTOR_COMMIT 8b01d9e diff --git a/build b/build index 3a50a4d..f19907e 100755 --- a/build +++ b/build @@ -21,7 +21,7 @@ get_from_web() { handle_data -docker build -t boxed-base -f Dockerfile-boxed-base . +docker build -t boxed-core -f Dockerfile-boxed-core . docker build -t boxed-ocrd_calamari -f Dockerfile-boxed-ocrd_calamari . docker build -t boxed-dinglehopper -f Dockerfile-boxed-dinglehopper . docker build -t boxed-ocrd_olena -f Dockerfile-boxed-ocrd_olena . 
diff --git a/run b/run index 5fb76c4..b2ce3ea 100755 --- a/run +++ b/run @@ -31,7 +31,7 @@ build_alias() { alias $command="docker run $docker_run_options $docker_image $command" } shopt -s expand_aliases # Required for non-interactive shells -build_alias ocrd boxed-base +build_alias ocrd boxed-core build_alias ocrd-olena-binarize boxed-ocrd_olena build_alias ocrd-sbb-textline-detector boxed-sbb_textline_detector build_alias ocrd-calamari-recognize boxed-ocrd_calamari From 73ffa01d12316c58dbcc15184ce6d7c3dd56ad71 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 14 Aug 2020 17:52:57 +0200 Subject: [PATCH 05/16] =?UTF-8?q?=F0=9F=8E=A8=20Rename=20boxed-*=20to=20my?= =?UTF-8?q?=5Focrd=5Fworkflow-*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-boxed-core => Dockerfile-core | 0 ...ile-boxed-dinglehopper => Dockerfile-dinglehopper | 2 +- ...e-boxed-ocrd_calamari => Dockerfile-ocrd_calamari | 2 +- Dockerfile-boxed-ocrd_olena => Dockerfile-ocrd_olena | 2 +- ...boxed-ocrd_tesserocr => Dockerfile-ocrd_tesserocr | 2 +- ...line_detector => Dockerfile-sbb_textline_detector | 2 +- build | 12 ++++++------ run | 12 ++++++------ 8 files changed, 17 insertions(+), 17 deletions(-) rename Dockerfile-boxed-core => Dockerfile-core (100%) rename Dockerfile-boxed-dinglehopper => Dockerfile-dinglehopper (91%) rename Dockerfile-boxed-ocrd_calamari => Dockerfile-ocrd_calamari (94%) rename Dockerfile-boxed-ocrd_olena => Dockerfile-ocrd_olena (97%) rename Dockerfile-boxed-ocrd_tesserocr => Dockerfile-ocrd_tesserocr (96%) rename Dockerfile-boxed-sbb_textline_detector => Dockerfile-sbb_textline_detector (93%) diff --git a/Dockerfile-boxed-core b/Dockerfile-core similarity index 100% rename from Dockerfile-boxed-core rename to Dockerfile-core diff --git a/Dockerfile-boxed-dinglehopper b/Dockerfile-dinglehopper similarity index 91% rename from Dockerfile-boxed-dinglehopper rename to Dockerfile-dinglehopper index 312121c..dfa9c1a 
100644 --- a/Dockerfile-boxed-dinglehopper +++ b/Dockerfile-dinglehopper @@ -1,4 +1,4 @@ -FROM boxed-core +FROM my_ocrd_workflow-core ENV DINGLEHOPPER_COMMIT 2b98f69 diff --git a/Dockerfile-boxed-ocrd_calamari b/Dockerfile-ocrd_calamari similarity index 94% rename from Dockerfile-boxed-ocrd_calamari rename to Dockerfile-ocrd_calamari index 4fbe5d7..209a2d8 100644 --- a/Dockerfile-boxed-ocrd_calamari +++ b/Dockerfile-ocrd_calamari @@ -1,4 +1,4 @@ -FROM boxed-core +FROM my_ocrd_workflow-core # Build pip installable stuff diff --git a/Dockerfile-boxed-ocrd_olena b/Dockerfile-ocrd_olena similarity index 97% rename from Dockerfile-boxed-ocrd_olena rename to Dockerfile-ocrd_olena index 0d4ced8..25b447d 100644 --- a/Dockerfile-boxed-ocrd_olena +++ b/Dockerfile-ocrd_olena @@ -1,4 +1,4 @@ -FROM boxed-core +FROM my_ocrd_workflow-core ENV OCRD_OLENA_VERSION 1.2.0 diff --git a/Dockerfile-boxed-ocrd_tesserocr b/Dockerfile-ocrd_tesserocr similarity index 96% rename from Dockerfile-boxed-ocrd_tesserocr rename to Dockerfile-ocrd_tesserocr index 125b6b0..92cb263 100644 --- a/Dockerfile-boxed-ocrd_tesserocr +++ b/Dockerfile-ocrd_tesserocr @@ -1,4 +1,4 @@ -FROM boxed-core +FROM my_ocrd_workflow-core ENV TESSDATA_BEST_VERSION 4.0.0 diff --git a/Dockerfile-boxed-sbb_textline_detector b/Dockerfile-sbb_textline_detector similarity index 93% rename from Dockerfile-boxed-sbb_textline_detector rename to Dockerfile-sbb_textline_detector index 75386d1..ed4e44b 100644 --- a/Dockerfile-boxed-sbb_textline_detector +++ b/Dockerfile-sbb_textline_detector @@ -1,4 +1,4 @@ -FROM boxed-core +FROM my_ocrd_workflow-core ENV SBB_TEXTLINE_DETECTOR_COMMIT 8b01d9e diff --git a/build b/build index f19907e..d0ad1fa 100755 --- a/build +++ b/build @@ -21,9 +21,9 @@ get_from_web() { handle_data -docker build -t boxed-core -f Dockerfile-boxed-core . -docker build -t boxed-ocrd_calamari -f Dockerfile-boxed-ocrd_calamari . -docker build -t boxed-dinglehopper -f Dockerfile-boxed-dinglehopper . 
-docker build -t boxed-ocrd_olena -f Dockerfile-boxed-ocrd_olena . -docker build -t boxed-ocrd_tesserocr -f Dockerfile-boxed-ocrd_tesserocr . -docker build -t boxed-sbb_textline_detector -f Dockerfile-boxed-sbb_textline_detector . +docker build -t my_ocrd_workflow-core -f Dockerfile-core . +docker build -t my_ocrd_workflow-ocrd_calamari -f Dockerfile-ocrd_calamari . +docker build -t my_ocrd_workflow-dinglehopper -f Dockerfile-dinglehopper . +docker build -t my_ocrd_workflow-ocrd_olena -f Dockerfile-ocrd_olena . +docker build -t my_ocrd_workflow-ocrd_tesserocr -f Dockerfile-ocrd_tesserocr . +docker build -t my_ocrd_workflow-sbb_textline_detector -f Dockerfile-sbb_textline_detector . diff --git a/run b/run index b2ce3ea..de48bc9 100755 --- a/run +++ b/run @@ -31,12 +31,12 @@ build_alias() { alias $command="docker run $docker_run_options $docker_image $command" } shopt -s expand_aliases # Required for non-interactive shells -build_alias ocrd boxed-core -build_alias ocrd-olena-binarize boxed-ocrd_olena -build_alias ocrd-sbb-textline-detector boxed-sbb_textline_detector -build_alias ocrd-calamari-recognize boxed-ocrd_calamari -build_alias ocrd-tesserocr-recognize boxed-ocrd_tesserocr -build_alias ocrd-dinglehopper boxed-dinglehopper +build_alias ocrd my_ocrd_workflow-core +build_alias ocrd-olena-binarize my_ocrd_workflow-ocrd_olena +build_alias ocrd-sbb-textline-detector my_ocrd_workflow-sbb_textline_detector +build_alias ocrd-calamari-recognize my_ocrd_workflow-ocrd_calamari +build_alias ocrd-tesserocr-recognize my_ocrd_workflow-ocrd_tesserocr +build_alias ocrd-dinglehopper my_ocrd_workflow-dinglehopper . 
$self_dir/my_ocrd_workflow From 21c1f310b1411be1cf488ea2f32ce2c98fae9f0c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 14 Aug 2020 17:54:13 +0200 Subject: [PATCH 06/16] =?UTF-8?q?=F0=9F=9A=A7=20Travis:=20Fix=20build=20st?= =?UTF-8?q?age?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 582aadc..70283c3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,14 +21,16 @@ jobs: - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin # We are using the image my_ocrd_workflow to cache, so pull and tag it - - docker pull $DOCKER_USERNAME/my_ocrd_workflow - - docker tag $DOCKER_USERNAME/my_ocrd_workflow my_ocrd_workflow + # FIXME + #- docker pull $DOCKER_USERNAME/my_ocrd_workflow + #- docker tag $DOCKER_USERNAME/my_ocrd_workflow my_ocrd_workflow - FORCE_DOWNLOAD=y ./build - - docker tag my_ocrd_workflow $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT - - docker images - - docker push $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT + # FIXME + #- docker tag my_ocrd_workflow $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT + #- docker images + #- docker push $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT - stage: "Test" script: From b0157ff1a290f7ae159acafae444dddf952c8818 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 14 Aug 2020 18:02:42 +0200 Subject: [PATCH 07/16] =?UTF-8?q?=F0=9F=9A=A7=20Travis:=20Fix=20build=20st?= =?UTF-8?q?age?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 70283c3..71f04a4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,21 +16,31 @@ stages: jobs: include: - - stage: "Build Docker image" + - stage: "Build Docker images" script: + - sub_images=`ls -1 
Dockerfile-* | sed 's/Dockerfile-//'` - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin - # We are using the image my_ocrd_workflow to cache, so pull and tag it - # FIXME - #- docker pull $DOCKER_USERNAME/my_ocrd_workflow - #- docker tag $DOCKER_USERNAME/my_ocrd_workflow my_ocrd_workflow + # We are using the images to cache, so pull and tag it + - | + for x in sub_images; do + docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x || true + docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x my_ocrd_workflow-$x || true + done - FORCE_DOWNLOAD=y ./build - # FIXME - #- docker tag my_ocrd_workflow $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT - #- docker images - #- docker push $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT + - | + set -e + for x in sub_images; do + docker tag my_ocrd_workflow-$x $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT + done + - docker images + - | + set -e + for x in sub_images; do + docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT + done - stage: "Test" script: From eb820bac72e408cadd4739d6e92d79794ca06a06 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 14 Aug 2020 18:04:19 +0200 Subject: [PATCH 08/16] =?UTF-8?q?=F0=9F=9A=A7=20Travis:=20Fix=20build=20st?= =?UTF-8?q?age?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 71f04a4..a58ce2b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ git: submodules: false # Avoid trying to checkout private data/ submodule stages: - - name: "Build Docker image" + - name: "Build Docker images" - name: "Test" - name: "Deploy Docker image - latest" if: branch = master From 0074ac405e5b9ea03f2e3e54af94a591bd2cbbcd Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 14 Aug 2020 18:06:07 +0200 Subject: [PATCH 09/16] =?UTF-8?q?=F0=9F=9A=A7=20Travis:=20Fix=20build=20st?= =?UTF-8?q?age?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a58ce2b..b68d2a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ jobs: # We are using the images to cache, so pull and tag it - | - for x in sub_images; do + for x in $sub_images; do docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x || true docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x my_ocrd_workflow-$x || true done @@ -32,13 +32,13 @@ jobs: - | set -e - for x in sub_images; do + for x in $sub_images; do docker tag my_ocrd_workflow-$x $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT done - docker images - | set -e - for x in sub_images; do + for x in $sub_images; do docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT done From fc853d4d138d7e2d88148353edabcc6d204926ec Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 24 Aug 2020 17:23:28 +0200 Subject: [PATCH 10/16] =?UTF-8?q?=F0=9F=90=9B=20Handle=20missing=20pip3=20?= =?UTF-8?q?in=20the=20main=20script?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- my_ocrd_workflow | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/my_ocrd_workflow b/my_ocrd_workflow index 30244ca..923a85a 100755 --- a/my_ocrd_workflow +++ b/my_ocrd_workflow @@ -82,7 +82,9 @@ main() { if [ "$LOG_LEVEL" = "DEBUG" -o "$LOG_LEVEL" = "TRACE" ]; then - pip3 list + if which pip3; then + pip3 list + fi fi main From 848a1eb6c381ef4f09dec6e393cfd1900aeced18 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 24 Aug 2020 17:25:13 +0200 Subject: [PATCH 11/16] =?UTF-8?q?=F0=9F=9A=A7=20Travis:=20Pull=20the=20cor?= =?UTF-8?q?rect=20images=20in=20the=20test=20stage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/.travis.yml b/.travis.yml index b68d2a9..04b7523 100644 --- a/.travis.yml +++ b/.travis.yml @@ -44,8 +44,11 @@ jobs: - stage: "Test" script: - - docker pull $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT - - docker tag $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT my_ocrd_workflow + - | + for x in $sub_images; do + docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x || true + docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x my_ocrd_workflow-$x || true + done - curl -O https://qurator-data.de/examples/actevedef_718448162.first-page.zip - unzip actevedef_718448162.first-page.zip From d4cc0c16c09e459eaaf15aa02a9cec5707ef7810 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 24 Aug 2020 17:39:34 +0200 Subject: [PATCH 12/16] =?UTF-8?q?=F0=9F=9A=A7=20Travis:=20Fix=20tagging/pu?= =?UTF-8?q?shing/pulling=20the=20correct=20images?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 04b7523..23f53b4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -46,8 +46,8 @@ jobs: script: - | for x in $sub_images; do - docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x || true - docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x my_ocrd_workflow-$x || true + docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT + docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT my_ocrd_workflow-$x done - curl -O https://qurator-data.de/examples/actevedef_718448162.first-page.zip @@ -65,15 +65,21 @@ jobs: script: - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin - - docker pull $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT - - docker tag $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG - - docker push $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG + - | + for x in $sub_images; do + docker pull 
$DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT + docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT my_ocrd_workflow-$x:$DOCKER_TAG + docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG + done - stage: "Deploy Docker image - tagged" env: DOCKER_TAG=$TRAVIS_TAG script: - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin - - docker pull $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT - - docker tag $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG - - docker push $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG + - | + for x in $sub_images; do + docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT + docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT my_ocrd_workflow-$x:$DOCKER_TAG + docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG + done From 436bb32a82ce0894f4a12baa81f7a8d451c24aea Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 24 Aug 2020 17:40:50 +0200 Subject: [PATCH 13/16] =?UTF-8?q?=F0=9F=9A=A7=20Travis:=20Fix=20tagging/pu?= =?UTF-8?q?shing/pulling=20the=20correct=20images?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 23f53b4..ae5ffb4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -68,7 +68,7 @@ jobs: - | for x in $sub_images; do docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT - docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT my_ocrd_workflow-$x:$DOCKER_TAG + docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG done @@ -80,6 +80,6 @@ jobs: - | for x in $sub_images; do docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT - docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT 
my_ocrd_workflow-$x:$DOCKER_TAG + docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG done From 17c6b15a1bdef4029212172962fd1a186d1eec1c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 24 Aug 2020 18:13:13 +0200 Subject: [PATCH 14/16] =?UTF-8?q?=F0=9F=90=9B=20(Better)=20Handle=20missin?= =?UTF-8?q?g=20pip3=20in=20the=20main=20script?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- my_ocrd_workflow | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/my_ocrd_workflow b/my_ocrd_workflow index 923a85a..d3164be 100755 --- a/my_ocrd_workflow +++ b/my_ocrd_workflow @@ -82,9 +82,7 @@ main() { if [ "$LOG_LEVEL" = "DEBUG" -o "$LOG_LEVEL" = "TRACE" ]; then - if which pip3; then - pip3 list - fi + pip3 list || true fi main From 13c619a2f078c7db89de63da5d75bba040090a77 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 24 Aug 2020 18:45:30 +0200 Subject: [PATCH 15/16] =?UTF-8?q?=F0=9F=9A=A7=20Travis:=20Fix=20tagging/pu?= =?UTF-8?q?shing/pulling=20the=20correct=20images?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.travis.yml b/.travis.yml index ae5ffb4..c4cdf46 100644 --- a/.travis.yml +++ b/.travis.yml @@ -44,6 +44,7 @@ jobs: - stage: "Test" script: + - sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'` - | for x in $sub_images; do docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT @@ -63,6 +64,7 @@ jobs: - stage: "Deploy Docker image - latest" env: DOCKER_TAG=latest script: + - sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'` - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin - | @@ -75,6 +77,7 @@ jobs: - stage: "Deploy Docker image - tagged" env: DOCKER_TAG=$TRAVIS_TAG script: + - sub_images=`ls -1 Dockerfile-* 
| sed 's/Dockerfile-//'` - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin - | From 902f32cb95ab027d5a670df4bd8e20a2a3dc8abf Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 24 Aug 2020 19:35:08 +0200 Subject: [PATCH 16/16] =?UTF-8?q?=F0=9F=A7=B9=20Move=20one-liner=20ocrd=5F?= =?UTF-8?q?logging.py=20to=20an=20echo=20statement?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-core | 2 +- ocrd_logging.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 ocrd_logging.py diff --git a/Dockerfile-core b/Dockerfile-core index 2b3c8c6..da4a850 100644 --- a/Dockerfile-core +++ b/Dockerfile-core @@ -22,7 +22,7 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ # Set up OCR-D logging -COPY ocrd_logging.py /etc/ +RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py # Install pip (and setuptools) diff --git a/ocrd_logging.py b/ocrd_logging.py deleted file mode 100644 index a380b99..0000000 --- a/ocrd_logging.py +++ /dev/null @@ -1 +0,0 @@ -setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))