diff --git a/.travis.yml b/.travis.yml
index 582aadc..c4cdf46 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,7 +7,7 @@ git:
submodules: false # Avoid trying to checkout private data/ submodule
stages:
- - name: "Build Docker image"
+ - name: "Build Docker images"
- name: "Test"
- name: "Deploy Docker image - latest"
if: branch = master
@@ -16,24 +16,40 @@ stages:
jobs:
include:
- - stage: "Build Docker image"
+ - stage: "Build Docker images"
script:
+ - sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'`
- echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
- # We are using the image my_ocrd_workflow to cache, so pull and tag it
- - docker pull $DOCKER_USERNAME/my_ocrd_workflow
- - docker tag $DOCKER_USERNAME/my_ocrd_workflow my_ocrd_workflow
+ # We use the previously pushed images as build cache, so pull and tag them
+ - |
+ for x in $sub_images; do
+ docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x || true
+ docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x my_ocrd_workflow-$x || true
+ done
- FORCE_DOWNLOAD=y ./build
- - docker tag my_ocrd_workflow $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT
+ - |
+ set -e
+ for x in $sub_images; do
+ docker tag my_ocrd_workflow-$x $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT
+ done
- docker images
- - docker push $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT
+ - |
+ set -e
+ for x in $sub_images; do
+ docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT
+ done
- stage: "Test"
script:
- - docker pull $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT
- - docker tag $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT my_ocrd_workflow
+ - sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'`
+ - |
+ for x in $sub_images; do
+ docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT
+ docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT my_ocrd_workflow-$x
+ done
- curl -O https://qurator-data.de/examples/actevedef_718448162.first-page.zip
- unzip actevedef_718448162.first-page.zip
@@ -48,17 +64,29 @@ jobs:
- stage: "Deploy Docker image - latest"
env: DOCKER_TAG=latest
script:
+ - sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'`
- echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
- - docker pull $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT
- - docker tag $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG
- - docker push $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG
+ # Abort on the first failing pull/tag/push so it is not masked by a later image succeeding
+ - |
+ set -e
+ for x in $sub_images; do
+ docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT
+ docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG
+ docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG
+ done
- stage: "Deploy Docker image - tagged"
env: DOCKER_TAG=$TRAVIS_TAG
script:
+ - sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'`
- echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
- - docker pull $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT
- - docker tag $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG
- - docker push $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG
+ # Abort on the first failing pull/tag/push so it is not masked by a later image succeeding
+ - |
+ set -e
+ for x in $sub_images; do
+ docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT
+ docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG
+ docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG
+ done
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index e786a5d..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,86 +0,0 @@
-FROM ubuntu:18.04
-
-ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
-ENV PIP_DEFAULT_TIMEOUT=120
-
-ENV OCRD_OLENA_VERSION 1.2.0
-ENV TESSDATA_BEST_VERSION 4.0.0
-ENV TESSDATA_PREFIX /usr/local/share/tessdata
-
-
-RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
- apt-get update && \
- apt-get install -y \
- curl xz-utils \
- python3-pip \
- git \
- software-properties-common \
-# For clstm on Ubuntu 19.04:
- swig libeigen3-dev libpng-dev libprotobuf-dev \
-# For cv2:
- libsm6 libxrender1 \
-# For ocrd_olena:
- imagemagick \
-# XML utils
- libxml2-utils \
- xmlstarlet \
- && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-
-# Install Leptonica and Tesseract.
-RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
- apt-get update && \
- apt-get install -y \
- tesseract-ocr \
- libtesseract-dev \
- && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-# Set up OCR-D logging
-COPY ocrd_logging.py /etc/
-
-
-# Build ocrd_olena
-# XXX .deb needs an update
-RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \
- dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \
- rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \
- apt-get update && \
- apt-get -f install -y && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-RUN pip3 install --no-cache-dir --upgrade pip && \
- curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \
- mkdir ocrd_olena && \
- tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \
- cd ocrd_olena && \
- sed -i 's/^install: deps$/install:/' Makefile && \
- pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \
- make install PREFIX=/usr/local && \
- cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz
-
-
-# Copy OCR models
-RUN mkdir -p /var/lib/calamari-models
-COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR
-RUN mkdir -p $TESSDATA_PREFIX
-ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/
-COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/
-COPY data/textline_detection /var/lib/textline_detection
-
-
-# Install requirements
-# Using pipdeptree here to get more info than from pip3 check
-COPY requirements.txt /tmp/
-RUN pip3 install --no-cache-dir --upgrade pip && \
- pip3 install --no-cache-dir --use-feature=2020-resolver -r /tmp/requirements.txt && \
- pip3 install --no-cache-dir pipdeptree && \
- pipdeptree -w fail
-
-
-COPY my_ocrd_workflow /usr/bin/
-COPY xsd/* /usr/share/xml/
-
-
-WORKDIR /data
-ENTRYPOINT ["/usr/bin/my_ocrd_workflow"]
diff --git a/Dockerfile-core b/Dockerfile-core
new file mode 100644
index 0000000..da4a850
--- /dev/null
+++ b/Dockerfile-core
@@ -0,0 +1,49 @@
+FROM ubuntu:18.04
+
+ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
+ENV PIP_DEFAULT_TIMEOUT=120
+
+
+RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
+ apt-get update && \
+ apt-get install -y \
+ curl xz-utils \
+ build-essential python3-dev \
+# For get-pip.py:
+ python3-distutils \
+# For add-apt-repository:
+ software-properties-common \
+# XML utils
+ libxml2-utils \
+ xmlstarlet \
+ && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+
+# Set up OCR-D logging
+RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py
+
+
+# Install pip (and setuptools) via get-pip.py instead of Ubuntu's pip to avoid
+# a. having to upgrade from Ubuntu's pip
+# b. the dreaded "old script wrapper" error message
+# The base image (ubuntu:18.04) ships Python 3.6, so fetch the get-pip.py for 3.6.
+RUN curl -sSL --retry 3 https://bootstrap.pypa.io/pip/3.6/get-pip.py -o get-pip.py && \
+ python3 get-pip.py && \
+ rm -f get-pip.py
+
+
+# Install pip installable-stuff
+RUN pip3 install --no-cache-dir \
+ 'ocrd >= 2.13.1'
+
+
+# Check pip dependencies
+RUN pip3 check
+
+
+WORKDIR /data
+
+# Default command (exec form is parsed as JSON, so it must use double quotes)
+CMD ["ocrd"]
diff --git a/Dockerfile-dinglehopper b/Dockerfile-dinglehopper
new file mode 100644
index 0000000..dfa9c1a
--- /dev/null
+++ b/Dockerfile-dinglehopper
@@ -0,0 +1,18 @@
+FROM my_ocrd_workflow-core
+
+
+ENV DINGLEHOPPER_COMMIT 2b98f69
+
+
+# Build pip installable stuff
+RUN pip3 install --no-cache-dir \
+# Now the real stuff:
+ https://github.com/qurator-spk/dinglehopper/archive/$DINGLEHOPPER_COMMIT.tar.gz
+
+
+# Check pip dependencies
+RUN pip3 check
+
+
+# Default command
+CMD ["ocrd-dinglehopper"]
diff --git a/Dockerfile-ocrd_calamari b/Dockerfile-ocrd_calamari
new file mode 100644
index 0000000..209a2d8
--- /dev/null
+++ b/Dockerfile-ocrd_calamari
@@ -0,0 +1,24 @@
+FROM my_ocrd_workflow-core
+
+
+# Build pip installable stuff
+RUN pip3 install --no-cache-dir \
+# Resolve conflicts early:
+ 'tensorflow-gpu == 1.15.*' \
+ 'calamari-ocr == 0.3.5' \
+# Now the real stuff:
+ 'ocrd_calamari >= 0.0.7'
+
+
+# Copy OCR models
+RUN mkdir -p /var/lib/calamari-models
+COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR
+
+
+
+# Check pip dependencies
+RUN pip3 check
+
+
+# Default command
+CMD ["ocrd-calamari-recognize"]
diff --git a/Dockerfile-ocrd_olena b/Dockerfile-ocrd_olena
new file mode 100644
index 0000000..25b447d
--- /dev/null
+++ b/Dockerfile-ocrd_olena
@@ -0,0 +1,32 @@
+FROM my_ocrd_workflow-core
+
+ENV OCRD_OLENA_VERSION 1.2.0
+
+# Build ocrd_olena
+RUN apt-get update && \
+ apt-get install -y \
+ imagemagick \
+ && \
+ apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \
+ dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \
+ rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \
+ apt-get update && \
+ apt-get -f install -y && \
+ apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \
+ mkdir ocrd_olena && \
+ tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \
+ cd ocrd_olena && \
+ sed -i 's/^install: deps$/install:/' Makefile && \
+ pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \
+ make install PREFIX=/usr/local && \
+ cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz
+
+
+# Check pip dependencies
+RUN pip3 check
+
+
+# Default command (exec form is parsed as JSON, so it must use double quotes)
+CMD ["ocrd-olena-binarize"]
diff --git a/Dockerfile-ocrd_tesserocr b/Dockerfile-ocrd_tesserocr
new file mode 100644
index 0000000..92cb263
--- /dev/null
+++ b/Dockerfile-ocrd_tesserocr
@@ -0,0 +1,35 @@
+FROM my_ocrd_workflow-core
+
+
+ENV TESSDATA_BEST_VERSION 4.0.0
+ENV TESSDATA_PREFIX /usr/local/share/tessdata
+
+
+# Install Leptonica and Tesseract.
+RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
+ apt-get update && \
+ apt-get install -y \
+ tesseract-ocr \
+ libtesseract-dev \
+ && \
+ apt-get clean && rm -rf /var/lib/apt/lists/*
+
+
+# Copy OCR models
+RUN mkdir -p $TESSDATA_PREFIX
+ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/
+COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/
+
+
+# Build pip installable stuff
+RUN pip3 install --no-cache-dir \
+# Now the real stuff:
+ 'ocrd_tesserocr >= 0.9.0'
+
+
+# Check pip dependencies
+RUN pip3 check
+
+
+# Default command
+CMD ["ocrd-tesserocr-recognize"]
diff --git a/Dockerfile-sbb_textline_detector b/Dockerfile-sbb_textline_detector
new file mode 100644
index 0000000..ed4e44b
--- /dev/null
+++ b/Dockerfile-sbb_textline_detector
@@ -0,0 +1,22 @@
+FROM my_ocrd_workflow-core
+
+
+ENV SBB_TEXTLINE_DETECTOR_COMMIT 8b01d9e
+
+
+# Build pip installable stuff
+RUN pip3 install --no-cache-dir \
+# Now the real stuff:
+ https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz
+
+
+# Copy OCR models
+COPY data/textline_detection /var/lib/textline_detection
+
+
+# Check pip dependencies
+RUN pip3 check
+
+
+# Default command
+CMD ["ocrd-sbb-textline-detector"]
diff --git a/build b/build
index 227a914..d0ad1fa 100755
--- a/build
+++ b/build
@@ -21,4 +21,10 @@ get_from_web() {
handle_data
-docker build --cache-from my_ocrd_workflow -t my_ocrd_workflow .
+# Build core first (the other images are FROM it); use any existing local image of the same name as layer cache
+docker build --cache-from my_ocrd_workflow-core -t my_ocrd_workflow-core -f Dockerfile-core .
+docker build --cache-from my_ocrd_workflow-ocrd_calamari -t my_ocrd_workflow-ocrd_calamari -f Dockerfile-ocrd_calamari .
+docker build --cache-from my_ocrd_workflow-dinglehopper -t my_ocrd_workflow-dinglehopper -f Dockerfile-dinglehopper .
+docker build --cache-from my_ocrd_workflow-ocrd_olena -t my_ocrd_workflow-ocrd_olena -f Dockerfile-ocrd_olena .
+docker build --cache-from my_ocrd_workflow-ocrd_tesserocr -t my_ocrd_workflow-ocrd_tesserocr -f Dockerfile-ocrd_tesserocr .
+docker build --cache-from my_ocrd_workflow-sbb_textline_detector -t my_ocrd_workflow-sbb_textline_detector -f Dockerfile-sbb_textline_detector .
diff --git a/my_ocrd_workflow b/my_ocrd_workflow
index 30244ca..d3164be 100755
--- a/my_ocrd_workflow
+++ b/my_ocrd_workflow
@@ -82,7 +82,7 @@ main() {
if [ "$LOG_LEVEL" = "DEBUG" -o "$LOG_LEVEL" = "TRACE" ]; then
- pip3 list
+ pip3 list || true
fi
main
diff --git a/ocrd_logging.py b/ocrd_logging.py
deleted file mode 100644
index a380b99..0000000
--- a/ocrd_logging.py
+++ /dev/null
@@ -1 +0,0 @@
-setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index d97256c..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-tensorflow-gpu < 2.0 # Needed for sbb_text_linedetector
-
-ocrd >= 2.13.1
-
-# XXX See https://github.com/OCR-D/ocrd_tesserocr/issues/135
-# ocrd_tesserocr >= 0.8.XXX
-https://github.com/mikegerber/ocrd_tesserocr/archive/fix/set-pcgtsid.tar.gz
-
-ocrd_calamari >= 0.0.7
-
-https://github.com/qurator-spk/sbb_textline_detector/archive/8b01d9e.tar.gz
-
-https://github.com/qurator-spk/dinglehopper/archive/2b98f69.tar.gz
diff --git a/run b/run
index 7fbf16d..de48bc9 100755
--- a/run
+++ b/run
@@ -1,31 +1,42 @@
-#!/bin/sh
-# Run the my_ocrd_workflow container on the current workspace
+#!/bin/bash
set -e # Abort on error
-DOCKER_IMAGE=${DOCKER_IMAGE:-my_ocrd_workflow:latest} # default to locally built
-
-if echo "$DOCKER_IMAGE" | grep -q "/"; then
- docker pull "$DOCKER_IMAGE"
-fi
+self=`realpath "$0"`
+self_dir=`dirname "$self"`
-# XXX Work around podman vs docker uid behaviour
+# Docker run options
+docker_run_options="--rm -t"
+docker_run_options="$docker_run_options --mount type=bind,src=\"$(pwd)\",target=/data"
+# In podman, the container always runs as the real user == uid 0 in container
if docker -v 2>&1 | grep -q podman; then
user="0:0"
else
user="`id -u`:`id -g`"
fi
-
-
-# The container currently needs to run privileged to allow it to read from e.g.
+docker_run_options="$docker_run_options --user $user"
+docker_run_options="$docker_run_options -e LOG_LEVEL=$LOG_LEVEL"
+# The containers currently need to run privileged to allow them to read from e.g.
# /home on SELinux secured systems such as Fedora. We might want to use udica
# instead in the future.
+docker_run_options="$docker_run_options --privileged=true"
+
+
+# Build aliases for the containerized ocrd processors
+build_alias() {
+ local command=$1
+ local docker_image=$2
+
+ alias $command="docker run $docker_run_options $docker_image $command"
+}
+shopt -s expand_aliases # Required for non-interactive shells
+build_alias ocrd my_ocrd_workflow-core
+build_alias ocrd-olena-binarize my_ocrd_workflow-ocrd_olena
+build_alias ocrd-sbb-textline-detector my_ocrd_workflow-sbb_textline_detector
+build_alias ocrd-calamari-recognize my_ocrd_workflow-ocrd_calamari
+build_alias ocrd-tesserocr-recognize my_ocrd_workflow-ocrd_tesserocr
+build_alias ocrd-dinglehopper my_ocrd_workflow-dinglehopper
+
-docker run --privileged=true --rm -t \
- \
- --user $user \
- --mount type=bind,src="$(pwd)",target=/data \
- \
- -e LOG_LEVEL=$LOG_LEVEL \
- $DOCKER_IMAGE "$@"
+. "$self_dir/my_ocrd_workflow"
diff --git a/run-docker-hub b/run-docker-hub
deleted file mode 100755
index 9e4339e..0000000
--- a/run-docker-hub
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-# Run the my_ocrd_workflow container on the current workspace
-
-DOCKER_IMAGE=mikegerber/my_ocrd_workflow:stable `dirname $0`/run "$@"
diff --git a/xsd/pagecontent.2017-07-15.xsd b/xsd/pagecontent.2017-07-15.xsd
deleted file mode 100644
index b4b2266..0000000
--- a/xsd/pagecontent.2017-07-15.xsd
+++ /dev/null
@@ -1,2137 +0,0 @@
-
-
-
-
-
- Page Content - Ground Truth and Storage
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- The timestamp has to be in UTC (Coordinated
- Universal Time) and not local time.
-
-
-
-
-
-
- The timestamp has to be in UTC (Coordinated
- Universal Time) and not local time.
-
-
-
-
-
-
-
-
-
- External reference of any kind
-
-
-
-
-
-
-
- Alternative document page images (e.g.
- black-and-white)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Unassigned regions are considered to be in the
- (virtual) default layer which is to be treated
- as below any other layers.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
-
-
- Page type
-
-
-
-
-
- The primary language used in the page (lower-level definitions override the page-level definition)
-
-
-
-
-
-
- The secondary language used in the page (lower-level definitions override the page-level definition)
-
-
-
-
-
-
- The primary script used in the page (lower-level definitions override the page-level definition)
-
-
-
-
-
-
- The secondary script used in the page (lower-level definitions override the page-level definition)
-
-
-
-
-
-
- The direction in which text in a region should be
- read (within lines) (lower-level definitions override the page-level definition)
-
-
-
-
-
- Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters) (lower-level definitions override the page-level definition)
-
-
-
-
-
-
- Pure text is represented as a text region. This includes
- drop capitals, but practically ornate text may be
- considered as a graphic.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The nature of the text in the region
-
-
-
-
-
-
- The degree of space in points between the lines of
- text (line spacing)
-
-
-
-
-
-
- The direction in which text in a region should be
- read (within lines)
-
-
-
-
-
- Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters)
-
-
-
-
- The angle the baseline of text withing a region has to be rotated (relative to the rectangle encapsulating the region) in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- Defines whether a region of text is indented or not
-
-
-
-
-
- Text align
-
-
-
-
-
- The primary language used in the region
-
-
-
-
-
-
- The secondary language used in the region
-
-
-
-
-
-
- The primary script used in the region
-
-
-
-
-
-
- The secondary script used in the region
-
-
-
-
-
-
-
-
-
-
- Point list with format "x1,y1 x2,y2 ..."
-
-
-
-
-
-
-
-
-
- Multiple connected points that mark the baseline
- of the glyphs
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Overrides primaryLanguage attribute of parent text
- region
-
-
-
-
-
-
- The primary script used in the text line
-
-
-
-
-
-
- The secondary script used in the text line
-
-
-
-
-
-
- The direction in which text in a text line should be read
-
-
-
-
-
-
- Overrides the production attribute of the parent
- text region
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Overrides primaryLanguage attribute of parent line
- and/or text region
-
-
-
-
-
-
- The primary script used in the word
-
-
-
-
-
-
- The secondary script used in the word
-
-
-
-
-
-
- The direction in which characters in a word should be read
-
-
-
-
-
-
- Overrides the production attribute of the parent
- text line and/or text region.
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
-
-
-
- Container for graphemes, grapheme groups and
- non-printing characters
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- The script used for the glyph
-
-
-
-
-
-
- Overrides the production attribute of the parent
- word / text line / text region.
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
-
-
- Text in a "simple" form (ASCII or extended ASCII
- as mostly used for typing). I.e. no use of
- special characters for ligatures (should be
- stored as two separate characters) etc.
-
-
-
-
-
-
- Correct encoding of the original, always using
- the corresponding Unicode code point. I.e.
- ligatures have to be represented as one
- character etc.
-
-
-
-
-
-
- Used for sort order in case multiple TextEquivs are defined. The text content with the lowest index should be interpreted as the main text content.
-
-
-
-
-
-
-
-
-
- OCR confidence value (between 0 and 1)
-
-
-
-
-
-
-
-
-
-
- Type of text content (is it free text or a number, for instance)
-This is only a descriptive attribute, the text type is not checked during XML validation
-
-
-
-
- Refinement for dataType attribute. Can be a regular expression, for instance.
-
-
-
-
-
-
-
-
- An image is considered to be more intricate and complex
- than a graphic. These can be photos or drawings.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The colour bit depth required for the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- A line drawing is a single colour illustration without
- solid areas.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The pen (foreground) colour of the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- Regions containing simple graphics, such as a company
- logo, should be marked as graphic regions.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The type of graphic in the region
-
-
-
-
-
-
- An approximation of the number of colours
- used in the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text.
-
-
-
-
-
-
-
-
-
- Tabular data in any form is represented with a table
- region. Rows and columns may or may not have separator
- lines; these lines are not separator regions.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The number of rows present in the table
-
-
-
-
-
-
- The number of columns present in the table
-
-
-
-
-
-
- The colour of the lines used in the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies the presence of line separators
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- Regions containing charts or graphs of any type, should
- be marked as chart regions.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The type of chart in the region
-
-
-
-
-
-
- An approximation of the number of colours
- used in the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- Separators are lines that lie between columns and
- paragraphs and can be used to logically separate
- different articles from each other.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The colour of the separator
-
-
-
-
-
-
-
-
-
- Regions containing equations and mathematical symbols
- should be marked as maths regions.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
- Regions containing chemical formulas.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a
- region has to be rotated in clockwise
- direction in order to correct the present
- skew (negative values indicate
- anti-clockwise rotation). Range:
- -179.999,180
-
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
-
- Regions containing musical notations.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
- Regions containing advertisements.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
- Noise regions are regions where no real data lies, only
- false data created by artifacts on the document or
- scanner noise.
-
-
-
-
-
-
-
-
-
- To be used if the region type cannot be ascertained.
-
-
-
-
-
-
-
-
-
- Determines the effective area on the paper of a printed page. Its size is equal for all pages of a book (exceptions: titlepage, multipage pictures).
-It contains all living elements (except marginals) like body type, footnotes, headings, running titles.
-It does not contain pagenumber (if not part of running title), marginals, signature mark, preview words.
-
-
-
-
-
-
-
-
-
- Definition of the reading order within the page. To express a reading order between elements they have to be included in an OrderedGroup. Groups may contain further groups.
-
-
-
-
-
-
-
-
-
- Numbered region
-
-
-
- Position (order number) of this item within the current hierarchy level.
-
-
-
-
-
-
-
- Indexed group containing ordered elements
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
-
-
-
-
- Position (order number) of this item within the
- current hierarchy level.
-
-
-
-
-
-
-
-
- Is this group a continuation of another group (from
- previous column or page, for example)?
-
-
-
-
-
-
-
-
-
-
- Indexed group containing unordered elements
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
-
-
-
-
- Position (order number) of this item within the
- current hierarchy level.
-
-
-
-
-
-
-
- Is this group a continuation of another group (from previous column or page, for example)?
-
-
-
-
-
-
-
-
-
-
-
- Numbered group (contains ordered elements)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
-
-
-
-
-
- Is this group a continuation of another group (from previous column or page, for example)?
-
-
-
-
-
-
-
-
- Numbered group (contains unordered elements)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
-
-
-
-
-
- Is this group a continuation of another group (from previous column or page, for example)?
-
-
-
-
-
-
- Border of the actual page (if the scanned image contains parts not belonging to the page).
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- iso15924 2016-07-14
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Can be used to express the z-index of overlapping
- regions. An element with a greater z-index is always in
- front of another element with lower z-index.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Point list with format "x1,y1 x2,y2 ..."
-
-
-
-
-
-
-
-
-
- Container for one-to-one relations between layout
- objects (for example: DropCap - paragraph, caption -
- image)
-
-
-
-
-
-
-
-
-
-
- One-to-one relation between to layout object. Use 'link'
- for loose relations and 'join' for strong relations
- (where something is fragmented for instance).
-
- Examples for 'link': caption - image floating -
- paragraph paragraph - paragraph (when a pragraph is
- split across columns and the last word of the first
- paragraph DOES NOT continue in the second paragraph)
- drop-cap - paragraph (when the drop-cap is a whole word)
-
- Examples for 'join': word - word (separated word at the
- end of a line) drop-cap - paragraph (when the drop-cap
- is not a whole word) paragraph - paragraph (when a
- pragraph is split across columns and the last word of
- the first paragraph DOES continue in the second
- paragraph)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
-
-
-
- Text production type
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Monospace (fixed-pitch, non-proportional) or
- proportional font
-
-
-
-
-
- For instance: Arial, Times New Roman. Add more
- information if necessary (e.g. blackletter,
- antiqua).
-
-
-
-
-
-
- Serif or sans-serif typeface
-
-
-
-
-
-
-
- The size of the characters in points
-
-
-
-
-
- The x-height or corpus size refers to the distance between the baseline and the mean line of lower-case letters in a typeface. The unit is assumed to be pixels.
-
-
-
-
-
- The degree of space (in points) between the
- characters in a string of text
-
-
-
-
-
-
- Text colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value)
-
-
-
-
- Background colour
-
-
-
-
- Background colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value)
-
-
-
-
-
- Specifies whether the colour of the text appears
- reversed against a background colour
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Roles the region takes (e.g. in context of a
- parent region)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
-
-
-
- Is this region a continuation of another region (in previous column or page, for example)?
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Examples: "123.456", "+1234.456", "-1234.456", "-.456", "-456"
-
-
-
- Examples: "123.456", "+1234.456", "-1.2344e56", "-.45E-6", "INF", "-INF", "NaN"
-
-
-
- Examples: "123456", "+00000012", "-1", "-456"
-
-
-
- Examples: "true", "false", "1", "0"
-
-
-
- Examples: "2001-10-26", "2001-10-26+02:00", "2001-10-26Z", "2001-10-26+00:00", "-2001-10-26", "-20000-04-01"
-
-
-
- Examples: "21:32:52", "21:32:52+02:00", "19:32:52Z", "19:32:52+00:00", "21:32:52.12679"
-
-
-
- Examples: "2001-10-26T21:32:52", "2001-10-26T21:32:52+02:00", "2001-10-26T19:32:52Z", "2001-10-26T19:32:52+00:00", "-2001-10-26T21:32:52", "2001-10-26T21:32:52.12679"
-
-
-
- Generic text string
-
-
-
- An XSD type that is not listed or a custom type (use dataTypeDetails attribute)
-
-
-
-
-
-
-
-
-
- Container for graphemes, grapheme groups and
- non-printing characters
-
-
-
-
-
-
-
-
-
-
-
- Base type for graphemes, grapheme groups and non-printing characters
-
-
-
-
-
-
-
-
- Order index of grapheme, group, or non-printing character within the parent container (graphemes or glyph or grapheme group)
-
-
-
-
-
-
-
-
-
-
- Type of character represented by the grapheme/group/non-printing character element
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
- For generic use
-
-
-
-
- Represents a sub-element of a glyph. Smallest graphical unit that can be assigned a Unicode code point
-
-
-
-
-
-
-
-
-
-
-
-
- A glyph component without visual representation but with Unicode code point. Non-visual / non-printing / control character. Part of grapheme container (of glyph) or grapheme sub group.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Container for user-defined attributes
-
-
-
-
-
-
-
-
- Structured custom data defined by name, type and value.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Cell position in table starting with row 0
-
-
-
- Cell position in table starting with column 0
-
-
-
- Number of rows the cell spans (optional; default is 1)
-
-
-
- Number of columns the cell spans (optional; default is 1)
-
-
-
-
-
-
-
- Data for a region that takes on the role of a table cell within a parent table region
-
-
-
-
\ No newline at end of file
diff --git a/xsd/pagecontent.2018-07-15.xsd b/xsd/pagecontent.2018-07-15.xsd
deleted file mode 100644
index c6b7e93..0000000
--- a/xsd/pagecontent.2018-07-15.xsd
+++ /dev/null
@@ -1,2496 +0,0 @@
-
-
-
-
-
- Page Content - Ground Truth and Storage
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- The timestamp has to be in UTC (Coordinated
- Universal Time) and not local time.
-
-
-
-
-
-
- The timestamp has to be in UTC (Coordinated
- Universal Time) and not local time.
-
-
-
-
-
-
-
-
-
-
-
- External reference of any kind
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
- Type of metadata (e.g. author)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- E.g. imagePhotometricInterpretation
-
-
-
-
-
- E.g. RGB
-
-
-
-
-
-
-
-
-
- A semantic label / tag
-
-
-
-
-
-
-
- Reference to external model / ontology / schema
-
-
-
-
-
- E.g. an RDF resource identifier (to be used as subject or object of an RDF triple)
-
-
-
-
- Prefix for all labels (e.g. first part of an URI)
-
-
-
-
-
-
-
-
- Semantic label
-
-
-
-
- The label / tag (e.g. 'person'). Can be an RDF resource identifier (e.g. object of an RDF triple).
-
-
-
-
-
-
- Additional information on the label (e.g. 'YYYY-mm-dd' for a date label). Can be used as predicate of an RDF triple.
-
-
-
-
-
-
-
-
-
-
-
- Alternative document page images (e.g.
- black-and-white)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Unassigned regions are considered to be in the
- (virtual) default layer which is to be treated
- as below any other layers.
-
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Contains the image file name including the file extension.
-
-
-
-
-
- Specifies the width of the image.
-
-
-
-
- Specifies the height of the image.
-
-
-
-
- Specifies the image resolution in width.
-
-
-
-
- Specifies the image resolution in height.
-
-
-
-
-
- Specifies the unit of the resolution information
- referring to a standardised unit of measurement (pixels per inch, pixels per centimeter or other).
-
-
-
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
-
-
- Page type
-
-
-
-
-
- The primary language used in the page (lower-level definitions override the page-level definition)
-
-
-
-
-
-
- The secondary language used in the page (lower-level definitions override the page-level definition)
-
-
-
-
-
-
- The primary script used in the page (lower-level definitions override the page-level definition)
-
-
-
-
-
-
- The secondary script used in the page (lower-level definitions override the page-level definition)
-
-
-
-
-
-
- The direction in which text in a region should be
- read (within lines) (lower-level definitions override the page-level definition)
-
-
-
-
-
- Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters) (lower-level definitions override the page-level definition)
-
-
-
-
- Confidence value for whole page (between 0 and 1)
-
-
-
-
-
-
-
- Pure text is represented as a text region. This includes
- drop capitals, but practically ornate text may be
- considered as a graphic.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The nature of the text in the region
-
-
-
-
-
-
- The degree of space in points between the lines of
- text (line spacing)
-
-
-
-
-
-
- The direction in which text in a region should be
- read (within lines)
-
-
-
-
-
- Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters)
-
-
-
-
- The angle the baseline of text withing a region has to be rotated (relative to the rectangle encapsulating the region) in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- Defines whether a region of text is indented or not
-
-
-
-
-
- Text align
-
-
-
-
-
- The primary language used in the region
-
-
-
-
-
-
- The secondary language used in the region
-
-
-
-
-
-
- The primary script used in the region
-
-
-
-
-
-
- The secondary script used in the region
-
-
-
-
-
-
-
-
-
-
- Point list with format "x1,y1 x2,y2 ..."
-
-
-
-
- Confidence value (between 0 and 1)
-
-
-
-
-
-
-
-
- Alternative text line images (e.g.
- black-and-white)
-
-
-
-
-
-
-
- Multiple connected points that mark the baseline
- of the glyphs
-
-
-
-
-
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
- Overrides primaryLanguage attribute of parent text
- region
-
-
-
-
-
-
- The primary script used in the text line
-
-
-
-
-
-
- The secondary script used in the text line
-
-
-
-
-
-
- The direction in which text in a text line should be read
-
-
-
-
-
-
- Overrides the production attribute of the parent
- text region
-
-
-
-
-
- For generic use
-
-
-
-
-
-
- Position (order number) of this text line within the
- parent text region.
-
-
-
-
-
-
-
-
-
-
- Alternative word images (e.g.
- black-and-white)
-
-
-
-
-
-
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
- Overrides primaryLanguage attribute of parent line
- and/or text region
-
-
-
-
-
-
- The primary script used in the word
-
-
-
-
-
-
- The secondary script used in the word
-
-
-
-
-
-
- The direction in which characters in a word should be read
-
-
-
-
-
-
- Overrides the production attribute of the parent
- text line and/or text region.
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
-
-
- Alternative glyph images (e.g.
- black-and-white)
-
-
-
-
-
-
-
- Container for graphemes, grapheme groups and
- non-printing characters
-
-
-
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
- The script used for the glyph
-
-
-
-
-
-
- Overrides the production attribute of the parent
- word / text line / text region.
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
-
-
- Text in a "simple" form (ASCII or extended ASCII
- as mostly used for typing). I.e. no use of
- special characters for ligatures (should be
- stored as two separate characters) etc.
-
-
-
-
-
-
- Correct encoding of the original, always using
- the corresponding Unicode code point. I.e.
- ligatures have to be represented as one
- character etc.
-
-
-
-
-
-
- Used for sort order in case multiple TextEquivs are defined. The text content with the lowest index should be interpreted as the main text content.
-
-
-
-
-
-
-
-
-
- OCR confidence value (between 0 and 1)
-
-
-
-
- Type of text content (is it free text or a number, for instance)
-This is only a descriptive attribute, the text type is not checked during XML validation
-
-
-
-
- Refinement for dataType attribute. Can be a regular expression, for instance.
-
-
-
-
-
-
-
-
- An image is considered to be more intricate and complex
- than a graphic. These can be photos or drawings.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The colour bit depth required for the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- A line drawing is a single colour illustration without
- solid areas.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The pen (foreground) colour of the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- Regions containing simple graphics, such as a company
- logo, should be marked as graphic regions.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The type of graphic in the region
-
-
-
-
-
-
- An approximation of the number of colours
- used in the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text.
-
-
-
-
-
-
-
-
-
- Tabular data in any form is represented with a table
- region. Rows and columns may or may not have separator
- lines; these lines are not separator regions.
-
-
-
-
-
-
-
- Table grid (visible or virtual grid lines)
-
-
-
-
-
- The angle the rectangle encapsulating a
- region has to be rotated in clockwise
- direction in order to correct the present
- skew (negative values indicate
- anti-clockwise rotation). Range:
- -179.999,180
-
-
-
-
-
-
- The number of rows present in the table
-
-
-
-
-
-
- The number of columns present in the table
-
-
-
-
-
-
- The colour of the lines used in the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies the presence of line separators
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
- Matrix of grid points defining the table grid on the page
-
-
-
-
- One row in the grid point matrix. Points with x,y coordinates. (note: for a table with n table rows there should be n+1 grid rows)
-
-
-
-
-
- Points with x,y coordinates.
-
-
-
-
- The grid row index
-
-
-
-
-
-
-
-
-
- Regions containing charts or graphs of any type, should
- be marked as chart regions.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The type of chart in the region
-
-
-
-
-
-
- An approximation of the number of colours
- used in the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- Separators are lines that lie between columns and
- paragraphs and can be used to logically separate
- different articles from each other.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The colour of the separator
-
-
-
-
-
-
-
-
-
- Regions containing equations and mathematical symbols
- should be marked as maths regions.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
- Regions containing chemical formulas.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a
- region has to be rotated in clockwise
- direction in order to correct the present
- skew (negative values indicate
- anti-clockwise rotation). Range:
- -179.999,180
-
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
-
- Regions containing maps.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a
- region has to be rotated in clockwise
- direction in order to correct the present
- skew (negative values indicate
- anti-clockwise rotation). Range:
- -179.999,180
-
-
-
-
-
-
-
-
-
- Regions containing musical notations.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
- Regions containing advertisements.
-
-
-
-
-
-
- The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
-Range: -179.999,180
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
- Noise regions are regions where no real data lies, only
- false data created by artifacts on the document or
- scanner noise.
-
-
-
-
-
-
-
-
-
- To be used if the region type cannot be ascertained.
-
-
-
-
-
-
-
-
-
- Regions containing content that is not covered by the default types (text, graphic, image, line drawing, chart, table, separator, maths, map, music, chem, advert, noise, unknown)
-
-
-
-
-
-
-
- Information on the type of content represented by this region
-
-
-
-
-
-
-
-
-
- Determines the effective area on the paper of a printed page. Its size is equal for all pages of a book (exceptions: titlepage, multipage pictures).
-It contains all living elements (except marginals) like body type, footnotes, headings, running titles.
-It does not contain pagenumber (if not part of running title), marginals, signature mark, preview words.
-
-
-
-
-
-
-
-
-
- Definition of the reading order within the page. To express a reading order between elements they have to be included in an OrderedGroup. Groups may contain further groups.
-
-
-
-
-
-
-
-
-
- Confidence value (between 0 and 1)
-
-
-
-
-
-
-
- Numbered region
-
-
-
- Position (order number) of this item within the current hierarchy level.
-
-
-
-
-
-
-
- Indexed group containing ordered elements
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
-
-
-
-
- Position (order number) of this item within the
- current hierarchy level.
-
-
-
-
-
-
-
-
- Is this group a continuation of another group (from
- previous column or page, for example)?
-
-
-
-
-
-
-
-
-
-
- Indexed group containing unordered elements
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
-
-
-
-
- Position (order number) of this item within the
- current hierarchy level.
-
-
-
-
-
-
-
- Is this group a continuation of another group (from previous column or page, for example)?
-
-
-
-
-
-
-
-
-
-
-
- Numbered group (contains ordered elements)
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
-
-
-
-
-
- Is this group a continuation of another group (from previous column or page, for example)?
-
-
-
-
-
-
-
-
- Numbered group (contains unordered elements)
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
-
-
-
-
-
- Is this group a continuation of another group (from previous column or page, for example)?
-
-
-
-
-
-
- Border of the actual page (if the scanned image contains parts not belonging to the page).
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- ISO 639.x 2016-07-14
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- iso15924 2016-07-14
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Can be used to express the z-index of overlapping
- regions. An element with a greater z-index is always in
- front of another element with lower z-index.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Confidence value (between 0 and 1)
-
-
-
-
-
-
- Point list with format "x1,y1 x2,y2 ..."
-
-
-
-
-
-
-
-
-
- Container for one-to-one relations between layout
- objects (for example: DropCap - paragraph, caption -
- image)
-
-
-
-
-
-
-
-
-
-
- One-to-one relation between to layout object. Use 'link'
- for loose relations and 'join' for strong relations
- (where something is fragmented for instance).
-
- Examples for 'link': caption - image floating -
- paragraph paragraph - paragraph (when a paragraph is
- split across columns and the last word of the first
- paragraph DOES NOT continue in the second paragraph)
- drop-cap - paragraph (when the drop-cap is a whole word)
-
- Examples for 'join': word - word (separated word at the
- end of a line) drop-cap - paragraph (when the drop-cap
- is not a whole word) paragraph - paragraph (when a
- pragraph is split across columns and the last word of
- the first paragraph DOES continue in the second
- paragraph)
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
-
-
-
- Text production type
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Monospace (fixed-pitch, non-proportional) or
- proportional font
-
-
-
-
-
- For instance: Arial, Times New Roman. Add more
- information if necessary (e.g. blackletter,
- antiqua).
-
-
-
-
-
-
- Serif or sans-serif typeface
-
-
-
-
-
-
-
- The size of the characters in points
-
-
-
-
-
- The x-height or corpus size refers to the distance between the baseline and the mean line of lower-case letters in a typeface. The unit is assumed to be pixels.
-
-
-
-
-
- The degree of space (in points) between the
- characters in a string of text
-
-
-
-
-
-
- Text colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value)
-
-
-
-
- Background colour
-
-
-
-
- Background colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value)
-
-
-
-
-
- Specifies whether the colour of the text appears
- reversed against a background colour
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Alternative region images (e.g.
- black-and-white)
-
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
- Roles the region takes (e.g. in context of a
- parent region)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
-
-
-
- Is this region a continuation of another region (in previous column or page, for example)?
-
-
-
-
-
-
-
-
- Confidence value (between 0 and 1)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Examples: "123.456", "+1234.456", "-1234.456", "-.456", "-456"
-
-
-
- Examples: "123.456", "+1234.456", "-1.2344e56", "-.45E-6", "INF", "-INF", "NaN"
-
-
-
- Examples: "123456", "+00000012", "-1", "-456"
-
-
-
- Examples: "true", "false", "1", "0"
-
-
-
- Examples: "2001-10-26", "2001-10-26+02:00", "2001-10-26Z", "2001-10-26+00:00", "-2001-10-26", "-20000-04-01"
-
-
-
- Examples: "21:32:52", "21:32:52+02:00", "19:32:52Z", "19:32:52+00:00", "21:32:52.12679"
-
-
-
- Examples: "2001-10-26T21:32:52", "2001-10-26T21:32:52+02:00", "2001-10-26T19:32:52Z", "2001-10-26T19:32:52+00:00", "-2001-10-26T21:32:52", "2001-10-26T21:32:52.12679"
-
-
-
- Generic text string
-
-
-
- An XSD type that is not listed or a custom type (use dataTypeDetails attribute)
-
-
-
-
-
-
-
-
-
- Container for graphemes, grapheme groups and
- non-printing characters
-
-
-
-
-
-
-
-
-
-
-
- Base type for graphemes, grapheme groups and non-printing characters
-
-
-
-
-
-
-
-
- Order index of grapheme, group, or non-printing character within the parent container (graphemes or glyph or grapheme group)
-
-
-
-
-
-
-
-
-
-
- Type of character represented by the grapheme/group/non-printing character element
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
- For generic use
-
-
-
-
- Represents a sub-element of a glyph. Smallest graphical unit that can be assigned a Unicode code point
-
-
-
-
-
-
-
-
-
-
-
-
- A glyph component without visual representation but with Unicode code point. Non-visual / non-printing / control character. Part of grapheme container (of glyph) or grapheme sub group.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Container for user-defined attributes
-
-
-
-
-
-
-
-
- Structured custom data defined by name, type and value.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Cell position in table starting with row 0
-
-
-
- Cell position in table starting with column 0
-
-
-
- Number of rows the cell spans (optional; default is 1)
-
-
-
- Number of columns the cell spans (optional; default is 1)
-
-
-
-
- Is the cell a column or row header?
-
-
-
-
-
-
-
-
-
- Data for a region that takes on the role of a table cell within a parent table region
-
-
-
-
diff --git a/xsd/pagecontent.2019-07-15.xsd b/xsd/pagecontent.2019-07-15.xsd
deleted file mode 100644
index 40c6f24..0000000
--- a/xsd/pagecontent.2019-07-15.xsd
+++ /dev/null
@@ -1,2674 +0,0 @@
-
-
-
-
-
-
- Page Content - Ground Truth and Storage
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- The timestamp has to be in UTC (Coordinated
- Universal Time) and not local time.
-
-
-
-
-
-
- The timestamp has to be in UTC
- (Coordinated Universal Time)
- and not local time.
-
-
-
-
-
-
-
-
-
-
-
-
- External reference of any kind
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
- Type of metadata (e.g. author)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- E.g. imagePhotometricInterpretation
-
-
-
-
-
- E.g. RGB
-
-
-
-
-
-
-
-
-
- A semantic label / tag
-
-
-
-
-
-
-
- Reference to external model / ontology / schema
-
-
-
-
-
-
- E.g. an RDF resource identifier
- (to be used as subject or object of an RDF triple)
-
-
-
-
-
-
- Prefix for all labels (e.g. first part of an URI)
-
-
-
-
-
-
-
- Semantic label
-
-
-
-
- The label / tag (e.g. 'person').
- Can be an RDF resource identifier
- (e.g. object of an RDF triple).
-
-
-
-
-
-
- Additional information on the label
- (e.g. 'YYYY-mm-dd' for a date label).
- Can be used as predicate of an RDF triple.
-
-
-
-
-
-
-
-
-
-
-
- Alternative document page images
- (e.g. black-and-white).
-
-
-
-
-
-
-
-
-
- Order of blocks within the page.
-
-
-
-
-
- Unassigned regions are considered to be in the
- (virtual) default layer which is to be treated
- as below any other layers.
-
-
-
-
-
-
-
- Default text style
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Contains the image file name including the file extension.
-
-
-
-
-
- Specifies the width of the image.
-
-
-
-
- Specifies the height of the image.
-
-
-
-
- Specifies the image resolution in width.
-
-
-
-
- Specifies the image resolution in height.
-
-
-
-
-
- Specifies the unit of the resolution information
- referring to a standardised unit of measurement
- (pixels per inch, pixels per centimeter or other).
-
-
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
-
-
-
- The angle the rectangle encapsulating the page
- (or its Border) has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- (The rotated image can be further referenced
- via “AlternativeImage”.)
- Range: -179.999,180
-
-
-
-
-
-
- The type of the page within the document
- (e.g. cover page).
-
-
-
-
-
-
- The primary language used in the page
- (lower-level definitions override the page-level definition).
-
-
-
-
-
-
- The secondary language used in the page
- (lower-level definitions override the page-level definition).
-
-
-
-
-
-
- The primary script used in the page
- (lower-level definitions override the page-level definition).
-
-
-
-
-
-
- The secondary script used in the page
- (lower-level definitions override the page-level definition).
-
-
-
-
-
-
- The direction in which text within lines
- should be read (order of words and characters),
- in addition to “textLineOrder”
- (lower-level definitions override the page-level definition).
-
-
-
-
-
-
- The order of text lines within a block,
- in addition to “readingDirection”
- (lower-level definitions override the page-level definition).
-
-
-
-
-
- Confidence value for whole page (between 0 and 1)
-
-
-
-
-
-
- Pure text is represented as a text region. This includes
- drop capitals, but practically ornate text may be
- considered as a graphic.
-
-
-
-
-
-
-
-
-
-
-
-
- The angle the rectangle encapsulating the region
- has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- (The rotated image can be further referenced
- via “AlternativeImage”.)
- Range: -179.999,180
-
-
-
-
-
-
- The nature of the text in the region
-
-
-
-
-
-
- The degree of space in points between the lines of
- text (line spacing)
-
-
-
-
-
-
- The direction in which text within lines
- should be read (order of words and characters),
- in addition to “textLineOrder”.
-
-
-
-
-
-
- The order of text lines within the block,
- in addition to “readingDirection”.
-
-
-
-
-
-
- The angle the baseline of text within the region
- has to be rotated (relative to the rectangle
- encapsulating the region) in clockwise direction
- in order to correct the present skew,
- in addition to “orientation”
- (negative values indicate anti-clockwise rotation).
- Range: -179.999,180
-
-
-
-
-
-
- Defines whether a region of text is indented or not
-
-
-
-
-
- Text align
-
-
-
-
-
- The primary language used in the region
-
-
-
-
-
-
- The secondary language used in the region
-
-
-
-
-
-
- The primary script used in the region
-
-
-
-
-
-
- The secondary script used in the region
-
-
-
-
-
-
-
-
-
-
- Point list with format "x1,y1 x2,y2 ..."
-
-
-
-
- Confidence value (between 0 and 1)
-
-
-
-
-
-
-
-
- Alternative text line images (e.g.
- black-and-white)
-
-
-
-
-
-
-
- Multiple connected points that mark the baseline
- of the glyphs
-
-
-
-
-
-
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
- Overrides primaryLanguage attribute of parent text
- region
-
-
-
-
-
-
- The primary script used in the text line
-
-
-
-
-
-
- The secondary script used in the text line
-
-
-
-
-
-
- The direction in which text within the line
- should be read (order of words and characters).
-
-
-
-
-
-
- Overrides the production attribute of the parent
- text region
-
-
-
-
-
- For generic use
-
-
-
-
-
-
- Position (order number) of this text line within the
- parent text region.
-
-
-
-
-
-
-
-
-
- Alternative word images (e.g.
- black-and-white)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
- Overrides primaryLanguage attribute of parent line
- and/or text region
-
-
-
-
-
-
- The primary script used in the word
-
-
-
-
-
-
- The secondary script used in the word
-
-
-
-
-
-
- The direction in which text within the word
- should be read (order of characters).
-
-
-
-
-
-
- Overrides the production attribute of the parent
- text line and/or text region.
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
-
-
- Alternative glyph images (e.g.
- black-and-white)
-
-
-
-
-
-
-
- Container for graphemes, grapheme groups and
- non-printing characters
-
-
-
-
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
- The script used for the glyph
-
-
-
-
-
-
- Overrides the production attribute of the parent
- word / text line / text region.
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
-
-
- Text in a "simple" form (ASCII or extended ASCII
- as mostly used for typing). I.e. no use of
- special characters for ligatures (should be
- stored as two separate characters) etc.
-
-
-
-
-
-
- Correct encoding of the original, always using
- the corresponding Unicode code point. I.e.
- ligatures have to be represented as one
- character etc.
-
-
-
-
-
-
-
- Used for sort order in case multiple TextEquivs are defined.
- The text content with the lowest index should be interpreted
- as the main text content.
-
-
-
-
-
-
-
-
-
-
- OCR confidence value (between 0 and 1)
-
-
-
-
-
- Type of text content (is it free text or a number, for instance).
- This is only a descriptive attribute, the text type
- is not checked during XML validation.
-
-
-
-
-
-
- Refinement for dataType attribute. Can be a regular expression, for instance.
-
-
-
-
-
-
-
-
-
- An image is considered to be more intricate and complex
- than a graphic. These can be photos or drawings.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a region
- has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- Range: -179.999,180
-
-
-
-
-
-
- The colour bit depth required for the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- A line drawing is a single colour illustration without
- solid areas.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a region
- has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- Range: -179.999,180
-
-
-
-
-
-
- The pen (foreground) colour of the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- Regions containing simple graphics, such as a company
- logo, should be marked as graphic regions.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a region
- has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- Range: -179.999,180
-
-
-
-
-
-
- The type of graphic in the region
-
-
-
-
-
-
- An approximation of the number of colours
- used in the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text.
-
-
-
-
-
-
-
-
-
- Tabular data in any form is represented with a table
- region. Rows and columns may or may not have separator
- lines; these lines are not separator regions.
-
-
-
-
-
-
-
- Table grid (visible or virtual grid lines)
-
-
-
-
-
-
- The angle the rectangle encapsulating a region
- has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- Range: -179.999,180
-
-
-
-
-
-
- The number of rows present in the table
-
-
-
-
-
-
- The number of columns present in the table
-
-
-
-
-
-
- The colour of the lines used in the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies the presence of line separators
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- Matrix of grid points defining the table grid on the page.
-
-
-
-
-
-
- One row in the grid point matrix.
- Points with x,y coordinates.
- (note: for a table with n table rows there should be n+1 grid rows)
-
-
-
-
-
-
-
- Points with x,y coordinates.
-
-
-
-
- The grid row index
-
-
-
-
-
-
-
-
- Regions containing charts or graphs of any type, should
- be marked as chart regions.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a region
- has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- Range: -179.999,180
-
-
-
-
-
-
- The type of chart in the region
-
-
-
-
-
-
- An approximation of the number of colours
- used in the region
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
- Specifies whether the region also contains
- text
-
-
-
-
-
-
-
-
-
- Separators are lines that lie between columns and
- paragraphs and can be used to logically separate
- different articles from each other.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a region
- has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- Range: -179.999,180
-
-
-
-
-
-
- The colour of the separator
-
-
-
-
-
-
-
-
-
- Regions containing equations and mathematical symbols
- should be marked as maths regions.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a region
- has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- Range: -179.999,180
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
- Regions containing chemical formulas.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a
- region has to be rotated in clockwise
- direction in order to correct the present
- skew (negative values indicate
- anti-clockwise rotation). Range:
- -179.999,180
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
- Regions containing maps.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a
- region has to be rotated in clockwise
- direction in order to correct the present
- skew (negative values indicate
- anti-clockwise rotation). Range:
- -179.999,180
-
-
-
-
-
-
-
-
-
- Regions containing musical notations.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a region
- has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- Range: -179.999,180
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
- Regions containing advertisements.
-
-
-
-
-
-
-
- The angle the rectangle encapsulating a region
- has to be rotated in clockwise direction
- in order to correct the present skew
- (negative values indicate anti-clockwise rotation).
- Range: -179.999,180
-
-
-
-
-
-
- The background colour of the region
-
-
-
-
-
-
-
-
-
- Noise regions are regions where no real data lies, only
- false data created by artifacts on the document or
- scanner noise.
-
-
-
-
-
-
-
-
-
- To be used if the region type cannot be ascertained.
-
-
-
-
-
-
-
-
-
- Regions containing content that is not covered
- by the default types (text, graphic, image,
- line drawing, chart, table, separator, maths,
- map, music, chem, advert, noise, unknown).
-
-
-
-
-
-
-
- Information on the type of content represented by this region
-
-
-
-
-
-
-
-
-
- Determines the effective area on the paper of a printed page.
- Its size is equal for all pages of a book
- (exceptions: title page, multi-page pictures).
- It contains all living elements (except marginals)
- like body type, footnotes, headings, running titles.
- It does not contain the page number (if not part of the running title),
- marginals, signature mark, preview words.
-
-
-
-
-
-
-
-
-
- Definition of the reading order within the page.
- To express a reading order between elements
- they have to be included in an OrderedGroup.
- Groups may contain further groups.
-
-
-
-
-
-
-
-
- Confidence value (between 0 and 1)
-
-
-
-
-
- Numbered region
-
-
-
- Position (order number) of this item within the current hierarchy level.
-
-
-
-
-
-
-
- Indexed group containing ordered elements
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions.
- The parent region doubles as reading order group.
- Only the nested regions should be allowed as group members.
-
-
-
-
-
-
- Position (order number) of this item within the
- current hierarchy level.
-
-
-
-
-
-
-
-
- Is this group a continuation of another group (from
- previous column or page, for example)?
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
- Indexed group containing unordered elements
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions.
- The parent region doubles as reading order group.
- Only the nested regions should be allowed as group members.
-
-
-
-
-
-
- Position (order number) of this item within the
- current hierarchy level.
-
-
-
-
-
-
-
-
- Is this group a continuation of another group
- (from previous column or page, for example)?
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
-
-
-
- Numbered group (contains ordered elements)
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions.
- The parent region doubles as reading order group.
- Only the nested regions should be allowed as group members.
-
-
-
-
-
-
-
-
- Is this group a continuation of another group
- (from previous column or page, for example)?
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
- Numbered group (contains unordered elements)
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
- Optional link to a parent region of nested regions.
- The parent region doubles as reading order group.
- Only the nested regions should be allowed as group members.
-
-
-
-
-
-
-
-
- Is this group a continuation of another group
- (from previous column or page, for example)?
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
- Border of the actual page (if the scanned image
- contains parts not belonging to the page).
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- ISO 639.x 2016-07-14
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- iso15924 2016-07-14
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Can be used to express the z-index of overlapping
- regions. An element with a greater z-index is always in
- front of another element with lower z-index.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Confidence value (between 0 and 1)
-
-
-
-
-
-
- Point list with format "x1,y1 x2,y2 ..."
-
-
-
-
-
-
-
-
-
- Container for one-to-one relations between layout
- objects (for example: DropCap - paragraph, caption -
- image).
-
-
-
-
-
-
-
-
-
-
- One-to-one relation between two layout objects. Use 'link'
- for loose relations and 'join' for strong relations
- (where something is fragmented for instance).
-
- Examples for 'link': caption - image; floating -
- paragraph; paragraph - paragraph (when a paragraph is
- split across columns and the last word of the first
- paragraph DOES NOT continue in the second paragraph);
- drop-cap - paragraph (when the drop-cap is a whole word).
-
- Examples for 'join': word - word (separated word at the
- end of a line); drop-cap - paragraph (when the drop-cap
- is not a whole word); paragraph - paragraph (when a
- paragraph is split across columns and the last word of
- the first paragraph DOES continue in the second
- paragraph).
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
-
-
-
-
-
- Text production type
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Monospace (fixed-pitch, non-proportional) or
- proportional font.
-
-
-
-
-
- For instance: Arial, Times New Roman.
- Add more information if necessary
- (e.g. blackletter, antiqua).
-
-
-
-
-
-
- Serif or sans-serif typeface.
-
-
-
-
-
-
-
- The size of the characters in points.
-
-
-
-
-
-
- The x-height or corpus size refers to the distance
- between the baseline and the mean line of
- lower-case letters in a typeface.
- The unit is assumed to be pixels.
-
-
-
-
-
-
- The degree of space (in points) between
- the characters in a string of text.
-
-
-
-
-
-
-
- Text colour in RGB encoded format
- (red value) + (256 x green value) + (65536 x blue value).
-
-
-
-
-
- Background colour
-
-
-
-
-
- Background colour in RGB encoded format
- (red value) + (256 x green value) + (65536 x blue value).
-
-
-
-
-
-
- Specifies whether the colour of the text appears
- reversed against a background colour.
-
-
-
-
-
-
-
-
- Line style details if "underlined" is TRUE
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Alternative region images
- (e.g. black-and-white).
-
-
-
-
-
-
-
-
- Semantic labels / tags
-
-
-
-
-
- Roles the region takes
- (e.g. in context of a parent region).
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
-
-
-
-
- Is this region a continuation of another region
- (in previous column or page, for example)?
-
-
-
-
-
-
-
-
-
-
- Confidence value (between 0 and 1)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Examples:
- "123.456", "+1234.456",
- "-1234.456", "-.456", "-456"
-
-
-
-
-
-
- Examples:
- "123.456", "+1234.456", "-1.2344e56",
- "-.45E-6", "INF", "-INF", "NaN"
-
-
-
-
-
-
- Examples:
- "123456", "+00000012", "-1", "-456"
-
-
-
-
-
-
- Examples: "true", "false", "1", "0"
-
-
-
-
-
-
- Examples:
- "2001-10-26", "2001-10-26+02:00",
- "2001-10-26Z", "2001-10-26+00:00",
- "-2001-10-26", "-20000-04-01"
-
-
-
-
-
-
- Examples:
- "21:32:52", "21:32:52+02:00", "19:32:52Z",
- "19:32:52+00:00", "21:32:52.12679"
-
-
-
-
-
-
- Examples:
- "2001-10-26T21:32:52", "2001-10-26T21:32:52+02:00",
- "2001-10-26T19:32:52Z", "2001-10-26T19:32:52+00:00",
- "-2001-10-26T21:32:52", "2001-10-26T21:32:52.12679"
-
-
-
-
-
- Generic text string
-
-
-
-
-
- An XSD type that is not listed or a custom type
- (use dataTypeDetails attribute).
-
-
-
-
-
-
-
-
-
-
-
- Container for graphemes, grapheme groups and
- non-printing characters.
-
-
-
-
-
-
-
-
-
-
-
- Base type for graphemes, grapheme groups and non-printing characters.
-
-
-
-
-
-
-
-
-
- Order index of grapheme, group, or non-printing character
- within the parent container (graphemes or glyph or grapheme group).
-
-
-
-
-
-
-
-
-
-
-
-
- Type of character represented by the
- grapheme, group, or non-printing character element.
-
-
-
-
-
-
-
-
-
-
-
- For generic use
-
-
-
-
- For generic use
-
-
-
-
-
-
- Represents a sub-element of a glyph.
- Smallest graphical unit that can be
- assigned a Unicode code point.
-
-
-
-
-
-
-
-
-
-
-
-
-
- A glyph component without visual representation
- but with Unicode code point.
- Non-visual / non-printing / control character.
- Part of grapheme container (of glyph) or grapheme sub group.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Container for user-defined attributes
-
-
-
-
-
-
-
-
- Structured custom data defined by name, type and value.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Cell position in table starting with row 0
-
-
-
-
- Cell position in table starting with column 0
-
-
-
-
- Number of rows the cell spans (optional; default is 1)
-
-
-
-
- Number of columns the cell spans (optional; default is 1)
-
-
-
-
-
- Is the cell a column or row header?
-
-
-
-
-
-
-
-
-
- Data for a region that takes on the role
- of a table cell within a parent table region.
-
-
-
-
-
-
-
-
-
-
-
-
-