diff --git a/my_ocrd_workflow b/my_ocrd_workflow
index 8adb4bb..4b50ff2 100755
--- a/my_ocrd_workflow
+++ b/my_ocrd_workflow
@@ -54,10 +54,20 @@ do_ocr() {
#ocrd workspace validate mets.xml
}
+page_validate_xml() {
+ filegrp=$1
+
+ local file
+ for file in `ocrd workspace find -G $filegrp`; do
+ xmllint --noout --schema `dirname $0`/xsd/pagecontent.2018-07-15.xsd $file
+ done
+}
+
do_fontident
do_linesegmentation
do_ocr
+page_validate_xml OCR-D-OCR-TESS # Validate the OCR-D-OCR-TESS file group against the PAGE schema; this also makes sure PAGE Viewer can open it
# XXX Multiple calls create multiple identical mets:agent elements
diff --git a/xsd/pagecontent.2017-07-15.xsd b/xsd/pagecontent.2017-07-15.xsd
new file mode 100644
index 0000000..b4b2266
--- /dev/null
+++ b/xsd/pagecontent.2017-07-15.xsd
@@ -0,0 +1,2137 @@
+
+
+
+
+
+ Page Content - Ground Truth and Storage
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The timestamp has to be in UTC (Coordinated
+ Universal Time) and not local time.
+
+
+
+
+
+
+ The timestamp has to be in UTC (Coordinated
+ Universal Time) and not local time.
+
+
+
+
+
+
+
+
+
+ External reference of any kind
+
+
+
+
+
+
+
+ Alternative document page images (e.g.
+ black-and-white)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unassigned regions are considered to be in the
+ (virtual) default layer which is to be treated
+ as below any other layers.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For generic use
+
+
+
+
+ Page type
+
+
+
+
+
+ The primary language used in the page (lower-level definitions override the page-level definition)
+
+
+
+
+
+
+ The secondary language used in the page (lower-level definitions override the page-level definition)
+
+
+
+
+
+
+ The primary script used in the page (lower-level definitions override the page-level definition)
+
+
+
+
+
+
+ The secondary script used in the page (lower-level definitions override the page-level definition)
+
+
+
+
+
+
+ The direction in which text in a region should be
+ read (within lines) (lower-level definitions override the page-level definition)
+
+
+
+
+
+ Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters) (lower-level definitions override the page-level definition)
+
+
+
+
+
+
+ Pure text is represented as a text region. This includes
+ drop capitals, but practically ornate text may be
+ considered as a graphic.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The nature of the text in the region
+
+
+
+
+
+
+ The degree of space in points between the lines of
+ text (line spacing)
+
+
+
+
+
+
+ The direction in which text in a region should be
+ read (within lines)
+
+
+
+
+
+ Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters)
+
+
+
+
+ The angle the baseline of text within a region has to be rotated (relative to the rectangle encapsulating the region) in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ Defines whether a region of text is indented or not
+
+
+
+
+
+ Text align
+
+
+
+
+
+ The primary language used in the region
+
+
+
+
+
+
+ The secondary language used in the region
+
+
+
+
+
+
+ The primary script used in the region
+
+
+
+
+
+
+ The secondary script used in the region
+
+
+
+
+
+
+
+
+
+
+ Point list with format "x1,y1 x2,y2 ..."
+
+
+
+
+
+
+
+
+
+ Multiple connected points that mark the baseline
+ of the glyphs
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Overrides primaryLanguage attribute of parent text
+ region
+
+
+
+
+
+
+ The primary script used in the text line
+
+
+
+
+
+
+ The secondary script used in the text line
+
+
+
+
+
+
+ The direction in which text in a text line should be read
+
+
+
+
+
+
+ Overrides the production attribute of the parent
+ text region
+
+
+
+
+
+ For generic use
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Overrides primaryLanguage attribute of parent line
+ and/or text region
+
+
+
+
+
+
+ The primary script used in the word
+
+
+
+
+
+
+ The secondary script used in the word
+
+
+
+
+
+
+ The direction in which characters in a word should be read
+
+
+
+
+
+
+ Overrides the production attribute of the parent
+ text line and/or text region.
+
+
+
+
+
+ For generic use
+
+
+
+
+
+
+
+
+
+
+ Container for graphemes, grapheme groups and
+ non-printing characters
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The script used for the glyph
+
+
+
+
+
+
+ Overrides the production attribute of the parent
+ word / text line / text region.
+
+
+
+
+
+ For generic use
+
+
+
+
+
+
+
+
+
+ Text in a "simple" form (ASCII or extended ASCII
+ as mostly used for typing). I.e. no use of
+ special characters for ligatures (should be
+ stored as two separate characters) etc.
+
+
+
+
+
+
+ Correct encoding of the original, always using
+ the corresponding Unicode code point. I.e.
+ ligatures have to be represented as one
+ character etc.
+
+
+
+
+
+
+ Used for sort order in case multiple TextEquivs are defined. The text content with the lowest index should be interpreted as the main text content.
+
+
+
+
+
+
+
+
+
+ OCR confidence value (between 0 and 1)
+
+
+
+
+
+
+
+
+
+
+ Type of text content (is it free text or a number, for instance)
+This is only a descriptive attribute, the text type is not checked during XML validation
+
+
+
+
+ Refinement for dataType attribute. Can be a regular expression, for instance.
+
+
+
+
+
+
+
+
+ An image is considered to be more intricate and complex
+ than a graphic. These can be photos or drawings.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The colour bit depth required for the region
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+ Specifies whether the region also contains
+ text
+
+
+
+
+
+
+
+
+
+ A line drawing is a single colour illustration without
+ solid areas.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The pen (foreground) colour of the region
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+ Specifies whether the region also contains
+ text
+
+
+
+
+
+
+
+
+
+ Regions containing simple graphics, such as a company
+ logo, should be marked as graphic regions.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The type of graphic in the region
+
+
+
+
+
+
+ An approximation of the number of colours
+ used in the region
+
+
+
+
+
+
+ Specifies whether the region also contains
+ text.
+
+
+
+
+
+
+
+
+
+ Tabular data in any form is represented with a table
+ region. Rows and columns may or may not have separator
+ lines; these lines are not separator regions.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The number of rows present in the table
+
+
+
+
+
+
+ The number of columns present in the table
+
+
+
+
+
+
+ The colour of the lines used in the region
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+ Specifies the presence of line separators
+
+
+
+
+
+
+ Specifies whether the region also contains
+ text
+
+
+
+
+
+
+
+
+
+ Regions containing charts or graphs of any type, should
+ be marked as chart regions.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The type of chart in the region
+
+
+
+
+
+
+ An approximation of the number of colours
+ used in the region
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+ Specifies whether the region also contains
+ text
+
+
+
+
+
+
+
+
+
+ Separators are lines that lie between columns and
+ paragraphs and can be used to logically separate
+ different articles from each other.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The colour of the separator
+
+
+
+
+
+
+
+
+
+ Regions containing equations and mathematical symbols
+ should be marked as maths regions.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+
+
+
+ Regions containing chemical formulas.
+
+
+
+
+
+
+
+ The angle the rectangle encapsulating a
+ region has to be rotated in clockwise
+ direction in order to correct the present
+ skew (negative values indicate
+ anti-clockwise rotation). Range:
+ -179.999,180
+
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+
+
+
+
+ Regions containing musical notations.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+
+
+
+ Regions containing advertisements.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+
+
+
+ Noise regions are regions where no real data lies, only
+ false data created by artifacts on the document or
+ scanner noise.
+
+
+
+
+
+
+
+
+
+ To be used if the region type cannot be ascertained.
+
+
+
+
+
+
+
+
+
+ Determines the effective area on the paper of a printed page. Its size is equal for all pages of a book (exceptions: titlepage, multipage pictures).
+It contains all living elements (except marginals) like body type, footnotes, headings, running titles.
+It does not contain pagenumber (if not part of running title), marginals, signature mark, preview words.
+
+
+
+
+
+
+
+
+
+ Definition of the reading order within the page. To express a reading order between elements they have to be included in an OrderedGroup. Groups may contain further groups.
+
+
+
+
+
+
+
+
+
+ Numbered region
+
+
+
+ Position (order number) of this item within the current hierarchy level.
+
+
+
+
+
+
+
+ Indexed group containing ordered elements
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
+
+
+
+
+ Position (order number) of this item within the
+ current hierarchy level.
+
+
+
+
+
+
+
+
+ Is this group a continuation of another group (from
+ previous column or page, for example)?
+
+
+
+
+
+
+
+
+
+
+ Indexed group containing unordered elements
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
+
+
+
+
+ Position (order number) of this item within the
+ current hierarchy level.
+
+
+
+
+
+
+
+ Is this group a continuation of another group (from previous column or page, for example)?
+
+
+
+
+
+
+
+
+
+
+
+ Numbered group (contains ordered elements)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
+
+
+
+
+
+ Is this group a continuation of another group (from previous column or page, for example)?
+
+
+
+
+
+
+
+
+ Numbered group (contains unordered elements)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
+
+
+
+
+
+ Is this group a continuation of another group (from previous column or page, for example)?
+
+
+
+
+
+
+ Border of the actual page (if the scanned image contains parts not belonging to the page).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ iso15924 2016-07-14
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Can be used to express the z-index of overlapping
+ regions. An element with a greater z-index is always in
+ front of another element with lower z-index.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Point list with format "x1,y1 x2,y2 ..."
+
+
+
+
+
+
+
+
+
+ Container for one-to-one relations between layout
+ objects (for example: DropCap - paragraph, caption -
+ image)
+
+
+
+
+
+
+
+
+
+
+ One-to-one relation between two layout objects. Use 'link'
+ for loose relations and 'join' for strong relations
+ (where something is fragmented, for instance).
+
+ Examples for 'link': caption - image; floating -
+ paragraph; paragraph - paragraph (when a paragraph is
+ split across columns and the last word of the first
+ paragraph DOES NOT continue in the second paragraph);
+ drop-cap - paragraph (when the drop-cap is a whole word).
+
+ Examples for 'join': word - word (separated word at the
+ end of a line); drop-cap - paragraph (when the drop-cap
+ is not a whole word); paragraph - paragraph (when a
+ paragraph is split across columns and the last word of
+ the first paragraph DOES continue in the second
+ paragraph).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For generic use
+
+
+
+
+
+ Text production type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Monospace (fixed-pitch, non-proportional) or
+ proportional font
+
+
+
+
+
+ For instance: Arial, Times New Roman. Add more
+ information if necessary (e.g. blackletter,
+ antiqua).
+
+
+
+
+
+
+ Serif or sans-serif typeface
+
+
+
+
+
+
+
+ The size of the characters in points
+
+
+
+
+
+ The x-height or corpus size refers to the distance between the baseline and the mean line of lower-case letters in a typeface. The unit is assumed to be pixels.
+
+
+
+
+
+ The degree of space (in points) between the
+ characters in a string of text
+
+
+
+
+
+
+ Text colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value)
+
+
+
+
+ Background colour
+
+
+
+
+ Background colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value)
+
+
+
+
+
+ Specifies whether the colour of the text appears
+ reversed against a background colour
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Roles the region takes (e.g. in context of a
+ parent region)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For generic use
+
+
+
+
+
+ Is this region a continuation of another region (in previous column or page, for example)?
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Examples: "123.456", "+1234.456", "-1234.456", "-.456", "-456"
+
+
+
+ Examples: "123.456", "+1234.456", "-1.2344e56", "-.45E-6", "INF", "-INF", "NaN"
+
+
+
+ Examples: "123456", "+00000012", "-1", "-456"
+
+
+
+ Examples: "true", "false", "1", "0"
+
+
+
+ Examples: "2001-10-26", "2001-10-26+02:00", "2001-10-26Z", "2001-10-26+00:00", "-2001-10-26", "-20000-04-01"
+
+
+
+ Examples: "21:32:52", "21:32:52+02:00", "19:32:52Z", "19:32:52+00:00", "21:32:52.12679"
+
+
+
+ Examples: "2001-10-26T21:32:52", "2001-10-26T21:32:52+02:00", "2001-10-26T19:32:52Z", "2001-10-26T19:32:52+00:00", "-2001-10-26T21:32:52", "2001-10-26T21:32:52.12679"
+
+
+
+ Generic text string
+
+
+
+ An XSD type that is not listed or a custom type (use dataTypeDetails attribute)
+
+
+
+
+
+
+
+
+
+ Container for graphemes, grapheme groups and
+ non-printing characters
+
+
+
+
+
+
+
+
+
+
+
+ Base type for graphemes, grapheme groups and non-printing characters
+
+
+
+
+
+
+
+
+ Order index of grapheme, group, or non-printing character within the parent container (graphemes or glyph or grapheme group)
+
+
+
+
+
+
+
+
+
+
+ Type of character represented by the grapheme/group/non-printing character element
+
+
+
+
+
+
+
+
+
+
+ For generic use
+
+
+ For generic use
+
+
+
+
+ Represents a sub-element of a glyph. Smallest graphical unit that can be assigned a Unicode code point
+
+
+
+
+
+
+
+
+
+
+
+
+ A glyph component without visual representation but with Unicode code point. Non-visual / non-printing / control character. Part of grapheme container (of glyph) or grapheme sub group.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Container for user-defined attributes
+
+
+
+
+
+
+
+
+ Structured custom data defined by name, type and value.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Cell position in table starting with row 0
+
+
+
+ Cell position in table starting with column 0
+
+
+
+ Number of rows the cell spans (optional; default is 1)
+
+
+
+ Number of columns the cell spans (optional; default is 1)
+
+
+
+
+
+
+
+ Data for a region that takes on the role of a table cell within a parent table region
+
+
+
+
\ No newline at end of file
diff --git a/xsd/pagecontent.2018-07-15.xsd b/xsd/pagecontent.2018-07-15.xsd
new file mode 100644
index 0000000..c6b7e93
--- /dev/null
+++ b/xsd/pagecontent.2018-07-15.xsd
@@ -0,0 +1,2496 @@
+
+
+
+
+
+ Page Content - Ground Truth and Storage
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The timestamp has to be in UTC (Coordinated
+ Universal Time) and not local time.
+
+
+
+
+
+
+ The timestamp has to be in UTC (Coordinated
+ Universal Time) and not local time.
+
+
+
+
+
+
+
+
+
+
+
+ External reference of any kind
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+ Type of metadata (e.g. author)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ E.g. imagePhotometricInterpretation
+
+
+
+
+
+ E.g. RGB
+
+
+
+
+
+
+
+
+
+ A semantic label / tag
+
+
+
+
+
+
+
+ Reference to external model / ontology / schema
+
+
+
+
+
+ E.g. an RDF resource identifier (to be used as subject or object of an RDF triple)
+
+
+
+
+ Prefix for all labels (e.g. first part of an URI)
+
+
+
+
+
+
+
+
+ Semantic label
+
+
+
+
+ The label / tag (e.g. 'person'). Can be an RDF resource identifier (e.g. object of an RDF triple).
+
+
+
+
+
+
+ Additional information on the label (e.g. 'YYYY-mm-dd' for a date label). Can be used as predicate of an RDF triple.
+
+
+
+
+
+
+
+
+
+
+
+ Alternative document page images (e.g.
+ black-and-white)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unassigned regions are considered to be in the
+ (virtual) default layer which is to be treated
+ as below any other layers.
+
+
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Contains the image file name including the file extension.
+
+
+
+
+
+ Specifies the width of the image.
+
+
+
+
+ Specifies the height of the image.
+
+
+
+
+ Specifies the image resolution in width.
+
+
+
+
+ Specifies the image resolution in height.
+
+
+
+
+
+ Specifies the unit of the resolution information
+ referring to a standardised unit of measurement (pixels per inch, pixels per centimeter or other).
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For generic use
+
+
+
+
+ Page type
+
+
+
+
+
+ The primary language used in the page (lower-level definitions override the page-level definition)
+
+
+
+
+
+
+ The secondary language used in the page (lower-level definitions override the page-level definition)
+
+
+
+
+
+
+ The primary script used in the page (lower-level definitions override the page-level definition)
+
+
+
+
+
+
+ The secondary script used in the page (lower-level definitions override the page-level definition)
+
+
+
+
+
+
+ The direction in which text in a region should be
+ read (within lines) (lower-level definitions override the page-level definition)
+
+
+
+
+
+ Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters) (lower-level definitions override the page-level definition)
+
+
+
+
+ Confidence value for whole page (between 0 and 1)
+
+
+
+
+
+
+
+ Pure text is represented as a text region. This includes
+ drop capitals, but practically ornate text may be
+ considered as a graphic.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The nature of the text in the region
+
+
+
+
+
+
+ The degree of space in points between the lines of
+ text (line spacing)
+
+
+
+
+
+
+ The direction in which text in a region should be
+ read (within lines)
+
+
+
+
+
+ Inner-block order of text lines (in addition to “readingDirection” which is the inner-text line order of words and characters)
+
+
+
+
+ The angle the baseline of text within a region has to be rotated (relative to the rectangle encapsulating the region) in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ Defines whether a region of text is indented or not
+
+
+
+
+
+ Text align
+
+
+
+
+
+ The primary language used in the region
+
+
+
+
+
+
+ The secondary language used in the region
+
+
+
+
+
+
+ The primary script used in the region
+
+
+
+
+
+
+ The secondary script used in the region
+
+
+
+
+
+
+
+
+
+
+ Point list with format "x1,y1 x2,y2 ..."
+
+
+
+
+ Confidence value (between 0 and 1)
+
+
+
+
+
+
+
+
+ Alternative text line images (e.g.
+ black-and-white)
+
+
+
+
+
+
+
+ Multiple connected points that mark the baseline
+ of the glyphs
+
+
+
+
+
+
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+
+
+ Overrides primaryLanguage attribute of parent text
+ region
+
+
+
+
+
+
+ The primary script used in the text line
+
+
+
+
+
+
+ The secondary script used in the text line
+
+
+
+
+
+
+ The direction in which text in a text line should be read
+
+
+
+
+
+
+ Overrides the production attribute of the parent
+ text region
+
+
+
+
+
+ For generic use
+
+
+
+
+
+
+ Position (order number) of this text line within the
+ parent text region.
+
+
+
+
+
+
+
+
+
+
+ Alternative word images (e.g.
+ black-and-white)
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+
+
+ Overrides primaryLanguage attribute of parent line
+ and/or text region
+
+
+
+
+
+
+ The primary script used in the word
+
+
+
+
+
+
+ The secondary script used in the word
+
+
+
+
+
+
+ The direction in which characters in a word should be read
+
+
+
+
+
+
+ Overrides the production attribute of the parent
+ text line and/or text region.
+
+
+
+
+
+ For generic use
+
+
+
+
+
+
+
+
+
+ Alternative glyph images (e.g.
+ black-and-white)
+
+
+
+
+
+
+
+ Container for graphemes, grapheme groups and
+ non-printing characters
+
+
+
+
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+
+
+
+
+
+
+ The script used for the glyph
+
+
+
+
+
+
+ Overrides the production attribute of the parent
+ word / text line / text region.
+
+
+
+
+
+ For generic use
+
+
+
+
+
+
+
+
+
+ Text in a "simple" form (ASCII or extended ASCII
+ as mostly used for typing). I.e. no use of
+ special characters for ligatures (should be
+ stored as two separate characters) etc.
+
+
+
+
+
+
+ Correct encoding of the original, always using
+ the corresponding Unicode code point. I.e.
+ ligatures have to be represented as one
+ character etc.
+
+
+
+
+
+
+ Used for sort order in case multiple TextEquivs are defined. The text content with the lowest index should be interpreted as the main text content.
+
+
+
+
+
+
+
+
+
+ OCR confidence value (between 0 and 1)
+
+
+
+
+ Type of text content (is it free text or a number, for instance)
+This is only a descriptive attribute, the text type is not checked during XML validation
+
+
+
+
+ Refinement for dataType attribute. Can be a regular expression, for instance.
+
+
+
+
+
+
+
+
+ An image is considered to be more intricate and complex
+ than a graphic. These can be photos or drawings.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The colour bit depth required for the region
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+ Specifies whether the region also contains
+ text
+
+
+
+
+
+
+
+
+
+ A line drawing is a single colour illustration without
+ solid areas.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The pen (foreground) colour of the region
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+ Specifies whether the region also contains
+ text
+
+
+
+
+
+
+
+
+
+ Regions containing simple graphics, such as a company
+ logo, should be marked as graphic regions.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The type of graphic in the region
+
+
+
+
+
+
+ An approximation of the number of colours
+ used in the region
+
+
+
+
+
+
+ Specifies whether the region also contains
+ text.
+
+
+
+
+
+
+
+
+
+ Tabular data in any form is represented with a table
+ region. Rows and columns may or may not have separator
+ lines; these lines are not separator regions.
+
+
+
+
+
+
+
+ Table grid (visible or virtual grid lines)
+
+
+
+
+
+ The angle the rectangle encapsulating a
+ region has to be rotated in clockwise
+ direction in order to correct the present
+ skew (negative values indicate
+ anti-clockwise rotation). Range:
+ -179.999,180
+
+
+
+
+
+
+ The number of rows present in the table
+
+
+
+
+
+
+ The number of columns present in the table
+
+
+
+
+
+
+ The colour of the lines used in the region
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+ Specifies the presence of line separators
+
+
+
+
+
+
+ Specifies whether the region also contains
+ text
+
+
+
+
+
+
+
+
+ Matrix of grid points defining the table grid on the page
+
+
+
+
+ One row in the grid point matrix. Points with x,y coordinates. (note: for a table with n table rows there should be n+1 grid rows)
+
+
+
+
+
+ Points with x,y coordinates.
+
+
+
+
+ The grid row index
+
+
+
+
+
+
+
+
+
+ Regions containing charts or graphs of any type, should
+ be marked as chart regions.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The type of chart in the region
+
+
+
+
+
+
+ An approximation of the number of colours
+ used in the region
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+ Specifies whether the region also contains
+ text
+
+
+
+
+
+
+
+
+
+ Separators are lines that lie between columns and
+ paragraphs and can be used to logically separate
+ different articles from each other.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The colour of the separator
+
+
+
+
+
+
+
+
+
+ Regions containing equations and mathematical symbols
+ should be marked as maths regions.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+
+
+
+ Regions containing chemical formulas.
+
+
+
+
+
+
+
+ The angle the rectangle encapsulating a
+ region has to be rotated in clockwise
+ direction in order to correct the present
+ skew (negative values indicate
+ anti-clockwise rotation). Range:
+ -179.999,180
+
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+
+
+
+
+ Regions containing maps.
+
+
+
+
+
+
+
+ The angle the rectangle encapsulating a
+ region has to be rotated in clockwise
+ direction in order to correct the present
+ skew (negative values indicate
+ anti-clockwise rotation). Range:
+ -179.999,180
+
+
+
+
+
+
+
+
+
+ Regions containing musical notations.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+
+
+
+ Regions containing advertisements.
+
+
+
+
+
+
+ The angle the rectangle encapsulating a region has to be rotated in clockwise direction in order to correct the present skew (negative values indicate anti-clockwise rotation).
+Range: -179.999,180
+
+
+
+
+
+
+ The background colour of the region
+
+
+
+
+
+
+
+
+
+ Noise regions are regions where no real data lies, only
+ false data created by artifacts on the document or
+ scanner noise.
+
+
+
+
+
+
+
+
+
+ To be used if the region type cannot be ascertained.
+
+
+
+
+
+
+
+
+
+ Regions containing content that is not covered by the default types (text, graphic, image, line drawing, chart, table, separator, maths, map, music, chem, advert, noise, unknown)
+
+
+
+
+
+
+
+ Information on the type of content represented by this region
+
+
+
+
+
+
+
+
+
+ Determines the effective area on the paper of a printed page. Its size is equal for all pages of a book (exceptions: titlepage, multipage pictures).
+It contains all living elements (except marginals) like body type, footnotes, headings, running titles.
+It does not contain pagenumber (if not part of running title), marginals, signature mark, preview words.
+
+
+
+
+
+
+
+
+
+ Definition of the reading order within the page. To express a reading order between elements they have to be included in an OrderedGroup. Groups may contain further groups.
+
+
+
+
+
+
+
+
+
+ Confidence value (between 0 and 1)
+
+
+
+
+
+
+
+ Numbered region
+
+
+
+ Position (order number) of this item within the current hierarchy level.
+
+
+
+
+
+
+
+ Indexed group containing ordered elements
+
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
+
+
+
+
+ Position (order number) of this item within the
+ current hierarchy level.
+
+
+
+
+
+
+
+
+ Is this group a continuation of another group (from
+ previous column or page, for example)?
+
+
+
+
+
+
+
+
+
+
+ Indexed group containing unordered elements
+
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+
+
+
+
+
+
+
+ Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
+
+
+
+
+ Position (order number) of this item within the
+ current hierarchy level.
+
+
+
+
+
+
+
+ Is this group a continuation of another group (from previous column or page, for example)?
+
+
+
+
+
+
+
+
+
+
+
+ Numbered group (contains ordered elements)
+
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
+
+
+
+
+
+ Is this group a continuation of another group (from previous column or page, for example)?
+
+
+
+
+
+
+
+
+ Numbered group (contains unordered elements)
+
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+
+
+
+
+
+
+
+ Optional link to a parent region of nested regions. The parent region doubles as reading order group. Only the nested regions should be allowed as group members.
+
+
+
+
+
+ Is this group a continuation of another group (from previous column or page, for example)?
+
+
+
+
+
+
+ Border of the actual page (if the scanned image contains parts not belonging to the page).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ISO 639.x 2016-07-14
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ iso15924 2016-07-14
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Can be used to express the z-index of overlapping
+ regions. An element with a greater z-index is always in
+ front of another element with lower z-index.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Confidence value (between 0 and 1)
+
+
+
+
+
+
+ Point list with format "x1,y1 x2,y2 ..."
+
+
+
+
+
+
+
+
+
+ Container for one-to-one relations between layout
+ objects (for example: DropCap - paragraph, caption -
+ image)
+
+
+
+
+
+
+
+
+
+
+ One-to-one relation between two layout objects. Use 'link'
+ for loose relations and 'join' for strong relations
+ (where something is fragmented for instance).
+
+ Examples for 'link': caption - image; floating -
+ paragraph; paragraph - paragraph (when a paragraph is
+ split across columns and the last word of the first
+ paragraph DOES NOT continue in the second paragraph);
+ drop-cap - paragraph (when the drop-cap is a whole word)
+
+ Examples for 'join': word - word (separated word at the
+ end of a line); drop-cap - paragraph (when the drop-cap
+ is not a whole word); paragraph - paragraph (when a
+ paragraph is split across columns and the last word of
+ the first paragraph DOES continue in the second
+ paragraph)
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For generic use
+
+
+
+
+
+ Text production type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Monospace (fixed-pitch, non-proportional) or
+ proportional font
+
+
+
+
+
+ For instance: Arial, Times New Roman. Add more
+ information if necessary (e.g. blackletter,
+ antiqua).
+
+
+
+
+
+
+ Serif or sans-serif typeface
+
+
+
+
+
+
+
+ The size of the characters in points
+
+
+
+
+
+ The x-height or corpus size refers to the distance between the baseline and the mean line of lower-case letters in a typeface. The unit is assumed to be pixels.
+
+
+
+
+
+ The degree of space (in points) between the
+ characters in a string of text
+
+
+
+
+
+
+ Text colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value)
+
+
+
+
+ Background colour
+
+
+
+
+ Background colour in RGB encoded format (red value) + (256 x green value) + (65536 x blue value)
+
+
+
+
+
+ Specifies whether the colour of the text appears
+ reversed against a background colour
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Alternative region images (e.g.
+ black-and-white)
+
+
+
+
+
+
+
+
+ Semantic labels / tags
+
+
+
+
+
+ Roles the region takes (e.g. in context of a
+ parent region)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For generic use
+
+
+
+
+
+ Is this region a continuation of another region (in previous column or page, for example)?
+
+
+
+
+
+
+
+
+ Confidence value (between 0 and 1)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Examples: "123.456", "+1234.456", "-1234.456", "-.456", "-456"
+
+
+
+ Examples: "123.456", "+1234.456", "-1.2344e56", "-.45E-6", "INF", "-INF", "NaN"
+
+
+
+ Examples: "123456", "+00000012", "-1", "-456"
+
+
+
+ Examples: "true", "false", "1", "0"
+
+
+
+ Examples: "2001-10-26", "2001-10-26+02:00", "2001-10-26Z", "2001-10-26+00:00", "-2001-10-26", "-20000-04-01"
+
+
+
+ Examples: "21:32:52", "21:32:52+02:00", "19:32:52Z", "19:32:52+00:00", "21:32:52.12679"
+
+
+
+ Examples: "2001-10-26T21:32:52", "2001-10-26T21:32:52+02:00", "2001-10-26T19:32:52Z", "2001-10-26T19:32:52+00:00", "-2001-10-26T21:32:52", "2001-10-26T21:32:52.12679"
+
+
+
+ Generic text string
+
+
+
+ An XSD type that is not listed or a custom type (use dataTypeDetails attribute)
+
+
+
+
+
+
+
+
+
+ Container for graphemes, grapheme groups and
+ non-printing characters
+
+
+
+
+
+
+
+
+
+
+
+ Base type for graphemes, grapheme groups and non-printing characters
+
+
+
+
+
+
+
+
+ Order index of grapheme, group, or non-printing character within the parent container (graphemes or glyph or grapheme group)
+
+
+
+
+
+
+
+
+
+
+ Type of character represented by the grapheme/group/non-printing character element
+
+
+
+
+
+
+
+
+
+
+ For generic use
+
+
+ For generic use
+
+
+
+
+ Represents a sub-element of a glyph. Smallest graphical unit that can be assigned a Unicode code point
+
+
+
+
+
+
+
+
+
+
+
+
+ A glyph component without visual representation but with Unicode code point. Non-visual / non-printing / control character. Part of grapheme container (of glyph) or grapheme sub group.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Container for user-defined attributes
+
+
+
+
+
+
+
+
+ Structured custom data defined by name, type and value.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Cell position in table starting with row 0
+
+
+
+ Cell position in table starting with column 0
+
+
+
+ Number of rows the cell spans (optional; default is 1)
+
+
+
+ Number of columns the cell spans (optional; default is 1)
+
+
+
+
+ Is the cell a column or row header?
+
+
+
+
+
+
+
+
+
+ Data for a region that takes on the role of a table cell within a parent table region
+
+
+
+