Merge branch 'feat/text-extraction-levels'

pull/38/head
Gerber, Mike 4 years ago
commit f14ae46870

@ -43,8 +43,12 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
$REPORT_PREFIX defaults to "report". The reports include the character $REPORT_PREFIX defaults to "report". The reports include the character
error rate (CER) and the word error rate (WER). error rate (CER) and the word error rate (WER).
By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags.
Options: Options:
--metrics / --no-metrics Enable/disable metrics and green/red --metrics / --no-metrics Enable/disable metrics and green/red
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
--progress Show progress bar --progress Show progress bar
--help Show this message and exit. --help Show this message and exit.
~~~ ~~~
@ -56,6 +60,15 @@ dinglehopper some-document.gt.page.xml some-document.ocr.alto.xml
This generates `report.html` and `report.json`. This generates `report.html` and `report.json`.
### dinglehopper-extract
The tool `dinglehopper-extract` extracts the text of the given input file on
stdout, for example:
~~~
dinglehopper-extract --textequiv-level line OCR-D-GT-PAGE/00000024.page.xml
~~~
### OCR-D
As an OCR-D processor: As an OCR-D processor:
~~~ ~~~
ocrd-dinglehopper -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL ocrd-dinglehopper -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL
@ -65,9 +78,14 @@ This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup.
![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true) ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)
You may also want to disable metrics and the green-red color scheme by The OCR-D processor has these parameters:
parameter:
| Parameter | Meaning |
| ------------------------- | ------------------------------------------------------------------- |
| `-P metrics false` | Disable metrics and the green-red color scheme (default: enabled) |
| `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) |
For example:
~~~ ~~~
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false
~~~ ~~~

@ -80,15 +80,15 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
'''.format(gtx, ocrx) '''.format(gtx, ocrx)
def process(gt, ocr, report_prefix, *, metrics=True): def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
"""Check OCR result against GT. """Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
Click on a wrapper. Click on a wrapper.
""" """
gt_text = extract(gt) gt_text = extract(gt, textequiv_level=textequiv_level)
ocr_text = extract(ocr) ocr_text = extract(ocr, textequiv_level=textequiv_level)
cer, n_characters = character_error_rate_n(gt_text, ocr_text) cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text)
@ -134,8 +134,9 @@ def process(gt, ocr, report_prefix, *, metrics=True):
@click.argument('ocr', type=click.Path(exists=True)) @click.argument('ocr', type=click.Path(exists=True))
@click.argument('report_prefix', type=click.Path(), default='report') @click.argument('report_prefix', type=click.Path(), default='report')
@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red') @click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
@click.option('--progress', default=False, is_flag=True, help='Show progress bar') @click.option('--progress', default=False, is_flag=True, help='Show progress bar')
def main(gt, ocr, report_prefix, metrics, progress): def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
""" """
Compare the PAGE/ALTO/text document GT against the document OCR. Compare the PAGE/ALTO/text document GT against the document OCR.
@ -150,9 +151,12 @@ def main(gt, ocr, report_prefix, metrics, progress):
The comparison report will be written to $REPORT_PREFIX.{html,json}, where The comparison report will be written to $REPORT_PREFIX.{html,json}, where
$REPORT_PREFIX defaults to "report". The reports include the character error $REPORT_PREFIX defaults to "report". The reports include the character error
rate (CER) and the word error rate (WER). rate (CER) and the word error rate (WER).
By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags.
""" """
Config.progress = progress Config.progress = progress
process(gt, ocr, report_prefix, metrics=metrics) process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
if __name__ == '__main__': if __name__ == '__main__':

@ -0,0 +1,27 @@
import os
import click
from .extracted_text import ExtractedText
from .ocr_files import extract
@click.command()
@click.argument('input_file', type=click.Path(exists=True))
@click.option(
    '--textequiv-level',
    # Only these two levels are supported by the PAGE extraction; rejecting
    # anything else here turns a typo into a Click usage error instead of a
    # KeyError raised deep inside the extraction code.
    type=click.Choice(['region', 'line']),
    default='region',
    help='PAGE TextEquiv level to extract text from',
    metavar='LEVEL',
)
def main(input_file, textequiv_level):
    """
    Extract the text of the given INPUT_FILE.

    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to extract
    its text and falls back to plain text if no ALTO or PAGE is detected.

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    # NOTE: the docstring above doubles as the --help text printed by Click,
    # so keep it user-facing.
    input_text = extract(input_file, textequiv_level=textequiv_level).text
    # The extracted text goes to stdout so it can be piped or redirected.
    print(input_text)


if __name__ == '__main__':
    main()

@ -4,6 +4,7 @@ import unicodedata
from contextlib import suppress from contextlib import suppress
from itertools import repeat from itertools import repeat
from typing import Optional from typing import Optional
from lxml import etree as ET
import attr import attr
@ -147,9 +148,6 @@ class ExtractedText:
@property @property
def text(self): def text(self):
if self._text is not None: if self._text is not None:
if self._text == '':
return None
else:
return self._text return self._text
else: else:
return self.joiner.join(s.text for s in self.segments) return self.joiner.join(s.text for s in self.segments)
@ -159,11 +157,17 @@ class ExtractedText:
def segment_id_for_pos(self, pos): def segment_id_for_pos(self, pos):
# Calculate segment ids once, on the first call # Calculate segment ids once, on the first call
if not self._segment_id_for_pos: if not self._segment_id_for_pos:
if self._text is not None:
segment_id_for_pos = list(repeat(self.segment_id, len(self._text)))
else:
# Recurse
segment_id_for_pos = [] segment_id_for_pos = []
for s in self.segments: for s in self.segments:
segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
segment_id_for_pos.extend(seg_ids)
segment_id_for_pos.extend(repeat(None, len(self.joiner))) segment_id_for_pos.extend(repeat(None, len(self.joiner)))
segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)] segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
# This is frozen, so we have to jump through the hoop: # This is frozen, so we have to jump through the hoop:
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
assert self._segment_id_for_pos assert self._segment_id_for_pos
@ -171,17 +175,62 @@ class ExtractedText:
return self._segment_id_for_pos[pos] return self._segment_id_for_pos[pos]
@classmethod
def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
    """Build an ExtractedText from a PAGE content text element.

    If ``text_segment`` is an element of the requested ``textequiv_level``
    ('region' -> TextRegion, 'line' -> TextLine), the text of its own
    TextEquiv/Unicode is used. Otherwise the child elements of the next
    lower level are extracted recursively and become the segments of the
    returned object.

    :param text_segment: PAGE text element; must have an ``id`` attribute.
    :param nsmap: namespace map providing the ``page`` prefix used in all
        element lookups.
    :param textequiv_level: 'region' or 'line'.
    :return: an instance holding either the element's (normalized) text or
        the list of extracted sub-segments plus their joiner.
    :raises KeyError: if ``textequiv_level`` is not a known level, if there
        is no lower level to descend to, or if the ``id`` attribute is
        missing.
    """

    def invert_dict(d):
        """Invert the given dict"""
        return {v: k for k, v in d.items()}

    def get_index(te):
        """Return the integer @index of a TextEquiv, or None if unusable."""
        try:
            return int(te.attrib.get('index'))
        except (TypeError, ValueError):
            # TypeError: attribute absent (int(None)).
            # ValueError: attribute present but not an integer.
            return None

    def get_textequiv_unicode(s):
        """Get the TextEquiv/Unicode text of the given PAGE text element"""
        textequivs = s.findall('./page:TextEquiv', namespaces=nsmap)
        if not textequivs:
            # No TextEquiv at all: report "no text" instead of raising
            # IndexError, which the caller does not suppress.
            return None

        def sort_key(te):
            # Tuples keep the key orderable: None cannot be compared with
            # int (or with None) in Python 3. Indexed TextEquivs win over
            # unindexed ones; ties keep document order because min()
            # returns the first minimal element.
            index = get_index(te)
            return (index is None, 0 if index is None else index)

        first = min(textequivs, key=sort_key)
        # May raise AttributeError if there is no Unicode child; the caller
        # suppresses that and falls back to the empty string.
        return first.find('./page:Unicode', namespaces=nsmap).text

    localname_for_textequiv_level = {
        'region': 'TextRegion',
        'line': 'TextLine'
    }
    textequiv_level_for_localname = invert_dict(localname_for_textequiv_level)
    children_for_localname = {
        'TextRegion': 'TextLine'
    }
    joiner_for_textequiv_level = {
        'line': '\n'
    }

    segment_id = text_segment.attrib['id']
    localname = ET.QName(text_segment).localname
    if localname == localname_for_textequiv_level[textequiv_level]:
        # We are at the requested level: take this element's own text.
        segment_text = None
        with suppress(AttributeError):
            segment_text = get_textequiv_unicode(text_segment)
            segment_text = segment_text or ''
            segment_text = normalize_sbb(segment_text)  # FIXME hardcoded SBB normalization
        segment_text = segment_text or ''
        return cls(segment_id, None, None, segment_text)
    else:
        # Recurse
        sub_localname = children_for_localname[localname]
        sub_textequiv_level = textequiv_level_for_localname[sub_localname]
        segments = []
        for sub_segment in text_segment.iterfind('./page:%s' % sub_localname, namespaces=nsmap):
            segments.append(
                ExtractedText.from_text_segment(
                    sub_segment, nsmap,
                    textequiv_level=sub_textequiv_level)
            )
        joiner = joiner_for_textequiv_level[sub_textequiv_level]
        return cls(segment_id, segments, joiner, None)
@classmethod @classmethod
def from_str(cls, text, normalization=Normalization.NFC_SBB): def from_str(cls, text, normalization=Normalization.NFC_SBB):

@ -54,9 +54,12 @@ def page_namespace(tree):
raise ValueError('Not a PAGE tree') raise ValueError('Not a PAGE tree')
def page_extract(tree): def page_extract(tree, *, textequiv_level='region'):
"""Extract text from the given PAGE content ElementTree.""" """Extract text from the given PAGE content ElementTree."""
# Internally, this just parses the Reading Order (if it exists) and
# leaves reading the TextRegions to ExtractedText.from_text_segment().
nsmap = {'page': page_namespace(tree)} nsmap = {'page': page_namespace(tree)}
regions = [] regions = []
@ -69,23 +72,23 @@ def page_extract(tree):
region_id = region_ref_indexed.attrib['regionRef'] region_id = region_ref_indexed.attrib['regionRef']
region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
if region is not None: if region is not None:
regions.append(ExtractedText.from_text_segment(region, nsmap)) regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
else: else:
warn('Not a TextRegion: "%s"' % region_id) pass # Not a TextRegion
else: else:
raise NotImplementedError raise NotImplementedError
else: else:
for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
regions.append(ExtractedText.from_text_segment(region, nsmap)) regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
# Filter empty region texts # Filter empty region texts
regions = [r for r in regions if r.text is not None] regions = [r for r in regions if r.text != '']
return ExtractedText(None, regions, '\n', None) return ExtractedText(None, regions, '\n', None)
def page_text(tree): def page_text(tree, *, textequiv_level='region'):
return page_extract(tree).text return page_extract(tree, textequiv_level=textequiv_level).text
def plain_extract(filename): def plain_extract(filename):
@ -102,7 +105,7 @@ def plain_text(filename):
return plain_extract(filename).text return plain_extract(filename).text
def extract(filename): def extract(filename, *, textequiv_level='region'):
"""Extract the text from the given file. """Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text. Supports PAGE, ALTO and falls back to plain text.
@ -112,7 +115,7 @@ def extract(filename):
except XMLSyntaxError: except XMLSyntaxError:
return plain_extract(filename) return plain_extract(filename)
try: try:
return page_extract(tree) return page_extract(tree, textequiv_level=textequiv_level)
except ValueError: except ValueError:
return alto_extract(tree) return alto_extract(tree)

@ -22,6 +22,12 @@
"type": "boolean", "type": "boolean",
"default": true, "default": true,
"description": "Enable/disable metrics and green/red" "description": "Enable/disable metrics and green/red"
},
"textequiv_level": {
"type": "string",
"enum": ["region", "line"],
"default": "region",
"description": "PAGE XML hierarchy level to extract the text from"
} }
} }
} }

@ -32,6 +32,7 @@ class OcrdDinglehopperEvaluate(Processor):
log = getLogger('processor.OcrdDinglehopperEvaluate') log = getLogger('processor.OcrdDinglehopperEvaluate')
metrics = self.parameter['metrics'] metrics = self.parameter['metrics']
textequiv_level = self.parameter['textequiv_level']
gt_grp, ocr_grp = self.input_file_grp.split(',') gt_grp, ocr_grp = self.input_file_grp.split(',')
for n, page_id in enumerate(self.workspace.mets.physical_pages): for n, page_id in enumerate(self.workspace.mets.physical_pages):
gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)) gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id))
@ -52,7 +53,8 @@ class OcrdDinglehopperEvaluate(Processor):
gt_file.local_filename, gt_file.local_filename,
ocr_file.local_filename, ocr_file.local_filename,
report_prefix, report_prefix,
metrics=metrics metrics=metrics,
textequiv_level=textequiv_level
) )
# Add reports to the workspace # Add reports to the workspace

@ -0,0 +1,382 @@
<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15/pagecontent.xsd" pcGtsId="id__00000083">
<Metadata>
<Creator>doculibtopagexml</Creator>
<Created>2018-11-20T05:00:14</Created>
<LastChange>2019-04-17T10:47:36</LastChange></Metadata>
<Page imageFilename="00000083.tif" imageXResolution="300.00000" imageYResolution="300.00000" imageWidth="1275" imageHeight="2032" type="content" readingDirection="left-to-right" textLineOrder="top-to-bottom" primaryLanguage="German">
<PrintSpace>
<Coords points="45,93 45,1916 1035,1916 1035,93"/></PrintSpace>
<ReadingOrder>
<OrderedGroup id="ro357564684568544579089">
<RegionRefIndexed regionRef="r1070" index="3"/>
</OrderedGroup></ReadingOrder>
<TextRegion id="r1070" readingDirection="left-to-right" textLineOrder="top-to-bottom" type="paragraph" align="justify" primaryLanguage="German">
<Coords points="190,1215 209,1215 209,1255 376,1255 376,1252 465,1252 465,1251 484,1251 484,1255 588,1255 588,1254 636,1254 636,1249 677,1249 677,1224 690,1224 690,1250 760,1250 760,1251 921,1251 921,1264 964,1264 964,1340 966,1340 966,1384 968,1384 968,1429 969,1429 969,1562 972,1562 972,1636 973,1636 973,1698 975,1698 975,1725 944,1725 944,1788 929,1788 929,1789 898,1789 898,1799 875,1799 875,1797 826,1797 826,1790 690,1790 690,1795 674,1795 674,1785 564,1785 564,1786 519,1786 519,1788 491,1788 491,1800 438,1800 438,1802 422,1802 422,1790 299,1790 299,1795 280,1795 280,1804 228,1804 228,1798 112,1798 112,1659 111,1659 111,1481 109,1481 109,1359 111,1359 111,1264 112,1264 112,1263 113,1263 113,1262 114,1262 114,1260 115,1260 115,1256 116,1256 116,1254 117,1254 117,1253 118,1253 118,1252 119,1252 119,1250 120,1250 120,1249 121,1249 121,1248 122,1248 122,1247 123,1247 123,1246 124,1246 124,1245 125,1245 125,1244 126,1244 126,1243 127,1243 127,1242 128,1242 128,1241 129,1241 129,1240 134,1240 134,1239 139,1239 139,1238 190,1238"/>
<TextLine id="l1071">
<Coords points="112,1732 280,1732 280,1748 391,1748 391,1753 596,1753 596,1744 635,1744 635,1745 690,1745 690,1748 806,1748 806,1751 858,1751 858,1752 898,1752 898,1762 929,1762 929,1776 944,1776 944,1788 929,1788 929,1789 898,1789 898,1799 875,1799 875,1797 826,1797 826,1790 690,1790 690,1795 674,1795 674,1785 564,1785 564,1786 519,1786 519,1788 491,1788 491,1800 438,1800 438,1802 422,1802 422,1790 299,1790 299,1795 280,1795 280,1804 228,1804 228,1798 112,1798"/>
<Word id="w1072">
<Coords points="112,1732 146,1732 146,1747 206,1747 206,1773 211,1773 211,1774 213,1774 213,1775 214,1775 214,1779 213,1779 213,1781 212,1781 212,1783 211,1783 211,1785 210,1785 210,1786 209,1786 209,1787 208,1787 208,1788 206,1788 206,1789 146,1789 146,1798 112,1798"/>
<Glyph id="c1073">
<Coords points="112,1732 146,1732 146,1798 112,1798"/>
<TextEquiv conf="0.91338">
<Unicode>H</Unicode></TextEquiv></Glyph>
<Glyph id="c1074">
<Coords points="149,1748 165,1748 165,1776 149,1776"/>
<TextEquiv conf="0.61335">
<Unicode>a</Unicode></TextEquiv></Glyph>
<Glyph id="c1075">
<Coords points="167,1750 186,1750 186,1783 167,1783"/>
<TextEquiv conf="0.69192">
<Unicode>n</Unicode></TextEquiv></Glyph>
<Glyph id="c1076">
<Coords points="187,1747 206,1747 206,1781 187,1781"/>
<TextEquiv conf="0.72500">
<Unicode>d</Unicode></TextEquiv></Glyph>
<Glyph id="c1205">
<Coords points="211,1774 213,1774 213,1775 214,1775 214,1779 213,1779 213,1781 212,1781 212,1783 211,1783 211,1785 210,1785 210,1786 209,1786 209,1787 208,1787 208,1788 206,1788 206,1789 206,1788 205,1788 205,1782 206,1782 206,1780 207,1780 207,1778 210,1778 210,1774 211,1774 211,1773"/>
<TextEquiv>
<Unicode>,</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.61335">
<Unicode>Hand,</Unicode></TextEquiv></Word>
<Word id="w1077">
<Coords points="228,1732 280,1732 280,1748 391,1748 391,1790 299,1790 299,1795 280,1795 280,1804 228,1804"/>
<Glyph id="c1078">
<Coords points="228,1732 280,1732 280,1804 228,1804"/>
<TextEquiv conf="0.87457">
<Unicode>M</Unicode></TextEquiv></Glyph>
<Glyph id="c1079">
<Coords points="282,1759 299,1759 299,1795 282,1795"/>
<TextEquiv conf="0.76524">
<Unicode>y</Unicode></TextEquiv></Glyph>
<Glyph id="c1080">
<Coords points="301,1753 311,1753 311,1788 301,1788"/>
<TextEquiv conf="0.86902">
<Unicode>l</Unicode></TextEquiv></Glyph>
<Glyph id="c1081">
<Coords points="313,1761 330,1761 330,1788 313,1788"/>
<TextEquiv conf="0.85741">
<Unicode>o</Unicode></TextEquiv></Glyph>
<Glyph id="c1082">
<Coords points="332,1762 345,1762 345,1790 332,1790"/>
<TextEquiv conf="0.82725">
<Unicode>r</Unicode></TextEquiv></Glyph>
<Glyph id="c1083">
<Coords points="347,1756 364,1756 364,1789 347,1789"/>
<TextEquiv conf="0.84884">
<Unicode>d</Unicode></TextEquiv></Glyph>
<Glyph id="c1084">
<Coords points="373,1748 391,1748 391,1790 373,1790"/>
<TextEquiv conf="0.81744">
<Unicode>?</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.76524">
<Unicode>Mylord?</Unicode></TextEquiv></Word>
<Word id="w1085">
<Coords points="422,1753 438,1753 438,1757 503,1757 503,1762 519,1762 519,1788 491,1788 491,1800 438,1800 438,1802 422,1802"/>
<Glyph id="c1086">
<Coords points="422,1753 438,1753 438,1802 422,1802"/>
<TextEquiv conf="0.82658">
<Unicode>f</Unicode></TextEquiv></Glyph>
<Glyph id="c1087">
<Coords points="436,1763 450,1763 450,1790 436,1790"/>
<TextEquiv conf="0.83664">
<Unicode>r</Unicode></TextEquiv></Glyph>
<Glyph id="c1088">
<Coords points="451,1761 468,1761 468,1788 451,1788"/>
<TextEquiv conf="0.74675">
<Unicode>a</Unicode></TextEquiv></Glyph>
<Glyph id="c1089">
<Coords points="472,1762 491,1762 491,1800 472,1800"/>
<TextEquiv conf="0.83707">
<Unicode>g</Unicode></TextEquiv></Glyph>
<Glyph id="c1090">
<Coords points="492,1757 503,1757 503,1788 492,1788"/>
<TextEquiv conf="0.79790">
<Unicode>t</Unicode></TextEquiv></Glyph>
<Glyph id="c1091">
<Coords points="505,1762 519,1762 519,1788 505,1788"/>
<TextEquiv conf="0.88885">
<Unicode>e</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.74675">
<Unicode>fragte</Unicode></TextEquiv></Word>
<Word id="w1092">
<Coords points="531,1753 549,1753 549,1757 579,1757 579,1785 564,1785 564,1786 531,1786"/>
<Glyph id="c1093">
<Coords points="531,1753 549,1753 549,1786 531,1786"/>
<TextEquiv conf="0.84252">
<Unicode>d</Unicode></TextEquiv></Glyph>
<Glyph id="c1094">
<Coords points="550,1759 564,1759 564,1786 550,1786"/>
<TextEquiv conf="0.88588">
<Unicode>e</Unicode></TextEquiv></Glyph>
<Glyph id="c1095">
<Coords points="566,1757 579,1757 579,1785 566,1785"/>
<TextEquiv conf="0.83230">
<Unicode>r</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.83230">
<Unicode>der</Unicode></TextEquiv></Word>
<Word id="w1096">
<Coords points="596,1744 635,1744 635,1745 690,1745 690,1795 674,1795 674,1785 596,1785"/>
<Glyph id="c1097">
<Coords points="596,1744 635,1744 635,1785 596,1785"/>
<TextEquiv conf="0.80936">
<Unicode>G</Unicode></TextEquiv></Glyph>
<Glyph id="c1098">
<Coords points="637,1755 651,1755 651,1783 637,1783"/>
<TextEquiv conf="0.78064">
<Unicode>r</Unicode></TextEquiv></Glyph>
<Glyph id="c1099">
<Coords points="652,1754 671,1754 671,1784 652,1784"/>
<TextEquiv conf="0.79657">
<Unicode>a</Unicode></TextEquiv></Glyph>
<Glyph id="c1100">
<Coords points="674,1745 690,1745 690,1795 674,1795"/>
<TextEquiv conf="0.85403">
<Unicode>f</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.78064">
<Unicode>Graf</Unicode></TextEquiv></Word>
<Word id="w1101">
<Coords points="697,1755 716,1755 716,1758 757,1758 757,1787 737,1787 737,1786 719,1786 719,1785 697,1785"/>
<Glyph id="c1102">
<Coords points="697,1755 716,1755 716,1785 697,1785"/>
<TextEquiv conf="0.84576">
<Unicode>v</Unicode></TextEquiv></Glyph>
<Glyph id="c1103">
<Coords points="719,1758 735,1758 735,1786 719,1786"/>
<TextEquiv conf="0.89206">
<Unicode>o</Unicode></TextEquiv></Glyph>
<Glyph id="c1104">
<Coords points="737,1758 757,1758 757,1787 737,1787"/>
<TextEquiv conf="0.85889">
<Unicode>n</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.84576">
<Unicode>von</Unicode></TextEquiv></Word>
<Word id="w1105">
<Coords points="768,1748 806,1748 806,1751 858,1751 858,1752 898,1752 898,1762 929,1762 929,1776 944,1776 944,1788 929,1788 929,1789 898,1789 898,1799 875,1799 875,1797 826,1797 826,1790 768,1790"/>
<Glyph id="c1106">
<Coords points="768,1748 806,1748 806,1790 768,1790"/>
<TextEquiv conf="0.81040">
<Unicode>R</Unicode></TextEquiv></Glyph>
<Glyph id="c1107">
<Coords points="808,1761 825,1761 825,1789 808,1789"/>
<TextEquiv conf="0.85909">
<Unicode>o</Unicode></TextEquiv></Glyph>
<Glyph id="c1108">
<Coords points="826,1751 858,1751 858,1797 826,1797"/>
<TextEquiv conf="0.83404">
<Unicode></Unicode></TextEquiv></Glyph>
<Glyph id="c1109">
<Coords points="860,1763 873,1763 873,1790 860,1790"/>
<TextEquiv conf="0.85515">
<Unicode>e</Unicode></TextEquiv></Glyph>
<Glyph id="c1110">
<Coords points="875,1752 898,1752 898,1799 875,1799"/>
<TextEquiv conf="0.89503">
<Unicode></Unicode></TextEquiv></Glyph>
<Glyph id="c1111">
<Coords points="899,1762 913,1762 913,1789 899,1789"/>
<TextEquiv conf="0.87816">
<Unicode>e</Unicode></TextEquiv></Glyph>
<Glyph id="c1112">
<Coords points="911,1762 929,1762 929,1789 911,1789"/>
<TextEquiv conf="0.73941">
<Unicode>r</Unicode></TextEquiv></Glyph>
<Glyph id="c1113">
<Coords points="934,1776 944,1776 944,1788 934,1788"/>
<TextEquiv conf="0.69111">
<Unicode>.</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.69111">
<Unicode>Roeer.</Unicode></TextEquiv></Word>
<TextEquiv conf="0.70871">
<Unicode>Hand, Mylord? fragte der Graf von Roeer.</Unicode></TextEquiv></TextLine>
<TextLine id="l766">
<Coords points="109,1359 139,1359 139,1367 168,1367 168,1379 364,1379 364,1378 418,1378 418,1377 428,1377 428,1379 558,1379 558,1374 643,1374 643,1373 661,1373 661,1374 822,1374 822,1372 864,1372 864,1374 898,1374 898,1383 955,1383 955,1384 968,1384 968,1406 955,1406 955,1410 876,1410 876,1411 864,1411 864,1413 722,1413 722,1418 661,1418 661,1421 643,1421 643,1412 373,1412 373,1413 340,1413 340,1414 310,1414 310,1413 241,1413 241,1411 203,1411 203,1410 187,1410 187,1406 149,1406 149,1404 139,1404 139,1402 109,1402"/>
<Word id="w769">
<Coords points="109,1359 139,1359 139,1367 168,1367 168,1406 149,1406 149,1404 139,1404 139,1402 109,1402"/>
<Glyph id="c770">
<Coords points="109,1359 139,1359 139,1402 109,1402"/>
<TextEquiv conf="0.70756">
<Unicode>A</Unicode></TextEquiv></Glyph>
<Glyph id="c771">
<Coords points="139,1369 149,1369 149,1404 139,1404"/>
<TextEquiv conf="0.76907">
<Unicode>l</Unicode></TextEquiv></Glyph>
<Glyph id="c772">
<Coords points="149,1367 168,1367 168,1406 149,1406"/>
<TextEquiv conf="0.68295">
<Unicode>s</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.68295">
<Unicode>Als</Unicode></TextEquiv></Word>
<Word id="w773">
<Coords points="187,1384 201,1384 201,1386 218,1386 218,1411 203,1411 203,1410 187,1410"/>
<Glyph id="c774">
<Coords points="187,1384 201,1384 201,1410 187,1410"/>
<TextEquiv conf="0.83952">
<Unicode>e</Unicode></TextEquiv></Glyph>
<Glyph id="c775">
<Coords points="203,1386 218,1386 218,1411 203,1411"/>
<TextEquiv conf="0.81121">
<Unicode>r</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.81121">
<Unicode>er</Unicode></TextEquiv></Word>
<Word id="w776">
<Coords points="364,1378 373,1378 373,1381 393,1381 393,1412 373,1412 373,1413 340,1413 340,1414 310,1414 310,1413 241,1413 241,1388 258,1388 258,1379 364,1379"/>
<Glyph id="c777">
<Coords points="241,1388 255,1388 255,1413 241,1413"/>
<TextEquiv conf="0.88983">
<Unicode>e</Unicode></TextEquiv></Glyph>
<Glyph id="c778">
<Coords points="258,1379 267,1379 267,1413 258,1413"/>
<TextEquiv conf="0.87166">
<Unicode>i</Unicode></TextEquiv></Glyph>
<Glyph id="c779">
<Coords points="269,1385 288,1385 288,1413 269,1413"/>
<TextEquiv conf="0.85669">
<Unicode>n</Unicode></TextEquiv></Glyph>
<Glyph id="c782">
<Coords points="310,1385 340,1385 340,1414 310,1414"/>
<TextEquiv conf="0.90717">
<Unicode>m</Unicode></TextEquiv></Glyph>
<Glyph id="c783">
<Coords points="343,1386 361,1386 361,1412 343,1412"/>
<TextEquiv conf="0.77710">
<Unicode>a</Unicode></TextEquiv></Glyph>
<Glyph id="c784">
<Coords points="364,1378 373,1378 373,1413 364,1413"/>
<TextEquiv conf="0.80457">
<Unicode>l</Unicode></TextEquiv></Glyph>
<Glyph id="c785">
<Coords points="375,1381 393,1381 393,1412 375,1412"/>
<TextEquiv conf="0.79192">
<Unicode>s</Unicode></TextEquiv></Glyph>
<Glyph id="c1195">
<Coords points="303,1385 303,1386 304,1386 304,1401 305,1401 305,1402 304,1402 304,1407 303,1407 303,1409 302,1409 302,1410 301,1410 301,1411 300,1411 300,1412 297,1412 297,1411 296,1411 296,1410 295,1410 295,1409 292,1409 292,1408 291,1408 291,1397 290,1397 291,1397 291,1392 292,1392 292,1391 293,1391 293,1389 295,1389 295,1388 296,1388 296,1387 301,1387 301,1385"/>
<TextEquiv>
<Unicode>s</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.58499">
<Unicode>einsmals</Unicode></TextEquiv></Word>
<Word id="w786">
<Coords points="418,1377 428,1377 428,1385 450,1385 450,1412 418,1412"/>
<Glyph id="c787">
<Coords points="418,1377 428,1377 428,1412 418,1412"/>
<TextEquiv conf="0.90477">
<Unicode>i</Unicode></TextEquiv></Glyph>
<Glyph id="c788">
<Coords points="431,1385 450,1385 450,1412 431,1412"/>
<TextEquiv conf="0.90877">
<Unicode>n</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.90477">
<Unicode>in</Unicode></TextEquiv></Word>
<Word id="w789">
<Coords points="471,1379 489,1379 489,1385 538,1385 538,1412 471,1412"/>
<Glyph id="c790">
<Coords points="471,1379 489,1379 489,1412 471,1412"/>
<TextEquiv conf="0.83564">
<Unicode>d</Unicode></TextEquiv></Glyph>
<Glyph id="c791">
<Coords points="491,1386 503,1386 503,1411 491,1411"/>
<TextEquiv conf="0.83281">
<Unicode>e</Unicode></TextEquiv></Glyph>
<Glyph id="c792">
<Coords points="506,1385 538,1385 538,1412 506,1412"/>
<TextEquiv conf="0.86322">
<Unicode>m</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.83281">
<Unicode>dem</Unicode></TextEquiv></Word>
<Word id="w793">
<Coords points="643,1373 661,1373 661,1374 722,1374 722,1385 730,1385 730,1411 722,1411 722,1418 661,1418 661,1421 643,1421 643,1412 558,1412 558,1374 643,1374"/>
<Glyph id="c794">
<Coords points="558,1374 590,1374 590,1412 558,1412"/>
<TextEquiv conf="0.87259">
<Unicode>O</Unicode></TextEquiv></Glyph>
<Glyph id="c795">
<Coords points="593,1374 609,1374 609,1410 593,1410"/>
<TextEquiv conf="0.84287">
<Unicode>b</Unicode></TextEquiv></Glyph>
<Glyph id="c796">
<Coords points="611,1384 625,1384 625,1411 611,1411"/>
<TextEquiv conf="0.88296">
<Unicode>e</Unicode></TextEquiv></Glyph>
<Glyph id="c797">
<Coords points="627,1384 640,1384 640,1410 627,1410"/>
<TextEquiv conf="0.83827">
<Unicode>r</Unicode></TextEquiv></Glyph>
<Glyph id="c798">
<Coords points="643,1373 661,1373 661,1421 643,1421"/>
<TextEquiv conf="0.75418">
<Unicode>h</Unicode></TextEquiv></Glyph>
<Glyph id="c799">
<Coords points="664,1383 681,1383 681,1410 664,1410"/>
<TextEquiv conf="0.87030">
<Unicode>a</Unicode></TextEquiv></Glyph>
<Glyph id="c800">
<Coords points="683,1383 704,1383 704,1411 683,1411"/>
<TextEquiv conf="0.84676">
<Unicode>u</Unicode></TextEquiv></Glyph>
<Glyph id="c801">
<Coords points="705,1374 722,1374 722,1418 705,1418"/>
<TextEquiv conf="0.79240">
<Unicode>ſ</Unicode></TextEquiv></Glyph>
<Glyph id="c802">
<Coords points="716,1385 730,1385 730,1411 716,1411"/>
<TextEquiv conf="0.89839">
<Unicode>e</Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.75418">
<Unicode>Oberhauſe</Unicode></TextEquiv></Word>
<Word id="w811">
<Coords points="911,1383 955,1383 955,1384 968,1384 968,1406 955,1406 955,1410 911,1410"/>
<Glyph id="c812">
<Coords points="911,1383 940,1383 940,1410 911,1410"/>
<TextEquiv conf="0.83790">
<Unicode>w</Unicode></TextEquiv></Glyph>
<Glyph id="c813">
<Coords points="942,1383 955,1383 955,1410 942,1410"/>
<TextEquiv conf="0.85182">
<Unicode>e</Unicode></TextEquiv></Glyph>
<Glyph id="c814">
<Coords points="957,1384 968,1384 968,1406 957,1406"/>
<TextEquiv conf="0.86700">
<Unicode></Unicode></TextEquiv></Glyph>
<TextEquiv conf="0.83790">
<Unicode>we⸗</Unicode></TextEquiv></Word>
<Word id="w1208">
<Coords points="764,1376 773,1376 773,1384 811,1384 811,1411 764,1411 764,1410 748,1410 748,1384 764,1384"/>
<Glyph id="c804">
<Coords points="748,1384 761,1384 761,1410 748,1410"/>
<TextEquiv conf="0.85674">
<Unicode>e</Unicode></TextEquiv></Glyph>
<Glyph id="c805">
<Coords points="764,1376 773,1376 773,1411 764,1411"/>
<TextEquiv conf="0.91519">
<Unicode>i</Unicode></TextEquiv></Glyph>
<Glyph id="c806">
<Coords points="776,1384 795,1384 795,1410 776,1410"/>
<TextEquiv conf="0.89158">
<Unicode>n</Unicode></TextEquiv></Glyph>
<Glyph id="c807">
<Coords points="797,1384 811,1384 811,1411 797,1411"/>
<TextEquiv conf="0.95123">
<Unicode>e</Unicode></TextEquiv></Glyph>
<TextEquiv>
<Unicode>eine</Unicode></TextEquiv></Word>
<Word id="w1209">
<Coords points="822,1372 864,1372 864,1374 898,1374 898,1410 876,1410 876,1411 864,1411 864,1413 822,1413"/>
<Glyph id="c808">
<Coords points="822,1372 864,1372 864,1413 822,1413"/>
<TextEquiv conf="0.79185">
<Unicode>B</Unicode></TextEquiv></Glyph>
<Glyph id="c809">
<Coords points="867,1377 876,1377 876,1411 867,1411"/>
<TextEquiv conf="0.91084">
<Unicode>i</Unicode></TextEquiv></Glyph>
<Glyph id="c810">
<Coords points="878,1374 898,1374 898,1410 878,1410"/>
<TextEquiv conf="0.83545">
<Unicode></Unicode></TextEquiv></Glyph>
<TextEquiv>
<Unicode>Bi</Unicode></TextEquiv></Word>
<TextEquiv conf="0.75683">
<Unicode>Als er einsmals in dem Oberhauſe eine Bi we⸗</Unicode></TextEquiv></TextLine>
<TextEquiv conf="0.70871">
<Unicode>Inconsistent dummy region text</Unicode></TextEquiv>
<TextStyle fontFamily="Fraktur"/></TextRegion></Page></PcGts>

Binary file not shown.

After

Width:  |  Height:  |  Size: 426 KiB

@ -0,0 +1,348 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15/pagecontent.xsd">
<Metadata>
<Creator/>
<Created>2020-09-16T15:51:31</Created>
<LastChange>1970-01-01T01:00:00</LastChange>
<Comments/>
</Metadata>
<Page imageFilename="462875_0008.jpg" imageHeight="2396" imageWidth="1504">
<ReadingOrder>
<OrderedGroup id="g1">
<RegionRefIndexed index="0" regionRef="r1"/>
<RegionRefIndexed index="1" regionRef="r2"/>
<RegionRefIndexed index="2" regionRef="r3"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="r1" orientation="0.114332257404385">
<Coords points="729,187 952,186 952,218 729,219"/>
<TextLine id="l1">
<Coords points="729,187 952,186 952,218 729,219"/>
<TextEquiv index="0">
<Unicode>— Vl —</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>— Vl —</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode/>
</TextEquiv>
</TextRegion>
<TextRegion id="r2" orientation="-0.010667742595615">
<Coords points="296,269 1390,267 1393,2064 299,2066"/>
<TextLine id="l2">
<Coords points="301,270 1389,270 1389,306 301,306"/>
<TextEquiv index="0">
<Unicode>sondere Schrift daraus zu machen. Locke scheint fort-</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>gondere Schrift daraus zu machen. LDocke scheint fort—-</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l3">
<Coords points="301,322 1386,322 1386,366 301,366"/>
<TextEquiv index="1">
<Unicode>gefahren zu haben, einzelne Bemorkungen und Beobach—</Unicode>
</TextEquiv>
<TextEquiv index="0">
<Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l4">
<Coords points="301,375 1387,375 1387,419 301,419"/>
<TextEquiv index="0">
<Unicode>tungen zu derselben niederzuschreiben, je nachdem sich</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>tungen zu derselben niederzuschreiben, je nachdem sich</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l5">
<Coords points="299,428 1385,428 1385,471 299,471"/>
<TextEquiv index="0">
<Unicode>ihm dieselben aufdrängten. Der Tod rief ihn im Jahre</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>ibm dicselben aufdrängten. Der Tod rief ihn im Jahre</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l6">
<Coords points="300,482 1385,482 1385,526 300,526"/>
<TextEquiv index="0">
<Unicode>1704 ab, bevor die Schrift vollendet war. Somit lagen</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>1704 ab, bevor die Schrift vollendet war. Somit lagen</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l7">
<Coords points="301,535 1388,535 1388,578 301,578"/>
<TextEquiv index="0">
<Unicode>dem Herausgeber der hinterlassenen Schriften nur mehr</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>dem ILerausgeber der hinterlassenen Schriften nur melr</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l8">
<Coords points="301,589 1389,589 1389,633 301,633"/>
<TextEquiv index="0">
<Unicode>oder weniger zusammenhängende Fragmente zu dieser</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>oder weniger zusammenhängende Pragmente zu dieser</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l9">
<Coords points="301,642 1387,642 1387,685 301,685"/>
<TextEquiv index="0">
<Unicode>Schrift vor; manche Wiederholungen und Unfertigkeiten</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>Schrift vor; manche wiederholungen und Unfertigkeiten</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l10">
<Coords points="302,695 1386,695 1386,737 302,737"/>
<TextEquiv index="0">
<Unicode>finden ihre Erklärung in diesem Umstande. Trotzdem</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>finden ihre Erklärung in diesem Umsſtande. Trotzdem</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l11">
<Coords points="302,747 1210,747 1210,791 302,791"/>
<TextEquiv index="0">
<Unicode>ist klar, was Locke in der Hauptsache wollte.</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>ist kKlar, was Locke in der Hauptsache wollte.</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l12">
<Coords points="388,800 1387,800 1387,844 388,844"/>
<TextEquiv index="0">
<Unicode>Der Philosoph ging von der Klage darüber aus, dass</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>Der Philosoph ging von der Klage darüber aus, dass</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l13">
<Coords points="300,852 1388,852 1388,894 300,894"/>
<TextEquiv index="0">
<Unicode>der Mensch bei der Leitung seiner Selbst die rechte</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>der Mensch bei der Leitung seiner Selbst die rechitée</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l14">
<Coords points="302,905 1389,905 1389,950 302,950"/>
<TextEquiv index="0">
<Unicode>Hülfe so selten bei seinem Verstande sucht; er wollte</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>IIülfe so selten bei seinem Verstande sucht; er wollte</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l15">
<Coords points="301,959 1388,959 1388,1002 301,1002"/>
<TextEquiv index="0">
<Unicode>die Thorheit dieser Vernachlässigung in ihren schlimmen</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>die Thorbeit dieser Vernachlässigung in ihren schlimmen</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l16">
<Coords points="301,1012 1385,1012 1385,1057 301,1057"/>
<TextEquiv index="0">
<Unicode>Folgen für die Handlung und Stellung des Menschen im</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>Folgen für die Iandlung und Stellung des Menschen im</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l17">
<Coords points="299,1066 1386,1066 1386,1110 299,1110"/>
<TextEquiv index="0">
<Unicode>Leben darlegen. Besonders auf deren Fehlgriffe in der</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>Leben darlegen. Besonders auf deren PFeblgriſſe in der</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l18">
<Coords points="299,1119 1383,1119 1383,1162 299,1162"/>
<TextEquiv index="0">
<Unicode>Bechandlung und Verwendung des Verstandes wies er hin.</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>Bechandlung und Verwendung des Verstandes wies er hin.</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l19">
<Coords points="298,1172 298,1215 1384,1215 1384,1172 298,1172"/>
<TextEquiv index="0">
<Unicode>Als ersten Fehlgriff rügte er die Weise Derer, die über-</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>Als ersten Fehlgriff rügte cr die Weise Derer, die über—</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l20">
<Coords points="300,1223 1387,1223 1387,1267 300,1267"/>
<TextEquiv index="0">
<Unicode>haupt ihren eigenen Verstand selten gebrauchen, vielmehr</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>haupt ilren cigenen Verstand seclten gebrauchen, vielmelir</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l21">
<Coords points="300,1268 1386,1268 1386,1318 300,1318"/>
<TextEquiv index="0">
<Unicode>nach dem Verstande ihrer Eltern, Nachbarn oder Vor-</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>nach dem Veorstande ihrer Eltern, Nachbarn odor Vor—</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l22">
<Coords points="300,1329 1388,1329 1388,1374 300,1374"/>
<TextEquiv index="0">
<Unicode>gesetzten urtheilen, um der Mühe und Sorge eigenen</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>gesetaten urtheilen, um der Mühe und dorge eigenen</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l23">
<Coords points="302,1381 1389,1381 1389,1419 302,1419"/>
<TextEquiv index="0">
<Unicode>Denkens und Urtheilens überhoben zu sein. Als zweiten</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>Denkens und Urtheilens überhboben zu sein. Als 2weiten</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l24">
<Coords points="300,1434 1389,1434 1389,1477 300,1477"/>
<TextEquiv index="0">
<Unicode>Fehlgriſf hob er das häufige Herbeiziehen von Leiden-</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>Fehlgriſf hob er das häuſige DIerbeiziehen von Leiden—</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l25">
<Coords points="300,1488 1388,1488 1388,1533 300,1533"/>
<TextEquiv index="0">
<Unicode>schaft der eigenen Wünsche oder des Parteigeistes an</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>schaft der eigenen Wünsche oder des Parteigeistes an</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l26">
<Coords points="299,1533 1390,1533 1390,1576 299,1576"/>
<TextEquiv index="0">
<Unicode>Stelle der Vernunft hervor. Und als dritten Fehler be-</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>Stelle der Vernunft hervor. Und als dritten Fohler be—</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l27">
<Coords points="302,1592 1389,1592 1389,1635 302,1635"/>
<TextEquiv index="0">
<Unicode>zeichnete er die ebenfalls nicht seltene Einseitigkeit und</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>zeichnete exr die ebenfalls nicht seltene Einscitigkeit und</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l28">
<Coords points="306,1645 1392,1645 1391,1692 305,1692"/>
<TextEquiv index="0">
<Unicode>Kurzsichtigkeit der Verstandesbildung. Um diesen Fehlern</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>Kurzsichtigkeit der Verstandesbildung. Um diesen Fohlern</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l29">
<Coords points="307,1698 1390,1698 1390,1742 307,1742"/>
<TextEquiv index="0">
<Unicode>und den durch sie herbeigeführten Vorurtheilen, welche</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>und den durch sie herbeigeführten Vorurtheilen, welche</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l30">
<Coords points="307,1751 1388,1751 1388,1795 307,1795"/>
<TextEquiv index="0">
<Unicode>die Erkenntniss der Wahrheit hindern, thunlichst vor-</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>die Erkenntniss der Wahrheit hindern, thunlichst vor—</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l31">
<Coords points="305,1804 1390,1804 1390,1850 305,1850"/>
<TextEquiv index="0">
<Unicode>zubeugen, verlangt Locke mit grossem Nachdrucke und</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>zubeugen, verlangt Locke mit grossem Nachdrucke und</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l32">
<Coords points="305,1857 305,1905 1393,1905 1393,1857 305,1857"/>
<TextEquiv index="0">
<Unicode>mit der vollen Wärmee eines aufrichtigen Wahrheitsfreundes,</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>mit der vollen Wärmeo eines aufrichtigen Wahrheitsſreundes,</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l33">
<Coords points="304,1910 304,1957 1393,1957 1393,1910 304,1910"/>
<TextEquiv index="0">
<Unicode>dass der Mensch sich um die rechte Bildung und Leitung</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>dass der Mensch sich um die rechte Bildung und Leitung</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l34">
<Coords points="304,1963 1392,1963 1392,2011 304,2011"/>
<TextEquiv index="0">
<Unicode>seines Verstandes die gleiche Mühe geben soll, wie er</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>seines Verstandes die gleiche Mühe geben soll, wie er</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="l35">
<Coords points="303,2018 303,2065 846,2065 1393,2064 1393,2018 303,2018"/>
<TextEquiv index="0">
<Unicode>auf sein leibliches Wohlergehen zu verwenden pflegt. Es</Unicode>
</TextEquiv>
<TextEquiv index="1">
<Unicode>auſf sein leibliches Wohlergehen zu verwenden pflegt. LEs</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode/>
</TextEquiv>
</TextRegion>
<ImageRegion id="r3">
<Coords points="0,0 0,2396 1504,2396 1504,0"/>
</ImageRegion>
</Page>
</PcGts>

@ -2,6 +2,7 @@ import unicodedata
import pytest import pytest
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from collections import namedtuple from collections import namedtuple
from lxml import etree as ET
from .. import seq_align, ExtractedText from .. import seq_align, ExtractedText
@ -66,3 +67,29 @@ def test_align():
assert alignment[8] == (None, '.', None, 'x2') assert alignment[8] == (None, '.', None, 'x2')
assert alignment[12] == ('t', None, 's2', None) assert alignment[12] == ('t', None, 's2', None)
assert alignment[15] == ('n', '', 's2', 'x3') assert alignment[15] == ('n', '', 's2', 'x3')
def test_textequiv_index():
    """
    Check that the "index" attribute of PAGE TextEquivs is honored on extraction.

    The TextLine below carries two TextEquivs which are deliberately listed
    out of index order in the XML. The text of the TextEquiv with the lowest
    index is the one expected to be extracted.
    """
    namespaces = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"}
    wanted = "gefahren zu haben, einzelne Bemerkungen und Beobach-"

    # index="1" comes first in document order and differs from index="0",
    # so picking the wrong TextEquiv makes the assertion below fail.
    page_xml = """<?xml version="1.0"?>
<TextLine id="l3" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
<TextEquiv index="1">
<Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
</TextEquiv>
<TextEquiv index="0">
<Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
</TextEquiv>
</TextLine>
"""
    line_element = ET.fromstring(page_xml)
    segment = ExtractedText.from_text_segment(line_element, namespaces, textequiv_level='line')
    assert wanted == segment.text

@ -124,12 +124,28 @@ def test_page_order():
def test_page_mixed_regions(): def test_page_mixed_regions():
# This file contains ImageRegions and TextRegions in the ReadingOrder # This file contains ImageRegions and TextRegions in the ReadingOrder
tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml')) tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml'))
with pytest.warns(UserWarning, match=r'Not a TextRegion'):
result = page_text(tree) result = page_text(tree)
assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result
def test_page_level():
    """
    Check that page_text() extracts from the requested TextEquiv level.

    The fixture contains inconsistent TextRegion and TextLine texts, so the
    extracted text shows which level was actually read.
    """
    fixture = os.path.join(data_dir, 'levels-are-different.page.xml')
    region_text = 'Inconsistent dummy region text'

    # TextRegion: without an explicit level and with textequiv_level='region'
    # the region text is expected. The file is parsed anew for every call.
    assert page_text(ET.parse(fixture)) == region_text
    assert page_text(ET.parse(fixture), textequiv_level='region') == region_text

    # TextLine
    line_text = page_text(ET.parse(fixture), textequiv_level='line')
    assert line_text == 'Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-'
def test_text(): def test_text():
assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
assert "wieder ein. Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) assert "wieder ein. Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))

@ -22,6 +22,7 @@ setup(
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'dinglehopper=qurator.dinglehopper.cli:main', 'dinglehopper=qurator.dinglehopper.cli:main',
'dinglehopper-extract=qurator.dinglehopper.cli_extract:main',
'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper', 'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper',
] ]
} }

Loading…
Cancel
Save