🐛 dinglehopper: Do not throw error if a region ID is not found

The ReadingOrder may reference regions of types other than TextRegion
(e.g. ImageRegion), so failing to find a TextRegion with the referenced
ID is not an error. Downgrade the ValueError to a warning for now.
pull/3/head
Gerber, Mike 5 years ago
parent 8237b3edaf
commit 10f010eaa8

@ -1,5 +1,7 @@
from __future__ import division, print_function from __future__ import division, print_function
from warnings import warn
from lxml import etree as ET from lxml import etree as ET
import sys import sys
@ -68,7 +70,7 @@ def page_text(tree):
if region is not None: if region is not None:
region_texts.append(region_text(region)) region_texts.append(region_text(region))
else: else:
raise ValueError('Invalid region id "%s" in file' % region_id) warn('Not a TextRegion: "%s"' % region_id)
else: else:
raise NotImplementedError raise NotImplementedError
else: else:

@ -0,0 +1,290 @@
<?xml version="1.0" encoding="UTF-8"?>
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
<pc:Metadata>
<pc:Creator>OCR-D/core 1.0.0b19</pc:Creator>
<pc:Created>2019-09-26T11:59:19.519140</pc:Created>
<pc:LastChange>2019-09-26T11:59:19.519140</pc:LastChange>
<pc:MetadataItem type="processingStep" name="layout/segmentation/region" value="ocrd-tesserocr-segment-region">
<pc:Labels>
<pc:Label value="True" type="overwrite_regions"/>
<pc:Label value="8" type="padding"/>
<pc:Label value="False" type="crop_polygons"/>
<pc:Label value="True" type="find_tables"/>
</pc:Labels>
</pc:MetadataItem>
<pc:MetadataItem type="processingStep" name="layout/segmentation/line" value="ocrd-tesserocr-segment-line">
<pc:Labels>
<pc:Label value="True" type="overwrite_lines"/>
</pc:Labels>
</pc:MetadataItem>
</pc:Metadata>
<pc:Page imageFilename="../OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.png" imageWidth="1832" imageHeight="2408">
<pc:ReadingOrder>
<pc:OrderedGroup id="reading-order">
<pc:RegionRefIndexed index="0" regionRef="region0000"/>
<pc:RegionRefIndexed index="1" regionRef="region0001"/>
<pc:RegionRefIndexed index="2" regionRef="region0002"/>
<pc:RegionRefIndexed index="3" regionRef="region0003"/>
</pc:OrderedGroup>
</pc:ReadingOrder>
<pc:TextRegion id="region0001">
<pc:Coords points="184,196 1338,196 1338,1969 184,1969"/>
<pc:TextLine id="region0001_line0000">
<pc:Coords points="217,204 1324,204 1324,264 217,264"/>
<pc:TextEquiv>
<pc:Unicode>phariſei hypocritæ, qui comeditis domos uiduarã ſub</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0001">
<pc:Coords points="220,258 1325,258 1325,314 220,314"/>
<pc:TextEquiv>
<pc:Unicode>prætextu longarum precationum, propterea maiorẽ</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0002">
<pc:Coords points="218,305 1325,305 1325,359 218,359"/>
<pc:TextEquiv>
<pc:Unicode>accipieris condemnationem. Ideo enim ꝙ non oratis</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0003">
<pc:Coords points="217,354 1325,354 1325,413 217,413"/>
<pc:TextEquiv>
<pc:Unicode>ſecundum præſeriptum ſacræ ſcripturæ, nec ex ſpiritu</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0004">
<pc:Coords points="216,401 1322,401 1322,460 216,460"/>
<pc:TextEquiv>
<pc:Unicode>&amp; ueritate ſed iuxta ueſtram propriam conſtitutionẽ,</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0005">
<pc:Coords points="219,454 1324,454 1324,505 219,505"/>
<pc:TextEquiv>
<pc:Unicode>orationes ueſtræ nõ ſiunt Deo acceptæ, neq; ab eo ex⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0006">
<pc:Coords points="219,501 1326,501 1326,563 219,563"/>
<pc:TextEquiv>
<pc:Unicode>audiunt᷑ Eſa, Cum multiplicaueritis orationes ueſtras</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0007">
<pc:Coords points="215,556 1325,556 1325,607 215,607"/>
<pc:TextEquiv>
<pc:Unicode>non exaudiam uos. Chriſtiani uero quia orant iuxta</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0008">
<pc:Coords points="218,605 1324,605 1324,665 218,665"/>
<pc:TextEquiv>
<pc:Unicode>tenorem ſcripturæ, &amp; ex ſpiritu &amp; ueritate, ideo eo⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0009">
<pc:Coords points="217,651 1324,651 1324,707 217,707"/>
<pc:TextEquiv>
<pc:Unicode>rum orationes a Deo exaudiuntur, ſuntq; illi grat iſsi⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0010">
<pc:Coords points="219,705 1322,705 1322,756 219,756"/>
<pc:TextEquiv>
<pc:Unicode>mæ, dicunt enim Pater noſter qui es iu cœlis &amp;c. Vos</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0011">
<pc:Coords points="218,756 1323,756 1323,806 218,806"/>
<pc:TextEquiv>
<pc:Unicode>autem hoc tenore orandi contempto, obmur muratis</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0012">
<pc:Coords points="218,803 1327,803 1327,854 218,854"/>
<pc:TextEquiv>
<pc:Unicode>ueſtras Horas canonicas, hoc eſt, diabolicas ab Anti⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0013">
<pc:Coords points="218,852 1324,852 1324,904 218,904"/>
<pc:TextEquiv>
<pc:Unicode>chriſto inſtitutas. Paulus mauult quinq; uerba in Ec⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0014">
<pc:Coords points="219,904 1323,904 1323,958 219,958"/>
<pc:TextEquiv>
<pc:Unicode>cle ſia loqui in ſenſu, qß decem milia uerborum in lin⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0015">
<pc:Coords points="218,954 1326,954 1326,1010 218,1010"/>
<pc:TextEquiv>
<pc:Unicode>ua, Quibus uerbis adeo dãnat ueſtras prolixas ora⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0016">
<pc:Coords points="192,1002 1324,1002 1324,1052 192,1052"/>
<pc:TextEquiv>
<pc:Unicode>tiones, ut ſi ſemiuncia ſanæ mentis uel mica ſidei eſfet</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0017">
<pc:Coords points="218,1055 965,1055 965,1101 218,1101"/>
<pc:TextEquiv>
<pc:Unicode>in uobis, eas ſine dubio omitteretis.</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0018">
<pc:Coords points="325,1103 1323,1103 1323,1160 325,1160"/>
<pc:TextEquiv>
<pc:Unicode>De inuocatione diuorum ne apiculus quidem ha</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0019">
<pc:Coords points="216,1156 1326,1156 1326,1212 216,1212"/>
<pc:TextEquiv>
<pc:Unicode>betur in ſacris literis, quare ter ſtulti eſtis quod inuo⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0020">
<pc:Coords points="220,1210 1326,1210 1326,1262 220,1262"/>
<pc:TextEquiv>
<pc:Unicode>catis ſanctos, cum ex præce pto Dei ne mo inuocandus</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0021">
<pc:Coords points="218,1261 1326,1261 1326,1307 218,1307"/>
<pc:TextEquiv>
<pc:Unicode>ſit niſi ſolus Deus. Inuoca inquit me in die tribulatio⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0022">
<pc:Coords points="222,1305 1324,1305 1324,1354 222,1354"/>
<pc:TextEquiv>
<pc:Unicode>nis. &amp; eruam te, &amp; honorificabis me. Et omnis qui⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0023">
<pc:Coords points="221,1353 1324,1353 1324,1415 221,1415"/>
<pc:TextEquiv>
<pc:Unicode>cumq; inuocauerit nomen domini, ſaluus erit Sed</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0024">
<pc:Coords points="220,1404 1321,1404 1321,1465 220,1465"/>
<pc:TextEquiv>
<pc:Unicode>quomodo inuocabitis, in quem non credidiſtis? Quo</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0025">
<pc:Coords points="221,1456 1325,1456 1325,1508 221,1508"/>
<pc:TextEquiv>
<pc:Unicode>modo credetis ſine uerbo ? Inuocationẽ ergo in ſcrip⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0026">
<pc:Coords points="222,1509 1323,1509 1323,1559 222,1559"/>
<pc:TextEquiv>
<pc:Unicode>turis non legitis cõmemorationem uero ſæpe, non ut</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0027">
<pc:Coords points="222,1555 1330,1555 1330,1612 222,1612"/>
<pc:TextEquiv>
<pc:Unicode>intercedant pro uobis ſancti, ſed nt meminerit Deus</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0028">
<pc:Coords points="219,1604 1325,1604 1325,1664 219,1664"/>
<pc:TextEquiv>
<pc:Unicode>Teſtamenti cum patribus ſanctis pacti, ut ſimiliter uo⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0029">
<pc:Coords points="218,1653 1323,1653 1323,1719 218,1719"/>
<pc:TextEquiv>
<pc:Unicode>biſcum agat per miſericordiam, quemadmodum cum</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0030">
<pc:Coords points="219,1704 1321,1704 1321,1769 219,1769"/>
<pc:TextEquiv>
<pc:Unicode>ilis egit. Atq; hoc non eſt inuocare ſanctos. ſed Deum</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0031">
<pc:Coords points="222,1758 1322,1758 1322,1817 222,1817"/>
<pc:TextEquiv>
<pc:Unicode>ſuæ miſericordiæ &amp; promiſsionis admonere Sic pſal</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0032">
<pc:Coords points="224,1809 1324,1809 1324,1866 224,1866"/>
<pc:TextEquiv>
<pc:Unicode>mographus dicit, Qui paſcis Iſrael attende, qui de⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0033">
<pc:Coords points="222,1858 1320,1858 1320,1913 222,1913"/>
<pc:TextEquiv>
<pc:Unicode>ducis uelut ouem Iacob Sic &amp; Moſes orat, Memento</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextLine id="region0001_line0034">
<pc:Coords points="345,1909 1320,1909 1320,1963 345,1963"/>
<pc:TextEquiv>
<pc:Unicode>B 3 domi⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextLine>
<pc:TextEquiv>
<pc:Unicode>phariſei hypocritæ, qui comeditis domos uiduarã ſub
prætextu longarum precationum, propterea maiorẽ
accipieris condemnationem. Ideo enim ꝙ non oratis
ſecundum præſeriptum ſacræ ſcripturæ, nec ex ſpiritu
&amp; ueritate ſed iuxta ueſtram propriam conſtitutionẽ,
orationes ueſtræ nõ ſiunt Deo acceptæ, neq; ab eo ex⸗
audiunt᷑ Eſa, Cum multiplicaueritis orationes ueſtras
non exaudiam uos. Chriſtiani uero quia orant iuxta
tenorem ſcripturæ, &amp; ex ſpiritu &amp; ueritate, ideo eo⸗
rum orationes a Deo exaudiuntur, ſuntq; illi grat iſsi⸗
mæ, dicunt enim Pater noſter qui es iu cœlis &amp;c. Vos
autem hoc tenore orandi contempto, obmur muratis
ueſtras Horas canonicas, hoc eſt, diabolicas ab Anti⸗
chriſto inſtitutas. Paulus mauult quinq; uerba in Ec⸗
cle ſia loqui in ſenſu, qß decem milia uerborum in lin⸗
ua, Quibus uerbis adeo dãnat ueſtras prolixas ora⸗
tiones, ut ſi ſemiuncia ſanæ mentis uel mica ſidei eſfet
in uobis, eas ſine dubio omitteretis.
De inuocatione diuorum ne apiculus quidem ha
betur in ſacris literis, quare ter ſtulti eſtis quod inuo⸗
catis ſanctos, cum ex præce pto Dei ne mo inuocandus
ſit niſi ſolus Deus. Inuoca inquit me in die tribulatio⸗
nis. &amp; eruam te, &amp; honorificabis me. Et omnis qui⸗
cumq; inuocauerit nomen domini, ſaluus erit Sed
quomodo inuocabitis, in quem non credidiſtis? Quo
modo credetis ſine uerbo ? Inuocationẽ ergo in ſcrip⸗
turis non legitis cõmemorationem uero ſæpe, non ut
intercedant pro uobis ſancti, ſed nt meminerit Deus
Teſtamenti cum patribus ſanctis pacti, ut ſimiliter uo⸗
biſcum agat per miſericordiam, quemadmodum cum
ilis egit. Atq; hoc non eſt inuocare ſanctos. ſed Deum
ſuæ miſericordiæ &amp; promiſsionis admonere Sic pſal
mographus dicit, Qui paſcis Iſrael attende, qui de⸗
ducis uelut ouem Iacob Sic &amp; Moſes orat, Memento
B 3 domi⸗</pc:Unicode>
</pc:TextEquiv>
</pc:TextRegion>
<pc:ImageRegion id="region0000">
<pc:Coords points="5,21 1790,21 1790,302 5,302"/>
</pc:ImageRegion>
<pc:ImageRegion id="region0002">
<pc:Coords points="0,1962 1813,1962 1813,2361 0,2361"/>
</pc:ImageRegion>
<pc:ImageRegion id="region0003">
<pc:Coords points="1316,166 1790,166 1790,238 1316,238"/>
</pc:ImageRegion>
</pc:Page>
</pc:PcGts>

@ -4,6 +4,8 @@ import re
import lxml.etree as ET import lxml.etree as ET
import textwrap import textwrap
import pytest
from .. import alto_namespace, alto_text, page_namespace, page_text, text from .. import alto_namespace, alto_text, page_namespace, page_text, text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@ -93,6 +95,15 @@ def test_page_order():
assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL) assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
def test_page_mixed_regions():
    """Check page_text() on a PAGE file whose ReadingOrder mixes region types.

    The call is expected to emit a "Not a TextRegion" UserWarning and the
    returned text must still contain a known line of the fixture's text.
    """
    # This fixture contains ImageRegions and TextRegions in the ReadingOrder
    fixture_path = os.path.join(data_dir, 'mixed-regions.page.xml')
    page_tree = ET.parse(fixture_path)

    with pytest.warns(UserWarning, match=r'Not a TextRegion'):
        extracted = page_text(page_tree)

    # A line taken from the fixture's TextRegion must be present in the output
    expected_line = 'non exaudiam uos. Chriſtiani uero quia orant iuxta'
    assert expected_line in extracted
def test_text(): def test_text():
assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))

Loading…
Cancel
Save