mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
🐛 dinglehopper: Do not throw error if a region ID is not found
The ReadingOrder might contain regions of types other than text regions, so not finding a TextRegion with the referenced ID is not an error. Downgrade to a warning for now.
This commit is contained in:
parent
8237b3edaf
commit
10f010eaa8
3 changed files with 304 additions and 1 deletions
|
@ -1,5 +1,7 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
from warnings import warn
|
||||
|
||||
from lxml import etree as ET
|
||||
import sys
|
||||
|
||||
|
@ -68,7 +70,7 @@ def page_text(tree):
|
|||
if region is not None:
|
||||
region_texts.append(region_text(region))
|
||||
else:
|
||||
raise ValueError('Invalid region id "%s" in file' % region_id)
|
||||
warn('Not a TextRegion: "%s"' % region_id)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
|
|
290
qurator/dinglehopper/tests/data/mixed-regions.page.xml
Normal file
290
qurator/dinglehopper/tests/data/mixed-regions.page.xml
Normal file
|
@ -0,0 +1,290 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
|
||||
<pc:Metadata>
|
||||
<pc:Creator>OCR-D/core 1.0.0b19</pc:Creator>
|
||||
<pc:Created>2019-09-26T11:59:19.519140</pc:Created>
|
||||
<pc:LastChange>2019-09-26T11:59:19.519140</pc:LastChange>
|
||||
<pc:MetadataItem type="processingStep" name="layout/segmentation/region" value="ocrd-tesserocr-segment-region">
|
||||
<pc:Labels>
|
||||
<pc:Label value="True" type="overwrite_regions"/>
|
||||
<pc:Label value="8" type="padding"/>
|
||||
<pc:Label value="False" type="crop_polygons"/>
|
||||
<pc:Label value="True" type="find_tables"/>
|
||||
</pc:Labels>
|
||||
</pc:MetadataItem>
|
||||
<pc:MetadataItem type="processingStep" name="layout/segmentation/line" value="ocrd-tesserocr-segment-line">
|
||||
<pc:Labels>
|
||||
<pc:Label value="True" type="overwrite_lines"/>
|
||||
</pc:Labels>
|
||||
</pc:MetadataItem>
|
||||
</pc:Metadata>
|
||||
<pc:Page imageFilename="../OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.png" imageWidth="1832" imageHeight="2408">
|
||||
<pc:ReadingOrder>
|
||||
<pc:OrderedGroup id="reading-order">
|
||||
<pc:RegionRefIndexed index="0" regionRef="region0000"/>
|
||||
<pc:RegionRefIndexed index="1" regionRef="region0001"/>
|
||||
<pc:RegionRefIndexed index="2" regionRef="region0002"/>
|
||||
<pc:RegionRefIndexed index="3" regionRef="region0003"/>
|
||||
</pc:OrderedGroup>
|
||||
</pc:ReadingOrder>
|
||||
<pc:TextRegion id="region0001">
|
||||
<pc:Coords points="184,196 1338,196 1338,1969 184,1969"/>
|
||||
<pc:TextLine id="region0001_line0000">
|
||||
<pc:Coords points="217,204 1324,204 1324,264 217,264"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>phariſei hypocritæ, qui comeditis domos uiduarã ſub</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0001">
|
||||
<pc:Coords points="220,258 1325,258 1325,314 220,314"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>prætextu longarum precationum, propterea maiorẽ</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0002">
|
||||
<pc:Coords points="218,305 1325,305 1325,359 218,359"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>accipieris condemnationem. Ideo enim ꝙ non oratis</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0003">
|
||||
<pc:Coords points="217,354 1325,354 1325,413 217,413"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ſecundum præſeriptum ſacræ ſcripturæ, nec ex ſpiritu</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0004">
|
||||
<pc:Coords points="216,401 1322,401 1322,460 216,460"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>& ueritate ſed iuxta ueſtram propriam conſtitutionẽ,</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0005">
|
||||
<pc:Coords points="219,454 1324,454 1324,505 219,505"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>orationes ueſtræ nõ ſiunt Deo acceptæ, neq; ab eo ex⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0006">
|
||||
<pc:Coords points="219,501 1326,501 1326,563 219,563"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>audiunt᷑ Eſa, Cum multiplicaueritis orationes ueſtras</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0007">
|
||||
<pc:Coords points="215,556 1325,556 1325,607 215,607"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>non exaudiam uos. Chriſtiani uero quia orant iuxta</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0008">
|
||||
<pc:Coords points="218,605 1324,605 1324,665 218,665"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>tenorem ſcripturæ, & ex ſpiritu & ueritate, ideo eo⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0009">
|
||||
<pc:Coords points="217,651 1324,651 1324,707 217,707"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>rum orationes a Deo exaudiuntur, ſuntq; illi grat iſsi⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0010">
|
||||
<pc:Coords points="219,705 1322,705 1322,756 219,756"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>mæ, dicunt enim Pater noſter qui es iu cœlis &c. Vos</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0011">
|
||||
<pc:Coords points="218,756 1323,756 1323,806 218,806"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>autem hoc tenore orandi contempto, obmur muratis</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0012">
|
||||
<pc:Coords points="218,803 1327,803 1327,854 218,854"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ueſtras Horas canonicas, hoc eſt, diabolicas ab Anti⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0013">
|
||||
<pc:Coords points="218,852 1324,852 1324,904 218,904"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>chriſto inſtitutas. Paulus mauult quinq; uerba in Ec⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0014">
|
||||
<pc:Coords points="219,904 1323,904 1323,958 219,958"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>cle ſia loqui in ſenſu, qß decem milia uerborum in lin⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0015">
|
||||
<pc:Coords points="218,954 1326,954 1326,1010 218,1010"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ua, Quibus uerbis adeo dãnat ueſtras prolixas ora⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0016">
|
||||
<pc:Coords points="192,1002 1324,1002 1324,1052 192,1052"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>tiones, ut ſi ſemiuncia ſanæ mentis uel mica ſidei eſfet</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0017">
|
||||
<pc:Coords points="218,1055 965,1055 965,1101 218,1101"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>in uobis, eas ſine dubio omitteretis.</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0018">
|
||||
<pc:Coords points="325,1103 1323,1103 1323,1160 325,1160"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>De inuocatione diuorum ne apiculus quidem ha</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0019">
|
||||
<pc:Coords points="216,1156 1326,1156 1326,1212 216,1212"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>betur in ſacris literis, quare ter ſtulti eſtis quod inuo⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0020">
|
||||
<pc:Coords points="220,1210 1326,1210 1326,1262 220,1262"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>catis ſanctos, cum ex præce pto Dei ne mo inuocandus</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0021">
|
||||
<pc:Coords points="218,1261 1326,1261 1326,1307 218,1307"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ſit niſi ſolus Deus. Inuoca inquit me in die tribulatio⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0022">
|
||||
<pc:Coords points="222,1305 1324,1305 1324,1354 222,1354"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>nis. & eruam te, & honorificabis me. Et omnis qui⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0023">
|
||||
<pc:Coords points="221,1353 1324,1353 1324,1415 221,1415"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>cumq; inuocauerit nomen domini, ſaluus erit Sed</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0024">
|
||||
<pc:Coords points="220,1404 1321,1404 1321,1465 220,1465"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>quomodo inuocabitis, in quem non credidiſtis? Quo</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0025">
|
||||
<pc:Coords points="221,1456 1325,1456 1325,1508 221,1508"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>modo credetis ſine uerbo ? Inuocationẽ ergo in ſcrip⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0026">
|
||||
<pc:Coords points="222,1509 1323,1509 1323,1559 222,1559"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>turis non legitis cõmemorationem uero ſæpe, non ut</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0027">
|
||||
<pc:Coords points="222,1555 1330,1555 1330,1612 222,1612"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>intercedant pro uobis ſancti, ſed nt meminerit Deus</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0028">
|
||||
<pc:Coords points="219,1604 1325,1604 1325,1664 219,1664"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>Teſtamenti cum patribus ſanctis pacti, ut ſimiliter uo⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0029">
|
||||
<pc:Coords points="218,1653 1323,1653 1323,1719 218,1719"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>biſcum agat per miſericordiam, quemadmodum cum</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0030">
|
||||
<pc:Coords points="219,1704 1321,1704 1321,1769 219,1769"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ilis egit. Atq; hoc non eſt inuocare ſanctos. ſed Deum</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0031">
|
||||
<pc:Coords points="222,1758 1322,1758 1322,1817 222,1817"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ſuæ miſericordiæ & promiſsionis admonere Sic pſal</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0032">
|
||||
<pc:Coords points="224,1809 1324,1809 1324,1866 224,1866"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>mographus dicit, Qui paſcis Iſrael attende, qui de⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0033">
|
||||
<pc:Coords points="222,1858 1320,1858 1320,1913 222,1913"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ducis uelut ouem Iacob Sic & Moſes orat, Memento</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0034">
|
||||
<pc:Coords points="345,1909 1320,1909 1320,1963 345,1963"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>B 3 domi⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>phariſei hypocritæ, qui comeditis domos uiduarã ſub
|
||||
prætextu longarum precationum, propterea maiorẽ
|
||||
accipieris condemnationem. Ideo enim ꝙ non oratis
|
||||
ſecundum præſeriptum ſacræ ſcripturæ, nec ex ſpiritu
|
||||
& ueritate ſed iuxta ueſtram propriam conſtitutionẽ,
|
||||
orationes ueſtræ nõ ſiunt Deo acceptæ, neq; ab eo ex⸗
|
||||
audiunt᷑ Eſa, Cum multiplicaueritis orationes ueſtras
|
||||
non exaudiam uos. Chriſtiani uero quia orant iuxta
|
||||
tenorem ſcripturæ, & ex ſpiritu & ueritate, ideo eo⸗
|
||||
rum orationes a Deo exaudiuntur, ſuntq; illi grat iſsi⸗
|
||||
mæ, dicunt enim Pater noſter qui es iu cœlis &c. Vos
|
||||
autem hoc tenore orandi contempto, obmur muratis
|
||||
ueſtras Horas canonicas, hoc eſt, diabolicas ab Anti⸗
|
||||
chriſto inſtitutas. Paulus mauult quinq; uerba in Ec⸗
|
||||
cle ſia loqui in ſenſu, qß decem milia uerborum in lin⸗
|
||||
ua, Quibus uerbis adeo dãnat ueſtras prolixas ora⸗
|
||||
tiones, ut ſi ſemiuncia ſanæ mentis uel mica ſidei eſfet
|
||||
in uobis, eas ſine dubio omitteretis.
|
||||
De inuocatione diuorum ne apiculus quidem ha
|
||||
betur in ſacris literis, quare ter ſtulti eſtis quod inuo⸗
|
||||
catis ſanctos, cum ex præce pto Dei ne mo inuocandus
|
||||
ſit niſi ſolus Deus. Inuoca inquit me in die tribulatio⸗
|
||||
nis. & eruam te, & honorificabis me. Et omnis qui⸗
|
||||
cumq; inuocauerit nomen domini, ſaluus erit Sed
|
||||
quomodo inuocabitis, in quem non credidiſtis? Quo
|
||||
modo credetis ſine uerbo ? Inuocationẽ ergo in ſcrip⸗
|
||||
turis non legitis cõmemorationem uero ſæpe, non ut
|
||||
intercedant pro uobis ſancti, ſed nt meminerit Deus
|
||||
Teſtamenti cum patribus ſanctis pacti, ut ſimiliter uo⸗
|
||||
biſcum agat per miſericordiam, quemadmodum cum
|
||||
ilis egit. Atq; hoc non eſt inuocare ſanctos. ſed Deum
|
||||
ſuæ miſericordiæ & promiſsionis admonere Sic pſal
|
||||
mographus dicit, Qui paſcis Iſrael attende, qui de⸗
|
||||
ducis uelut ouem Iacob Sic & Moſes orat, Memento
|
||||
B 3 domi⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextRegion>
|
||||
<pc:ImageRegion id="region0000">
|
||||
<pc:Coords points="5,21 1790,21 1790,302 5,302"/>
|
||||
</pc:ImageRegion>
|
||||
<pc:ImageRegion id="region0002">
|
||||
<pc:Coords points="0,1962 1813,1962 1813,2361 0,2361"/>
|
||||
</pc:ImageRegion>
|
||||
<pc:ImageRegion id="region0003">
|
||||
<pc:Coords points="1316,166 1790,166 1790,238 1316,238"/>
|
||||
</pc:ImageRegion>
|
||||
</pc:Page>
|
||||
</pc:PcGts>
|
|
@ -4,6 +4,8 @@ import re
|
|||
import lxml.etree as ET
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
|
||||
from .. import alto_namespace, alto_text, page_namespace, page_text, text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
@ -93,6 +95,15 @@ def test_page_order():
|
|||
assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
|
||||
|
||||
|
||||
def test_page_mixed_regions():
    """Check that page_text() warns on non-text regions instead of failing.

    The fixture's ReadingOrder references ImageRegions as well as a
    TextRegion; the text of the TextRegion must still be extracted.
    """
    fixture_path = os.path.join(data_dir, 'mixed-regions.page.xml')
    page_tree = ET.parse(fixture_path)

    # A referenced region that is not a TextRegion must only emit a warning.
    with pytest.warns(UserWarning, match=r'Not a TextRegion'):
        extracted = page_text(page_tree)

    expected_line = 'non exaudiam uos. Chriſtiani uero quia orant iuxta'
    assert expected_line in extracted
|
||||
|
||||
|
||||
def test_text():
|
||||
assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
|
||||
assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue