diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index a65e03a..aac743e 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,5 +1,7 @@ from __future__ import division, print_function +from warnings import warn + from lxml import etree as ET import sys @@ -68,7 +70,7 @@ def page_text(tree): if region is not None: region_texts.append(region_text(region)) else: - raise ValueError('Invalid region id "%s" in file' % region_id) + warn('Not a TextRegion: "%s"' % region_id) else: raise NotImplementedError else: diff --git a/qurator/dinglehopper/tests/data/mixed-regions.page.xml b/qurator/dinglehopper/tests/data/mixed-regions.page.xml new file mode 100644 index 0000000..0e2a117 --- /dev/null +++ b/qurator/dinglehopper/tests/data/mixed-regions.page.xml @@ -0,0 +1,290 @@ + + + + OCR-D/core 1.0.0b19 + 2019-09-26T11:59:19.519140 + 2019-09-26T11:59:19.519140 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + phariſei hypocritæ, qui comeditis domos uiduarã ſub + + + + + + prætextu longarum precationum, propterea maiorẽ + + + + + + accipieris condemnationem. Ideo enim ꝙ non oratis + + + + + + ſecundum præſeriptum ſacræ ſcripturæ, nec ex ſpiritu + + + + + + & ueritate ſed iuxta ueſtram propriam conſtitutionẽ, + + + + + + orationes ueſtræ nõ ſiunt Deo acceptæ, neq; ab eo ex⸗ + + + + + + audiunt᷑ Eſa, Cum multiplicaueritis orationes ueſtras + + + + + + non exaudiam uos. Chriſtiani uero quia orant iuxta + + + + + + tenorem ſcripturæ, & ex ſpiritu & ueritate, ideo eo⸗ + + + + + + rum orationes a Deo exaudiuntur, ſuntq; illi grat iſsi⸗ + + + + + + mæ, dicunt enim Pater noſter qui es iu cœlis &c. Vos + + + + + + autem hoc tenore orandi contempto, obmur muratis + + + + + + ueſtras Horas canonicas, hoc eſt, diabolicas ab Anti⸗ + + + + + + chriſto inſtitutas. Paulus mauult quinq; uerba in Ec⸗ + + + + + + cle ſia loqui in ſenſu, qß decem milia uerborum in lin⸗ + + + + + + ua, Quibus uerbis adeo dãnat ueſtras prolixas ora⸗ + + + + + + tiones, ut ſi ſemiuncia ſanæ mentis uel mica ſidei eſfet + + + + + + in uobis, eas ſine dubio omitteretis. + + + + + + De inuocatione diuorum ne apiculus quidem ha + + + + + + betur in ſacris literis, quare ter ſtulti eſtis quod inuo⸗ + + + + + + catis ſanctos, cum ex præce pto Dei ne mo inuocandus + + + + + + ſit niſi ſolus Deus. Inuoca inquit me in die tribulatio⸗ + + + + + + nis. & eruam te, & honorificabis me. Et omnis qui⸗ + + + + + + cumq; inuocauerit nomen domini, ſaluus erit Sed + + + + + + quomodo inuocabitis, in quem non credidiſtis? Quo + + + + + + modo credetis ſine uerbo ? Inuocationẽ ergo in ſcrip⸗ + + + + + + turis non legitis cõmemorationem uero ſæpe, non ut + + + + + + intercedant pro uobis ſancti, ſed nt meminerit Deus + + + + + + Teſtamenti cum patribus ſanctis pacti, ut ſimiliter uo⸗ + + + + + + biſcum agat per miſericordiam, quemadmodum cum + + + + + + ilis egit. Atq; hoc non eſt inuocare ſanctos. ſed Deum + + + + + + ſuæ miſericordiæ & promiſsionis admonere Sic pſal + + + + + + mographus dicit, Qui paſcis Iſrael attende, qui de⸗ + + + + + + ducis uelut ouem Iacob Sic & Moſes orat, Memento + + + + + + B 3 domi⸗ + + + + phariſei hypocritæ, qui comeditis domos uiduarã ſub +prætextu longarum precationum, propterea maiorẽ +accipieris condemnationem. Ideo enim ꝙ non oratis +ſecundum præſeriptum ſacræ ſcripturæ, nec ex ſpiritu +& ueritate ſed iuxta ueſtram propriam conſtitutionẽ, +orationes ueſtræ nõ ſiunt Deo acceptæ, neq; ab eo ex⸗ +audiunt᷑ Eſa, Cum multiplicaueritis orationes ueſtras +non exaudiam uos. Chriſtiani uero quia orant iuxta +tenorem ſcripturæ, & ex ſpiritu & ueritate, ideo eo⸗ +rum orationes a Deo exaudiuntur, ſuntq; illi grat iſsi⸗ +mæ, dicunt enim Pater noſter qui es iu cœlis &c. Vos +autem hoc tenore orandi contempto, obmur muratis +ueſtras Horas canonicas, hoc eſt, diabolicas ab Anti⸗ +chriſto inſtitutas. Paulus mauult quinq; uerba in Ec⸗ +cle ſia loqui in ſenſu, qß decem milia uerborum in lin⸗ +ua, Quibus uerbis adeo dãnat ueſtras prolixas ora⸗ +tiones, ut ſi ſemiuncia ſanæ mentis uel mica ſidei eſfet +in uobis, eas ſine dubio omitteretis. +De inuocatione diuorum ne apiculus quidem ha +betur in ſacris literis, quare ter ſtulti eſtis quod inuo⸗ +catis ſanctos, cum ex præce pto Dei ne mo inuocandus +ſit niſi ſolus Deus. Inuoca inquit me in die tribulatio⸗ +nis. & eruam te, & honorificabis me. Et omnis qui⸗ +cumq; inuocauerit nomen domini, ſaluus erit Sed +quomodo inuocabitis, in quem non credidiſtis? Quo +modo credetis ſine uerbo ? Inuocationẽ ergo in ſcrip⸗ +turis non legitis cõmemorationem uero ſæpe, non ut +intercedant pro uobis ſancti, ſed nt meminerit Deus +Teſtamenti cum patribus ſanctis pacti, ut ſimiliter uo⸗ +biſcum agat per miſericordiam, quemadmodum cum +ilis egit. Atq; hoc non eſt inuocare ſanctos. ſed Deum +ſuæ miſericordiæ & promiſsionis admonere Sic pſal +mographus dicit, Qui paſcis Iſrael attende, qui de⸗ +ducis uelut ouem Iacob Sic & Moſes orat, Memento +B 3 domi⸗ + + + + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py index 694d548..dd9377a 100644 --- a/qurator/dinglehopper/tests/test_ocr_files.py +++ b/qurator/dinglehopper/tests/test_ocr_files.py @@ -4,6 +4,8 @@ import re import lxml.etree as ET import textwrap +import pytest + from .. import alto_namespace, alto_text, page_namespace, page_text, text data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @@ -93,6 +95,15 @@ def test_page_order(): assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL) +def test_page_mixed_regions(): + # This file contains ImageRegions and TextRegions in the ReadingOrder + tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml')) + with pytest.warns(UserWarning, match=r'Not a TextRegion'): + result = page_text(tree) + + assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result + + def test_text(): assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))