From c3eefbb1e89eb9e6ad9bf57dc4d897dd7d3c9578 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 18:29:11 +0200 Subject: [PATCH 01/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 extracted_text.py diff --git a/extracted_text.py b/extracted_text.py new file mode 100644 index 0000000..d1dc6f0 --- /dev/null +++ b/extracted_text.py @@ -0,0 +1,42 @@ +import attr + + +@attr.s(frozen=True) +class ExtractedText: + segments = attr.ib() + joiner = attr.ib(type=str) + # XXX Use type annotations for attr types when support for Python 3.5 is dropped + # XXX Also I think these are not validated? + + @property + def text(self): + return self.joiner.join(s.text for s in self.segments) + + def segment_id_for_pos(self, pos): + i = 0 + for s in self.segments: + if i <= pos < i + len(s.text): + return s.id + i += len(s.text) + if i <= pos < i + len(self.joiner): + return None + i += len(self.joiner) + + +@attr.s(frozen=True) +class ExtractedTextSegment: + id = attr.ib(type=str) + text = attr.ib(type=str) + + +test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') +], ' ') + + +assert test1.text == 'foo bar bazinga' +assert test1.segment_id_for_pos(0) == 's0' +assert test1.segment_id_for_pos(3) == None +assert test1.segment_id_for_pos(10) == 's2' From 475aa65e988b8640af64a0e9dd46c4f582af9162 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 18:30:34 +0200 Subject: [PATCH 02/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + 1 file 
changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 063bac4..de6547b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ uniseg numpy colorama ocrd >= 1.0.0b15 +attrs From 7f5789567fc056558a22044969253c96d85b5059 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:36:49 +0200 Subject: [PATCH 03/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/extracted_text.py b/extracted_text.py index d1dc6f0..f99c8ac 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -1,4 +1,5 @@ import attr +import unicodedata @attr.s(frozen=True) @@ -23,10 +24,25 @@ class ExtractedText: i += len(self.joiner) +NORM_NFC = 0 + + +def normalize(text, normalization): + if normalization == NORM_NFC: + return unicodedata.normalize('NFC', text) + else: + raise ValueError() + + @attr.s(frozen=True) class ExtractedTextSegment: id = attr.ib(type=str) text = attr.ib(type=str) + @text.validator + def check(self, attribute, value): + if normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + normalization = attr.ib(default=NORM_NFC) test1 = ExtractedText([ @@ -40,3 +56,6 @@ assert test1.text == 'foo bar bazinga' assert test1.segment_id_for_pos(0) == 's0' assert test1.segment_id_for_pos(3) == None assert test1.segment_id_for_pos(10) == 's2' + +# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) +ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 93608ba69735ac6780cfe79ec26477b848350a35 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:40:57 +0200 Subject: [PATCH 04/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 16 ---------------- extracted_text_test.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 extracted_text_test.py diff --git a/extracted_text.py b/extracted_text.py index f99c8ac..a76f402 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -43,19 +43,3 @@ class ExtractedTextSegment: if normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) normalization = attr.ib(default=NORM_NFC) - - -test1 = ExtractedText([ - ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), - ExtractedTextSegment('s2', 'bazinga') -], ' ') - - -assert test1.text == 'foo bar bazinga' -assert test1.segment_id_for_pos(0) == 's0' -assert test1.segment_id_for_pos(3) == None -assert test1.segment_id_for_pos(10) == 's2' - -# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) -ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) diff --git a/extracted_text_test.py b/extracted_text_test.py new file mode 100644 index 0000000..29fabfe --- /dev/null +++ b/extracted_text_test.py @@ -0,0 +1,19 @@ +from extracted_text import * + +def test_text(): + test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') + ], ' ') + + + assert test1.text == 'foo bar bazinga' + assert test1.segment_id_for_pos(0) == 's0' + assert test1.segment_id_for_pos(3) is None + assert test1.segment_id_for_pos(10) == 's2' + +# TODO handle grapheme cluster positions? 
+ +# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) +ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 8e3a19d7e9b44d0d103604f7d2ddbf4da380714c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:49:12 +0200 Subject: [PATCH 05/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 3 +++ extracted_text_test.py | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index a76f402..69d836b 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -2,6 +2,9 @@ import attr import unicodedata +# TODO handle grapheme cluster positions? + + @attr.s(frozen=True) class ExtractedText: segments = attr.ib() diff --git a/extracted_text_test.py b/extracted_text_test.py index 29fabfe..b302ca8 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -13,7 +13,5 @@ def test_text(): assert test1.segment_id_for_pos(3) is None assert test1.segment_id_for_pos(10) == 's2' -# TODO handle grapheme cluster positions? 
- # ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 91371971ebc65787c0e4a1455e69240e04cf3e05 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 20:29:01 +0200 Subject: [PATCH 06/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 9 ++++++--- extracted_text_test.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index 69d836b..23cd519 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -1,5 +1,6 @@ import attr import unicodedata +import enum # TODO handle grapheme cluster positions? @@ -27,11 +28,13 @@ class ExtractedText: i += len(self.joiner) -NORM_NFC = 0 +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 def normalize(text, normalization): - if normalization == NORM_NFC: + if normalization == Normalization.NFC: return unicodedata.normalize('NFC', text) else: raise ValueError() @@ -45,4 +48,4 @@ class ExtractedTextSegment: def check(self, attribute, value): if normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(default=NORM_NFC) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC) diff --git a/extracted_text_test.py b/extracted_text_test.py index b302ca8..2e916cd 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -1,17 +1,22 @@ -from extracted_text import * +import unicodedata +import pytest +from extracted_text import ExtractedText, ExtractedTextSegment + def test_text(): test1 = ExtractedText([ ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment(1, 'bar'), ExtractedTextSegment('s2', 'bazinga') ], ' ') - assert test1.text == 'foo bar 
bazinga' assert test1.segment_id_for_pos(0) == 's0' assert test1.segment_id_for_pos(3) is None assert test1.segment_id_for_pos(10) == 's2' -# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) -ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) + +def test_normalization_check(): + with pytest.raises(ValueError, match=r'.*is not normalized.*'): + ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) + assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From eca8cbc81e803a2ed5ce7aa407f4237330f92eb1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 20:31:54 +0200 Subject: [PATCH 07/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 4 ++-- extracted_text_test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index 23cd519..c84c77b 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -4,14 +4,14 @@ import enum # TODO handle grapheme cluster positions? +# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped +# TODO types are not validated (attr does not do this yet) @attr.s(frozen=True) class ExtractedText: segments = attr.ib() joiner = attr.ib(type=str) - # XXX Use type annotations for attr types when support for Python 3.5 is dropped - # XXX Also I think these are not validated? 
@property def text(self): diff --git a/extracted_text_test.py b/extracted_text_test.py index 2e916cd..4919a76 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -6,7 +6,7 @@ from extracted_text import ExtractedText, ExtractedTextSegment def test_text(): test1 = ExtractedText([ ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment(1, 'bar'), + ExtractedTextSegment('s1', 'bar'), ExtractedTextSegment('s2', 'bazinga') ], ' ') From a02e7dcbce9013a1ba6b376dfe0acbb58929c76d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 18:29:11 +0200 Subject: [PATCH 08/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 extracted_text.py diff --git a/extracted_text.py b/extracted_text.py new file mode 100644 index 0000000..d1dc6f0 --- /dev/null +++ b/extracted_text.py @@ -0,0 +1,42 @@ +import attr + + +@attr.s(frozen=True) +class ExtractedText: + segments = attr.ib() + joiner = attr.ib(type=str) + # XXX Use type annotations for attr types when support for Python 3.5 is dropped + # XXX Also I think these are not validated? 
+ + @property + def text(self): + return self.joiner.join(s.text for s in self.segments) + + def segment_id_for_pos(self, pos): + i = 0 + for s in self.segments: + if i <= pos < i + len(s.text): + return s.id + i += len(s.text) + if i <= pos < i + len(self.joiner): + return None + i += len(self.joiner) + + +@attr.s(frozen=True) +class ExtractedTextSegment: + id = attr.ib(type=str) + text = attr.ib(type=str) + + +test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') +], ' ') + + +assert test1.text == 'foo bar bazinga' +assert test1.segment_id_for_pos(0) == 's0' +assert test1.segment_id_for_pos(3) == None +assert test1.segment_id_for_pos(10) == 's2' From f6a880860f977aa2343e60c2389b88acb272ccff Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 18:30:34 +0200 Subject: [PATCH 09/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a275fc7..846990b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ numpy colorama MarkupSafe ocrd >= 1.0.0b15 +attrs From ac1e1ec79aa45a2a94d1fc6318d1ea35b8f91e15 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:36:49 +0200 Subject: [PATCH 10/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/extracted_text.py b/extracted_text.py index d1dc6f0..f99c8ac 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -1,4 +1,5 @@ import attr +import unicodedata @attr.s(frozen=True) @@ -23,10 +24,25 @@ class 
ExtractedText: i += len(self.joiner) +NORM_NFC = 0 + + +def normalize(text, normalization): + if normalization == NORM_NFC: + return unicodedata.normalize('NFC', text) + else: + raise ValueError() + + @attr.s(frozen=True) class ExtractedTextSegment: id = attr.ib(type=str) text = attr.ib(type=str) + @text.validator + def check(self, attribute, value): + if normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + normalization = attr.ib(default=NORM_NFC) test1 = ExtractedText([ @@ -40,3 +56,6 @@ assert test1.text == 'foo bar bazinga' assert test1.segment_id_for_pos(0) == 's0' assert test1.segment_id_for_pos(3) == None assert test1.segment_id_for_pos(10) == 's2' + +# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) +ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 4e9b0aeef1c911928448fda9f122d9a44c16fcb7 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:40:57 +0200 Subject: [PATCH 11/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 16 ---------------- extracted_text_test.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 extracted_text_test.py diff --git a/extracted_text.py b/extracted_text.py index f99c8ac..a76f402 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -43,19 +43,3 @@ class ExtractedTextSegment: if normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) normalization = attr.ib(default=NORM_NFC) - - -test1 = ExtractedText([ - ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), - ExtractedTextSegment('s2', 'bazinga') -], ' ') - - -assert test1.text == 'foo bar bazinga' -assert test1.segment_id_for_pos(0) == 's0' -assert 
test1.segment_id_for_pos(3) == None -assert test1.segment_id_for_pos(10) == 's2' - -# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) -ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) diff --git a/extracted_text_test.py b/extracted_text_test.py new file mode 100644 index 0000000..29fabfe --- /dev/null +++ b/extracted_text_test.py @@ -0,0 +1,19 @@ +from extracted_text import * + +def test_text(): + test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') + ], ' ') + + + assert test1.text == 'foo bar bazinga' + assert test1.segment_id_for_pos(0) == 's0' + assert test1.segment_id_for_pos(3) is None + assert test1.segment_id_for_pos(10) == 's2' + +# TODO handle grapheme cluster positions? + +# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) +ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 96273b026da2c56a9e725b7803c5913b545a578f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:49:12 +0200 Subject: [PATCH 12/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 3 +++ extracted_text_test.py | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index a76f402..69d836b 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -2,6 +2,9 @@ import attr import unicodedata +# TODO handle grapheme cluster positions? 
+ + @attr.s(frozen=True) class ExtractedText: segments = attr.ib() diff --git a/extracted_text_test.py b/extracted_text_test.py index 29fabfe..b302ca8 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -13,7 +13,5 @@ def test_text(): assert test1.segment_id_for_pos(3) is None assert test1.segment_id_for_pos(10) == 's2' -# TODO handle grapheme cluster positions? - # ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 5a5e3c824b3683bc4808ddfc7f2fa2d302f0f8ef Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 20:29:01 +0200 Subject: [PATCH 13/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 9 ++++++--- extracted_text_test.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index 69d836b..23cd519 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -1,5 +1,6 @@ import attr import unicodedata +import enum # TODO handle grapheme cluster positions? 
@@ -27,11 +28,13 @@ class ExtractedText: i += len(self.joiner) -NORM_NFC = 0 +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 def normalize(text, normalization): - if normalization == NORM_NFC: + if normalization == Normalization.NFC: return unicodedata.normalize('NFC', text) else: raise ValueError() @@ -45,4 +48,4 @@ class ExtractedTextSegment: def check(self, attribute, value): if normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(default=NORM_NFC) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC) diff --git a/extracted_text_test.py b/extracted_text_test.py index b302ca8..2e916cd 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -1,17 +1,22 @@ -from extracted_text import * +import unicodedata +import pytest +from extracted_text import ExtractedText, ExtractedTextSegment + def test_text(): test1 = ExtractedText([ ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment(1, 'bar'), ExtractedTextSegment('s2', 'bazinga') ], ' ') - assert test1.text == 'foo bar bazinga' assert test1.segment_id_for_pos(0) == 's0' assert test1.segment_id_for_pos(3) is None assert test1.segment_id_for_pos(10) == 's2' -# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) -ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) + +def test_normalization_check(): + with pytest.raises(ValueError, match=r'.*is not normalized.*'): + ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) + assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 354afdc0b221faf60ff5cf0e0f2fa515827512f6 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 20:31:54 +0200 Subject: [PATCH 14/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- extracted_text.py | 4 ++-- extracted_text_test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index 23cd519..c84c77b 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -4,14 +4,14 @@ import enum # TODO handle grapheme cluster positions? +# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped +# TODO types are not validated (attr does not do this yet) @attr.s(frozen=True) class ExtractedText: segments = attr.ib() joiner = attr.ib(type=str) - # XXX Use type annotations for attr types when support for Python 3.5 is dropped - # XXX Also I think these are not validated? @property def text(self): diff --git a/extracted_text_test.py b/extracted_text_test.py index 2e916cd..4919a76 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -6,7 +6,7 @@ from extracted_text import ExtractedText, ExtractedTextSegment def test_text(): test1 = ExtractedText([ ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment(1, 'bar'), + ExtractedTextSegment('s1', 'bar'), ExtractedTextSegment('s2', 'bazinga') ], ' ') From 1d553bb4e3bf5cb6b8dd1ebb21f576a810d151ef Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 13:04:36 +0200 Subject: [PATCH 15/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20=20Test=20?= =?UTF-8?q?aligning=20by=20character=20while=20retaining=20segment=20id=20?= =?UTF-8?q?info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text_test.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/extracted_text_test.py b/extracted_text_test.py index 4919a76..890b045 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -1,6 +1,8 @@ import unicodedata import pytest from extracted_text import ExtractedText, ExtractedTextSegment +from uniseg.graphemecluster import grapheme_clusters +from qurator.dinglehopper import 
seq_align def test_text(): @@ -20,3 +22,33 @@ def test_normalization_check(): with pytest.raises(ValueError, match=r'.*is not normalized.*'): ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) + + +def test_align(): + """ + Test aligning by character while retaining segment id info + + The difficulty here is that aligning should work on grapheme clusters, + not Python characters. + """ + + test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') + ], ' ') + test2 = ExtractedText([ + ExtractedTextSegment('x0', 'foo'), + ExtractedTextSegment('x1', 'bar'), + ExtractedTextSegment('x2', '.'), # extra . + ExtractedTextSegment('x2', 'bazim̃ga'), # different grapheme cluster, m̃ also is two Python characters + ], ' ') + + left_pos = 0; right_pos = 0 + for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): + print(left, right, test1.segment_id_for_pos(left_pos), test2.segment_id_for_pos(right_pos)) + if left is not None: + left_pos += len(left) + if right is not None: + right_pos += len(right) + assert False From 98f6c68df7ad0afeb55af877564789fe06e2747d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 13:54:46 +0200 Subject: [PATCH 16/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20=20Test=20?= =?UTF-8?q?aligning=20by=20character=20while=20retaining=20segment=20id=20?= =?UTF-8?q?info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text_test.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/extracted_text_test.py b/extracted_text_test.py index 890b045..60785b7 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -3,6 +3,7 @@ import pytest from extracted_text import ExtractedText, ExtractedTextSegment from uniseg.graphemecluster import 
grapheme_clusters from qurator.dinglehopper import seq_align +from collections import namedtuple def test_text(): @@ -24,6 +25,9 @@ def test_normalization_check(): assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) +AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id') + + def test_align(): """ Test aligning by character while retaining segment id info @@ -35,20 +39,30 @@ def test_align(): test1 = ExtractedText([ ExtractedTextSegment('s0', 'foo'), ExtractedTextSegment('s1', 'bar'), - ExtractedTextSegment('s2', 'bazinga') + ExtractedTextSegment('s2', 'batzinga') ], ' ') test2 = ExtractedText([ ExtractedTextSegment('x0', 'foo'), ExtractedTextSegment('x1', 'bar'), ExtractedTextSegment('x2', '.'), # extra . - ExtractedTextSegment('x2', 'bazim̃ga'), # different grapheme cluster, m̃ also is two Python characters + ExtractedTextSegment('x3', 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters ], ' ') - left_pos = 0; right_pos = 0 + left_pos = 0; right_pos = 0; alignment = [] for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): - print(left, right, test1.segment_id_for_pos(left_pos), test2.segment_id_for_pos(right_pos)) + left_id = test1.segment_id_for_pos(left_pos) if left is not None else None + right_id = test2.segment_id_for_pos(right_pos) if right is not None else None + el = AlignmentElement(left, right, left_id, right_id) + alignment.append(el) if left is not None: left_pos += len(left) if right is not None: right_pos += len(right) - assert False + + print('test1: {}'.format(test1.text)) + print('test2: {}'.format(test2.text)) + + assert alignment[0] == ('f', 'f', 's0', 'x0') + assert alignment[8] == (None, '.', None, 'x2') + assert alignment[12] == ('t', None, 's2', None) + assert alignment[15] == ('n', 'm̃', 's2', 'x3') From 1b9497dfb0a223c7679f63aff1e912513c313215 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 14:50:32 
+0200 Subject: [PATCH 17/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Test=20ali?= =?UTF-8?q?gning=20by=20character=20while=20retaining=20segment=20id=20inf?= =?UTF-8?q?o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extracted_text_test.py b/extracted_text_test.py index 60785b7..9eb57b6 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -52,6 +52,8 @@ def test_align(): for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): left_id = test1.segment_id_for_pos(left_pos) if left is not None else None right_id = test2.segment_id_for_pos(right_pos) if right is not None else None + # XXX note that deletions and inserts only produce one id + None, UI must + # support this, i.e. display for the one id produced el = AlignmentElement(left, right, left_id, right_id) alignment.append(el) if left is not None: From 278e52868fb9b5601b0b18de68d806f33c6ff172 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 14:54:50 +0200 Subject: [PATCH 18/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Test=20ali?= =?UTF-8?q?gning=20by=20character=20while=20retaining=20segment=20id=20inf?= =?UTF-8?q?o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/extracted_text.py b/extracted_text.py index c84c77b..b37f341 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -3,7 +3,6 @@ import unicodedata import enum -# TODO handle grapheme cluster positions? 
# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped # TODO types are not validated (attr does not do this yet) From 5b353a22329f01678529e6ddc4f390be01e75a12 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 14:56:23 +0200 Subject: [PATCH 19/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Test=20ali?= =?UTF-8?q?gning=20by=20character=20while=20retaining=20segment=20id=20inf?= =?UTF-8?q?o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extracted_text_test.py b/extracted_text_test.py index 9eb57b6..b84df87 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -54,6 +54,7 @@ def test_align(): right_id = test2.segment_id_for_pos(right_pos) if right is not None else None # XXX note that deletions and inserts only produce one id + None, UI must # support this, i.e. display for the one id produced + # XXX otherwise, it should always display for BOTH ids el = AlignmentElement(left, right, left_id, right_id) alignment.append(el) if left is not None: From 5dbf563d6a510398ca370f809495915da274195e Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 15:35:52 +0200 Subject: [PATCH 20/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Extract=20?= =?UTF-8?q?text=20while=20retaining=20segment=20id=20info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 50 ---------------------- extracted_text_test.py | 2 +- qurator/dinglehopper/ocr_files.py | 71 ++++++++++++++++++++++++++++--- 3 files changed, 65 insertions(+), 58 deletions(-) delete mode 100644 extracted_text.py diff --git a/extracted_text.py b/extracted_text.py deleted file mode 100644 index b37f341..0000000 --- a/extracted_text.py +++ /dev/null @@ -1,50 +0,0 @@ -import attr -import unicodedata -import enum - - -# TODO Use type annotations for attr.ib types when support 
for Python 3.5 is dropped -# TODO types are not validated (attr does not do this yet) - - -@attr.s(frozen=True) -class ExtractedText: - segments = attr.ib() - joiner = attr.ib(type=str) - - @property - def text(self): - return self.joiner.join(s.text for s in self.segments) - - def segment_id_for_pos(self, pos): - i = 0 - for s in self.segments: - if i <= pos < i + len(s.text): - return s.id - i += len(s.text) - if i <= pos < i + len(self.joiner): - return None - i += len(self.joiner) - - -class Normalization(enum.Enum): - NFC = 1 - NFC_MUFI = 2 - - -def normalize(text, normalization): - if normalization == Normalization.NFC: - return unicodedata.normalize('NFC', text) - else: - raise ValueError() - - -@attr.s(frozen=True) -class ExtractedTextSegment: - id = attr.ib(type=str) - text = attr.ib(type=str) - @text.validator - def check(self, attribute, value): - if normalize(value, self.normalization) != value: - raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(converter=Normalization, default=Normalization.NFC) diff --git a/extracted_text_test.py b/extracted_text_test.py index b84df87..82c3a53 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -1,6 +1,6 @@ import unicodedata import pytest -from extracted_text import ExtractedText, ExtractedTextSegment +from qurator.dinglehopper import ExtractedText, ExtractedTextSegment from uniseg.graphemecluster import grapheme_clusters from qurator.dinglehopper import seq_align from collections import namedtuple diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index b57a047..7d06dbe 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -3,9 +3,57 @@ from __future__ import division, print_function from warnings import warn from lxml import etree as ET +from lxml.etree import XMLSyntaxError import sys +import attr +import enum +import unicodedata + + +@attr.s(frozen=True) +class ExtractedText: + segments = 
attr.ib() + joiner = attr.ib(type=str) + # TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped + # TODO Types are not validated (attr does not do this yet) + + @property + def text(self): + return self.joiner.join(s.text for s in self.segments) + + def segment_id_for_pos(self, pos): + i = 0 + for s in self.segments: + if i <= pos < i + len(s.text): + return s.id + i += len(s.text) + if i <= pos < i + len(self.joiner): + return None + i += len(self.joiner) + # XXX Cache results + + +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 + + +def normalize(text, normalization): + if normalization == Normalization.NFC: + return unicodedata.normalize('NFC', text) + else: + raise ValueError() -from lxml.etree import XMLSyntaxError + +@attr.s(frozen=True) +class ExtractedTextSegment: + id = attr.ib(type=str) + text = attr.ib(type=str) + @text.validator + def check(self, attribute, value): + if normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC) def alto_namespace(tree): @@ -21,7 +69,7 @@ def alto_namespace(tree): raise ValueError('Not an ALTO tree') -def alto_text(tree): +def alto_extract(tree): """Extract text from the given ALTO ElementTree.""" nsmap = {'alto': alto_namespace(tree)} @@ -29,9 +77,15 @@ def alto_text(tree): lines = ( ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap)) - text_ = '\n'.join(lines) - return text_ + return ExtractedText((ExtractedTextSegment(None, line_text) for line_text in lines), '\n') + # TODO This currently does not extract any segment id, because we are + # clueless about the ALTO format. 
+ # FIXME needs to handle normalization + + +def alto_text(tree): + return alto_extract(tree).text def page_namespace(tree): @@ -47,7 +101,7 @@ def page_namespace(tree): raise ValueError('Not a PAGE tree') -def page_text(tree): +def page_extract(tree): """Extract text from the given PAGE content ElementTree.""" nsmap = {'page': page_namespace(tree)} @@ -80,10 +134,13 @@ def page_text(tree): # XXX Does a file have to have regions etc.? region vs lines etc. # Filter empty region texts region_texts = (t for t in region_texts if t) + return ExtractedText((ExtractedTextSegment(None, region_text) for region_text in region_texts), '\n') + # TODO This currently does not extract any segment id + # FIXME needs to handle normalization - text_ = '\n'.join(region_texts) - return text_ +def page_text(tree): + return page_extract(tree).text def text(filename): From a09c1eae7e4ff40ad6dd93791818b901f0fa1f7d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 15:37:34 +0200 Subject: [PATCH 21/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Extract=20?= =?UTF-8?q?text=20while=20retaining=20segment=20id=20info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../dinglehopper/tests/extracted_text_test.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename extracted_text_test.py => qurator/dinglehopper/tests/extracted_text_test.py (100%) diff --git a/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py similarity index 100% rename from extracted_text_test.py rename to qurator/dinglehopper/tests/extracted_text_test.py From 6d0db229fa531245c4673202c14d2a5dd04aa82d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 16:54:48 +0200 Subject: [PATCH 22/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Extract=20?= =?UTF-8?q?text=20while=20retaining=20segment=20id=20info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
qurator/dinglehopper/ocr_files.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 7d06dbe..a5187c5 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -4,6 +4,7 @@ from warnings import warn from lxml import etree as ET from lxml.etree import XMLSyntaxError +from contextlib import suppress import sys import attr import enum @@ -51,10 +52,20 @@ class ExtractedTextSegment: text = attr.ib(type=str) @text.validator def check(self, attribute, value): - if normalize(value, self.normalization) != value: + if value is not None and normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) normalization = attr.ib(converter=Normalization, default=Normalization.NFC) + @classmethod + def from_text_segment(cls, text_segment, nsmap): + """Build an ExtractedTextSegment from a PAGE content text element""" + + segment_id = text_segment.attrib['id'] + segment_text = None + with suppress(AttributeError): + segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + return cls(segment_id, segment_text) + def alto_namespace(tree): """Return the ALTO namespace used in the given ElementTree. 
@@ -106,13 +117,7 @@ def page_extract(tree): nsmap = {'page': page_namespace(tree)} - def region_text(region): - try: - return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text - except AttributeError: - return None - - region_texts = [] + regions = [] reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap) if reading_order is not None: for group in reading_order.iterfind('./*', namespaces=nsmap): @@ -122,20 +127,20 @@ def page_extract(tree): region_id = region_ref_indexed.attrib['regionRef'] region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) if region is not None: - region_texts.append(region_text(region)) + regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) else: warn('Not a TextRegion: "%s"' % region_id) else: raise NotImplementedError else: for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): - region_texts.append(region_text(region)) + regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) # XXX Does a file have to have regions etc.? region vs lines etc. 
# Filter empty region texts - region_texts = (t for t in region_texts if t) - return ExtractedText((ExtractedTextSegment(None, region_text) for region_text in region_texts), '\n') - # TODO This currently does not extract any segment id + regions = (r for r in regions if r.text is not None) + + return ExtractedText(regions, '\n') # FIXME needs to handle normalization From bc1002b1e69f32893f5d1ecaf6d0f1ee3a5e9acb Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 17:43:30 +0200 Subject: [PATCH 23/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Extract=20?= =?UTF-8?q?text=20while=20retaining=20segment=20id=20info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index a5187c5..fd89b03 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -148,21 +148,35 @@ def page_text(tree): return page_extract(tree).text -def text(filename): - """Read the text from the given file. +def plain_extract(filename): + with open(filename, 'r') as f: + return ExtractedText( + (ExtractedTextSegment('line %d' % no, line) for no, line in enumerate(f.readlines())), + '\n' + ) + + +def plain_text(filename): + return plain_extract(filename).text + + +def extract(filename): + """Extract the text from the given file. Supports PAGE, ALTO and falls back to plain text. 
""" - try: tree = ET.parse(filename) except XMLSyntaxError: - with open(filename, 'r') as f: - return f.read() + return plain_extract(filename) try: - return page_text(tree) + return page_extract(tree) except ValueError: - return alto_text(tree) + return alto_extract(tree) + + +def text(filename): + return extract(filename).text if __name__ == '__main__': From a448133394075adcdc7ae5e40295ecbb82d00023 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 13:25:35 +0200 Subject: [PATCH 24/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Display=20?= =?UTF-8?q?segment=20id=20when=20hovering=20over=20a=20character=20differe?= =?UTF-8?q?nce?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/character_error_rate.py | 4 ++ qurator/dinglehopper/cli.py | 64 ++++++++++++++----- qurator/dinglehopper/edit_distance.py | 7 ++ qurator/dinglehopper/ocr_files.py | 2 +- qurator/dinglehopper/templates/report.html.j2 | 11 ++++ qurator/dinglehopper/templates/report.html.js | 13 ++-- qurator/dinglehopper/word_error_rate.py | 15 +++++ 7 files changed, 96 insertions(+), 20 deletions(-) diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 05cc931..e99f391 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -15,6 +15,10 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]: :return: character error rate and length of the reference """ d = distance(reference, compared) + # XXX + from .cli import ExtractedText + if isinstance(reference, ExtractedText): + reference = reference.text n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) if d == 0: diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 63bfd92..8e18b26 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -8,11 +8,11 @@ from markupsafe import escape from 
qurator.dinglehopper import * -def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): +def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): gtx = '' ocrx = '' - def format_thing(t, css_classes=None): + def format_thing(t, css_classes=None, id_=None): if t is None: html_t = none css_classes += ' ellipsis' @@ -21,19 +21,52 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): else: html_t = escape(t) + html_custom_attrs = "" + # XXX must sanitize id_ or do we trust the XML? + if id_: + html_custom_attrs = 'data-segment-id="{}"'.format(id_) + if css_classes: - return '{html_t}'.format(css_classes=css_classes, html_t=html_t) + return '{html_t}'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs) else: return '{html_t}'.format(html_t=html_t) - for k, (g, o) in enumerate(align(gt_things, ocr_things)): - if g == o: - css_classes = None - else: + if isinstance(gt_in, ExtractedText): + print(gt_in.text) + if not isinstance(ocr_in, ExtractedText): + raise TypeError() + # XXX splitting should be done in ExtractedText + gt_things = list(grapheme_clusters(gt_in.text)) + ocr_things = list(grapheme_clusters(ocr_in.text)) + else: + gt_things = gt_in + ocr_things = ocr_in + + + + g_pos = 0 + o_pos = 0 + for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)): + css_classes = None + gt_id = None + ocr_id = None + if g != o: css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k) + if isinstance(gt_in, ExtractedText): + gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None + ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None + # XXX note that deletions and inserts only produce one id + None, UI must + # support this, i.e. 
display for the one id produced + # XXX otherwise, it should always display for BOTH ids + + gtx += joiner + format_thing(g, css_classes, gt_id) + ocrx += joiner + format_thing(o, css_classes, ocr_id) + + if g is not None: + g_pos += len(g) + if o is not None: + o_pos += len(o) - gtx += joiner + format_thing(g, css_classes) - ocrx += joiner + format_thing(o, css_classes) return \ ''' @@ -51,20 +84,21 @@ def process(gt, ocr, report_prefix, *, metrics=True): Click on a wrapper. """ - gt_text = text(gt) - ocr_text = text(ocr) + gt_text = extract(gt) + ocr_text = extract(ocr) - gt_text = substitute_equivalences(gt_text) - ocr_text = substitute_equivalences(ocr_text) + # FIXME + #gt_text = substitute_equivalences(gt_text) + #ocr_text = substitute_equivalences(ocr_text) cer, n_characters = character_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text) - char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align) + char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·') gt_words = words_normalized(gt_text) ocr_words = words_normalized(ocr_text) - word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align) + word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯') def json_float(value): """Convert a float value to an JSON float. diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 8ca24d3..dc1cb24 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -8,6 +8,7 @@ import numpy as np from uniseg.graphemecluster import grapheme_clusters + def levenshtein_matrix(seq1: Sequence, seq2: Sequence): """Compute the matrix commonly computed to produce the Levenshtein distance. This is also known as the Wagner-Fischer algorithm. 
The matrix element at the bottom right contains the desired @@ -75,6 +76,12 @@ def distance(s1, s2): Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme clusters. This should be the correct way to compare two Unicode strings. """ + # XXX + from .cli import ExtractedText + if isinstance(s1, ExtractedText): + s1 = s1.text + if isinstance(s2, ExtractedText): + s2 = s2.text s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) return levenshtein(s1, s2) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index fd89b03..17868a7 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -138,7 +138,7 @@ def page_extract(tree): # XXX Does a file have to have regions etc.? region vs lines etc. # Filter empty region texts - regions = (r for r in regions if r.text is not None) + regions = [r for r in regions if r.text is not None] return ExtractedText(regions, '\n') # FIXME needs to handle normalization diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 index 0c2f464..f7b2efb 100644 --- a/qurator/dinglehopper/templates/report.html.j2 +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -26,12 +26,23 @@ border: 2px solid; border-radius: 5px; } + #status-box { + position: fixed; + background: grey; + color: white; + width: 100%; + height: 2em; + } +
foo
+ + +
{{ gt }}
diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js index ac43676..01f5323 100644 --- a/qurator/dinglehopper/templates/report.html.js +++ b/qurator/dinglehopper/templates/report.html.js @@ -4,11 +4,16 @@ function find_diff_class(classes) { $(document).ready(function() { $('.diff').mouseover(function() { - let c = find_diff_class($(this).attr('class')) - $('.' + c).addClass('diff-highlight') + let c = find_diff_class($(this).attr('class')); + $('.' + c).addClass('diff-highlight'); + + segment_id = $(this).attr('data-segment-id'); + $('#status-box').text(segment_id); }); $('.diff').mouseout(function() { - let c = find_diff_class($(this).attr('class')) - $('.' + c).removeClass('diff-highlight') + let c = find_diff_class($(this).attr('class')); + $('.' + c).removeClass('diff-highlight'); + + $('#status-box').text(''); }); }); diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index 7ed56e4..64eba0a 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -32,6 +32,11 @@ def words(s): cat = subcat[0] return cat in unwanted_categories or subcat in unwanted_subcategories + # XXX + from .cli import ExtractedText + if isinstance(s, ExtractedText): + s = s.text + # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters." 
for word in uniseg.wordbreak.words(s): @@ -42,10 +47,20 @@ def words(s): def words_normalized(s): + # XXX + from .cli import ExtractedText + if isinstance(s, ExtractedText): + s = s.text return words(unicodedata.normalize('NFC', s)) def word_error_rate_n(reference, compared) -> Tuple[float, int]: + # XXX + from .cli import ExtractedText + if isinstance(reference, ExtractedText): + reference = reference.text + if isinstance(compared, ExtractedText): + compared = compared.text if isinstance(reference, str): reference_seq = list(words_normalized(reference)) compared_seq = list(words_normalized(compared)) From 25191b24f655a6dc276dee79ac65aa7ba1356692 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 13:46:28 +0200 Subject: [PATCH 25/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Display=20?= =?UTF-8?q?segment=20id=20in=20the=20corresponding=20column?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 4 ++-- qurator/dinglehopper/templates/report.html.j2 | 6 +----- qurator/dinglehopper/templates/report.html.js | 4 ++-- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 8e18b26..2099c57 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -71,8 +71,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): return \ '''
-
{}
-
{}
+
{}
+
{}
'''.format(gtx, ocrx) diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 index f7b2efb..f829ef8 100644 --- a/qurator/dinglehopper/templates/report.html.j2 +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -26,7 +26,7 @@ border: 2px solid; border-radius: 5px; } - #status-box { + .status-box { position: fixed; background: grey; color: white; @@ -39,10 +39,6 @@ -
foo
- - -
{{ gt }}
diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js index 01f5323..0baaa30 100644 --- a/qurator/dinglehopper/templates/report.html.js +++ b/qurator/dinglehopper/templates/report.html.js @@ -8,12 +8,12 @@ $(document).ready(function() { $('.' + c).addClass('diff-highlight'); segment_id = $(this).attr('data-segment-id'); - $('#status-box').text(segment_id); + $(this).closest('div').find('.status-box').text(segment_id); }); $('.diff').mouseout(function() { let c = find_diff_class($(this).attr('class')); $('.' + c).removeClass('diff-highlight'); - $('#status-box').text(''); + $(this).closest('div').find('.status-box').text(''); }); }); From 28849c701bbccd14cebe335bcc94d6fca3871b3a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 14:25:11 +0200 Subject: [PATCH 26/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Remove=20d?= =?UTF-8?q?ebug=20output?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 2099c57..13543a5 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -32,7 +32,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): return '{html_t}'.format(html_t=html_t) if isinstance(gt_in, ExtractedText): - print(gt_in.text) if not isinstance(ocr_in, ExtractedText): raise TypeError() # XXX splitting should be done in ExtractedText From a1c1b9c5ca4d5620ff4782ad967058bed44344c1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 15:53:15 +0200 Subject: [PATCH 27/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Re-introdu?= =?UTF-8?q?ce=20"substitute=5Fequivalences"=20as=20Normalization.NFC=5FSBB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 22 +++++++++++++++++++--- 1 file changed, 19 
insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 17868a7..2d88498 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -5,6 +5,7 @@ from warnings import warn from lxml import etree as ET from lxml.etree import XMLSyntaxError from contextlib import suppress +from .substitute_equivalences import substitute_equivalences import sys import attr import enum @@ -36,16 +37,27 @@ class ExtractedText: class Normalization(enum.Enum): NFC = 1 - NFC_MUFI = 2 + NFC_MUFI = 2 # TODO + NFC_SBB = 3 def normalize(text, normalization): if normalization == Normalization.NFC: return unicodedata.normalize('NFC', text) + if normalization == Normalization.NFC_MUFI: + raise NotImplementedError() + if normalization == Normalization.NFC_SBB: + # XXX This needs to be redone + # https://github.com/qurator-spk/dinglehopper/issues/11 + return substitute_equivalences(text) else: raise ValueError() +# XXX hack +normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB) + + @attr.s(frozen=True) class ExtractedTextSegment: id = attr.ib(type=str) @@ -54,7 +66,7 @@ class ExtractedTextSegment: def check(self, attribute, value): if value is not None and normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(converter=Normalization, default=Normalization.NFC) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) @classmethod def from_text_segment(cls, text_segment, nsmap): @@ -64,6 +76,7 @@ class ExtractedTextSegment: segment_text = None with suppress(AttributeError): segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = normalize_sbb(segment_text) return cls(segment_id, segment_text) @@ -89,7 +102,10 @@ def alto_extract(tree): ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) for line in 
tree.iterfind('.//alto:TextLine', namespaces=nsmap)) - return ExtractedText((ExtractedTextSegment(None, line_text) for line_text in lines), '\n') + return ExtractedText( + (ExtractedTextSegment(None, normalize_sbb(line_text)) for line_text in lines), + '\n' + ) # TODO This currently does not extract any segment id, because we are # clueless about the ALTO format. # FIXME needs to handle normalization From 4b86f01b15321c8a802d9c0ffaec4cc36be4de7d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 15:56:01 +0200 Subject: [PATCH 28/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Use=20a=20?= =?UTF-8?q?Bootstrap=20tooltip=20for=20the=20segment=20id?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 10 ++++++---- qurator/dinglehopper/templates/report.html.j2 | 7 ------- qurator/dinglehopper/templates/report.html.js | 16 ++++++---------- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 13543a5..ea0c9bb 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -22,9 +22,11 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): html_t = escape(t) html_custom_attrs = "" - # XXX must sanitize id_ or do we trust the XML? + + # Set Bootstrap tooltip to the segment id if id_: - html_custom_attrs = 'data-segment-id="{}"'.format(id_) + html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_) + # XXX must sanitize id_ or do we trust the XML? if css_classes: return '{html_t}'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs) @@ -70,8 +72,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): return \ '''
-
{}
-
{}
+
{}
+
{}
'''.format(gtx, ocrx) diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 index f829ef8..0c2f464 100644 --- a/qurator/dinglehopper/templates/report.html.j2 +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -26,13 +26,6 @@ border: 2px solid; border-radius: 5px; } - .status-box { - position: fixed; - background: grey; - color: white; - width: 100%; - height: 2em; - } diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js index 0baaa30..4c2ba28 100644 --- a/qurator/dinglehopper/templates/report.html.js +++ b/qurator/dinglehopper/templates/report.html.js @@ -1,19 +1,15 @@ function find_diff_class(classes) { - return classes.split(/\s+/).find(x => x.match(/.diff\d.*/)); + return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/))); } $(document).ready(function() { - $('.diff').mouseover(function() { - let c = find_diff_class($(this).attr('class')); - $('.' + c).addClass('diff-highlight'); + /* Enable Bootstrap tooltips */ + $('[data-toggle="tooltip"]').tooltip(); - segment_id = $(this).attr('data-segment-id'); - $(this).closest('div').find('.status-box').text(segment_id); + $('.diff').mouseover(function() { + find_diff_class($(this).attr('class')).addClass('diff-highlight'); }); $('.diff').mouseout(function() { - let c = find_diff_class($(this).attr('class')); - $('.' + c).removeClass('diff-highlight'); - - $(this).closest('div').find('.status-box').text(''); + find_diff_class($(this).attr('class')).removeClass('diff-highlight'); }); }); From e1c854633677da07df9c264c4031a836995a6a65 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 16:08:56 +0200 Subject: [PATCH 29/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Move=20Pyt?= =?UTF-8?q?hon=203.5=20XXXs=20to=20a=20GitHub=20issue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/qurator-spk/dinglehopper/issues/20. 
--- qurator/dinglehopper/ocr_files.py | 1 - qurator/dinglehopper/tests/test_integ_cli_valid_json.py | 2 -- qurator/dinglehopper/tests/test_integ_ocrd_cli.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 2d88498..2ceebfd 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -16,7 +16,6 @@ import unicodedata class ExtractedText: segments = attr.ib() joiner = attr.ib(type=str) - # TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped # TODO Types are not validated (attr does not do this yet) @property diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py index 5699700..35421bb 100644 --- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -10,7 +10,6 @@ from ..cli import process def test_cli_json(tmp_path): """Test that the cli/process() yields a loadable JSON report""" - # XXX Path.__str__() is necessary for Python 3.5 with working_directory(str(tmp_path)): with open('gt.txt', 'w') as gtf: gtf.write('AAAAA') @@ -26,7 +25,6 @@ def test_cli_json(tmp_path): def test_cli_json_cer_is_infinity(tmp_path): """Test that the cli/process() yields a loadable JSON report when CER == inf""" - # XXX Path.__str__() is necessary for Python 3.5 with working_directory(str(tmp_path)): with open('gt.txt', 'w') as gtf: gtf.write('') # Empty to yield CER == inf diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py index 41da748..3d78f57 100644 --- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py +++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py @@ -17,8 +17,6 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') def test_ocrd_cli(tmp_path): """Test OCR-D interface""" - # XXX Path.str() is necessary for Python 
3.5 - # Copy test workspace test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162' test_workspace_dir = tmp_path / 'test_ocrd_cli' From dc85294380406ee1d8e67cf4165d21409c7d24ff Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 17:01:28 +0200 Subject: [PATCH 30/87] =?UTF-8?q?=F0=9F=93=93=20dinglehopper:=20Document?= =?UTF-8?q?=20editops()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/edit_distance.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index dc1cb24..a6643c7 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -123,7 +123,11 @@ def seq_editops(seq1, seq2): def editops(word1, word2): - # XXX Note that this returns indices to the _grapheme clusters_, not characters! + """ + Return sequence of edit operations transforming one string to another. + + Note that this returns indices to the _grapheme clusters_, not characters! 
+ """ word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1))) word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2))) return seq_editops(word1, word2) From 453247c2f36c93b1602b23b32a6a25af440540bb Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 17:01:56 +0200 Subject: [PATCH 31/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Clean=20up?= =?UTF-8?q?=20test=5Flines=5Fsimilar()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/tests/test_align.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py index cc5cb43..23483f8 100644 --- a/qurator/dinglehopper/tests/test_align.py +++ b/qurator/dinglehopper/tests/test_align.py @@ -78,7 +78,8 @@ def test_lines(): def test_lines_similar(): - """Test comparing list of lines while using a "weaker equivalence". + """ + Test comparing list of lines while using a "weaker equivalence". This mainly serves as documentation. """ @@ -88,7 +89,14 @@ def test_lines_similar(): self._string = string def __eq__(self, other): - return distance(self._string, other._string) < 2 # XXX NOT the final version + # Just an example! + min_len = min(len(self._string), len(other._string)) + if min_len > 0: + normalized_distance = distance(self._string, other._string)/min_len + similar = normalized_distance < 0.1 + else: + similar = False + return similar def __ne__(self, other): return not self.__eq__(other) @@ -106,3 +114,6 @@ def test_lines_similar(): left, right = unzip(result) assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')] assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')] + + # Test __eq__ (i.e. is it a substitution or a similar string?) 
+ assert list(left)[0] == list(right)[0] From bc05f830881d1dc573e3f91495f8c6a42837c5bc Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 17:04:07 +0200 Subject: [PATCH 32/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Remove=20o?= =?UTF-8?q?bsolete=20XXX=20about=20the=20PAGE=20hierarchy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 2ceebfd..5ce0bcd 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -151,7 +151,6 @@ def page_extract(tree): for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) - # XXX Does a file have to have regions etc.? region vs lines etc. # Filter empty region texts regions = [r for r in regions if r.text is not None] From c3ae73d576b333788ffa8215c76db5006e23de8d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 18:06:42 +0200 Subject: [PATCH 33/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Calculate?= =?UTF-8?q?=20segment=20ids=20once,=20on=20the=20first=20call?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 5ce0bcd..180ecd3 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -5,6 +5,7 @@ from warnings import warn from lxml import etree as ET from lxml.etree import XMLSyntaxError from contextlib import suppress +from itertools import repeat from .substitute_equivalences import substitute_equivalences import sys import attr @@ -22,16 +23,20 @@ class ExtractedText: def text(self): return 
self.joiner.join(s.text for s in self.segments) + _segment_id_for_pos = None + def segment_id_for_pos(self, pos): - i = 0 - for s in self.segments: - if i <= pos < i + len(s.text): - return s.id - i += len(s.text) - if i <= pos < i + len(self.joiner): - return None - i += len(self.joiner) - # XXX Cache results + # Calculate segment ids once, on the first call + if not self._segment_id_for_pos: + segment_id_for_pos = [] + for s in self.segments: + segment_id_for_pos.extend(repeat(s.id, len(s.text))) + segment_id_for_pos.extend(repeat(None, len(self.joiner))) + # This is frozen, so we have to jump through the hoop: + object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) + assert self._segment_id_for_pos + + return self._segment_id_for_pos[pos] class Normalization(enum.Enum): From e3e79381621f1a5b1d81a9abb053d1a380d96090 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:04:24 +0200 Subject: [PATCH 34/87] =?UTF-8?q?=F0=9F=90=9B=20dinglehopper:=20Fix=20test?= =?UTF-8?q?s=20to=20deal=20with=20new=20normalization=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 5 +- .../dinglehopper/tests/test_integ_align.py | 8 +- .../test_integ_character_error_rate_ocr.py | 8 +- .../tests/test_integ_cli_valid_json.py | 5 +- .../tests/test_integ_edit_distance_ocr.py | 4 +- .../tests/test_integ_word_error_rate_ocr.py | 5 +- qurator/dinglehopper/tests/test_ocr_files.py | 82 +++++++++++++------ 7 files changed, 85 insertions(+), 32 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 180ecd3..e1267f7 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -15,7 +15,7 @@ import unicodedata @attr.s(frozen=True) class ExtractedText: - segments = attr.ib() + segments = attr.ib(converter=list) joiner = attr.ib(type=str) # TODO Types are not validated (attr does not do this yet) @@ -80,6 
+80,7 @@ class ExtractedTextSegment: segment_text = None with suppress(AttributeError): segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = segment_text or '' segment_text = normalize_sbb(segment_text) return cls(segment_id, segment_text) @@ -157,7 +158,7 @@ def page_extract(tree): regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) # Filter empty region texts - regions = [r for r in regions if r.text is not None] + regions = (r for r in regions if r.text is not None) return ExtractedText(regions, '\n') # FIXME needs to handle normalization diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py index df1e230..b35974b 100644 --- a/qurator/dinglehopper/tests/test_integ_align.py +++ b/qurator/dinglehopper/tests/test_integ_align.py @@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_align_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. - # → 4 elements in the alignment should be different. + # → 2 elements in the alignment should be different, the ligature is + # (currently) not counted due to normalization. # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. 
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) result = list(align(gt, ocr)) - assert sum(left != right for left, right in result) == 4 + for left, right in result: + if left != right: + print(left, right) + assert sum(left != right for left, right in result) == 2 diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py index c27cd31..1c3bf52 100644 --- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py @@ -4,6 +4,7 @@ import os import pytest from lxml import etree as ET +from uniseg.graphemecluster import grapheme_clusters from .. import character_error_rate, page_text, alto_text @@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_character_error_rate_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + # The fi ligature does not count. 
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n + + gt_len = len(list(grapheme_clusters(gt))) + expected_cer = 2/gt_len + + assert character_error_rate(gt, ocr) == expected_cer @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py index 35421bb..d71bc14 100644 --- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -1,4 +1,3 @@ -import os import json import pytest @@ -16,7 +15,11 @@ def test_cli_json(tmp_path): with open('ocr.txt', 'w') as ocrf: ocrf.write('AAAAB') + with open('gt.txt', 'r') as gtf: + print(gtf.read()) process('gt.txt', 'ocr.txt', 'report') + with open('report.json', 'r') as jsonf: + print(jsonf.read()) with open('report.json', 'r') as jsonf: j = json.load(jsonf) assert j['cer'] == pytest.approx(0.2) diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py index 2857d56..cbe12f8 100644 --- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_distance_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + # Due to normalization, we don't count the ligature. 
+ # → 2 differences gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert distance(gt, ocr) == 4 + assert distance(gt, ocr) == 2 @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py index 1d2dead..f5c922b 100644 --- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_word_error_rate_between_page_files(): - # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words + # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words, + # the ligature does not count → 2 errors gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line assert len(list(words(gt))) == gt_word_count ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert word_error_rate(gt, ocr) == 3/gt_word_count + assert word_error_rate(gt, ocr) == 2/gt_word_count @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py index dd9377a..3291152 100644 --- a/qurator/dinglehopper/tests/test_ocr_files.py +++ b/qurator/dinglehopper/tests/test_ocr_files.py @@ -6,7 +6,8 @@ import textwrap import pytest -from .. import alto_namespace, alto_text, page_namespace, page_text, text +from .util import working_directory +from .. 
import alto_namespace, alto_text, page_namespace, page_text, plain_text, text data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @@ -49,27 +50,51 @@ def test_page_namespace(): def test_page_test(): tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) result = page_text(tree) + + # We are currently normalizing on extraction, so the text is normalized. + # + # expected = textwrap.dedent("""\ + # ber die vielen Sorgen wegen deelben vergaß + # Hartkopf, der Frau Amtmnnin das ver⸗ + # ſproene zu berliefern. — Ein Erpreer + # wurde an ihn abgeſit, um ihn ums Him⸗ + # melswien zu ſagen, daß er das Verſproene + # glei den Augenbli berbringen mte, die + # Frau Amtmnnin htte  auf ihn verlaen, + # und nun wßte e nit, was e anfangen + # ſote. Den Augenbli ſote er kommen, + # ſon vergieng e in ihrer Ang. — Die + # Ge wren ſon angekommen, und es fehlte + # ihr do no an aem. — + # Hartkopf mußte  er bennen, und + # endli na langem Nadenken fiel es ihm er + # wieder ein. — Er langte den Zettel aus dem + # Accisbue heraus, und ſagte ſeiner Frau, daß + # e das, was da wre, herbeyſaffen mte. + # Jndeß mangelten do einige Generalia, die + # alſo wegfielen. — Hartkopf gieng ſelb + # mit und berbrate es. —""") expected = textwrap.dedent("""\ - ber die vielen Sorgen wegen deelben vergaß - Hartkopf, der Frau Amtmnnin das ver⸗ - ſproene zu berliefern. — Ein Erpreer - wurde an ihn abgeſit, um ihn ums Him⸗ - melswien zu ſagen, daß er das Verſproene - glei den Augenbli berbringen mte, die - Frau Amtmnnin htte  auf ihn verlaen, - und nun wßte e nit, was e anfangen - ſote. Den Augenbli ſote er kommen, - ſon vergieng e in ihrer Ang. — Die - Ge wren ſon angekommen, und es fehlte - ihr do no an aem. — - Hartkopf mußte  er bennen, und - endli na langem Nadenken fiel es ihm er - wieder ein. 
— Er langte den Zettel aus dem - Accisbue heraus, und ſagte ſeiner Frau, daß - e das, was da wre, herbeyſaffen mte. - Jndeß mangelten do einige Generalia, die - alſo wegfielen. — Hartkopf gieng ſelb - mit und berbrate es. —""") + über die vielen Sorgen wegen deſſelben vergaß + Hartkopf, der Frau Amtmännin das ver- + ſprochene zu überliefern. – Ein Erpreſſer + wurde an ihn abgeſchickt, um ihn ums Him- + melswillen zu ſagen, daß er das Verſprochene + gleich den Augenblick überbringen möchte, die + Frau Amtmännin hätte ſich auf ihn verlaſſen, + und nun wüßte ſie nicht, was ſie anfangen + ſollte. Den Augenblick ſollte er kommen, + ſonſt vergieng ſie in ihrer Angſt. – Die + Gäſte wären ſchon angekommen, und es fehlte + ihr doch noch an allem. – + Hartkopf mußte ſich erſt beſinnen, und + endlich nach langem Nachdenken fiel es ihm erſt + wieder ein. – Er langte den Zettel aus dem + Accisbuche heraus, und ſagte ſeiner Frau, daß + ſie das, was da wäre, herbeyſchaffen möchte. + Jndeß mangelten doch einige Generalia, die + alſo wegfielen. – Hartkopf gieng ſelbſt + mit und überbrachte es. –""") assert result == expected @@ -92,7 +117,8 @@ def test_page_order(): tree = ET.parse(os.path.join(data_dir, 'order.page.xml')) result = page_text(tree) - assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL) + print(result) + assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL) def test_page_mixed_regions(): @@ -106,5 +132,15 @@ def test_page_mixed_regions(): def test_text(): assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) - assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) + assert "wieder ein. 
– Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt')) + + +def test_plain(tmp_path): + with working_directory(str(tmp_path)): + with open('ocr.txt', 'w') as ocrf: + ocrf.write('AAAAB') + + result = plain_text('ocr.txt') + expected = 'AAAAB' + assert result == expected From 6eb0a9350cc3112ab61be0076542b02eab431eb9 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:05:33 +0200 Subject: [PATCH 35/87] =?UTF-8?q?=F0=9F=8E=A8=20dinglehopper:=20Unfuck=20s?= =?UTF-8?q?ubstitutions=20a=20bit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../dinglehopper/substitute_equivalences.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py index 1b7e0cf..39be276 100644 --- a/qurator/dinglehopper/substitute_equivalences.py +++ b/qurator/dinglehopper/substitute_equivalences.py @@ -1,21 +1,15 @@ import unicodedata -def substitute_equivalences(s): +def unjoin_ligatures(s): + """Unjoin ligatures, i.e. 
ff becomes ff.""" - # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR - # It might make sense to use different rules for GT and for the different OCR equivalences = { - '': 'ü', '': 'ſſ', "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I - '': 'ä', '': 'ch', - '==': '–', # → en-dash - '—': '–', # em-dash → en-dash '': 'ck', '': 'll', - '': 'ö', '': 'ſi', '': 'ſt', 'fi': 'fi', @@ -23,12 +17,7 @@ def substitute_equivalences(s): 'fl': 'fl', 'ffi': 'ffi', '': 'ct', - '’': '\'', - '⸗': '-', '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ - 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E - 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E - 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E '\uf532': 'as', # eMOP: Latin small ligature as '\uf533': 'is', # eMOP: Latin small ligature is '\uf534': 'us', # eMOP: Latin small ligature us @@ -37,10 +26,32 @@ def substitute_equivalences(s): '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? 
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST + } + s = unicodedata.normalize('NFC', s) + for fr, to in equivalences.items(): + s = s.replace(fr, to) + return s + + +def substitute_equivalences(s): + # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR + # It might make sense to use different rules for GT and for the different OCR + equivalences = { + '': 'ü', + '': 'ä', + '==': '–', # → en-dash + '—': '–', # em-dash → en-dash + '': 'ö', + '’': '\'', + '⸗': '-', + 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E + 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E + 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT } s = unicodedata.normalize('NFC', s) + s = unjoin_ligatures(s) for fr, to in equivalences.items(): s = s.replace(fr, to) return s From e0aa9bc3f4b9caf8d469033e55e2ac3817c1bd4f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:19:38 +0200 Subject: [PATCH 36/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Remove=20o?= =?UTF-8?q?bsolete=20XXX=20about=20None=20ids?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/tests/extracted_text_test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 82c3a53..2e6a9e6 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -52,9 +52,6 @@ def test_align(): for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): left_id = test1.segment_id_for_pos(left_pos) if left is not None else None right_id = test2.segment_id_for_pos(right_pos) if right is not None else None - # XXX note that deletions and inserts only produce one id + None, UI must - # support 
this, i.e. display for the one id produced - # XXX otherwise, it should always display for BOTH ids el = AlignmentElement(left, right, left_id, right_id) alignment.append(el) if left is not None: From 507ad6b6a4e08c3234de6d58b2f80b2e264657ed Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:21:18 +0200 Subject: [PATCH 37/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Remove=20o?= =?UTF-8?q?bsolete=20XXX=20that=20has=20a=20GitHub=20issue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index e1267f7..1652b71 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -51,8 +51,6 @@ def normalize(text, normalization): if normalization == Normalization.NFC_MUFI: raise NotImplementedError() if normalization == Normalization.NFC_SBB: - # XXX This needs to be redone - # https://github.com/qurator-spk/dinglehopper/issues/11 return substitute_equivalences(text) else: raise ValueError() From bc006746dd023a9a0e845a245598b03ffe0f1caf Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:24:58 +0200 Subject: [PATCH 38/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Replace=20?= =?UTF-8?q?XXX=20with=20an=20actual=20comment?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index ea0c9bb..81acb41 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -56,9 +56,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): if isinstance(gt_in, ExtractedText): gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None 
- # XXX note that deletions and inserts only produce one id + None, UI must - # support this, i.e. display for the one id produced - # XXX otherwise, it should always display for BOTH ids + # Deletions and inserts only produce one id + None, UI must + # support this, i.e. display for the one id produced gtx += joiner + format_thing(g, css_classes, gt_id) ocrx += joiner + format_thing(o, css_classes, ocr_id) From c9109999db0f5d787edad1d7c30e979e98b18190 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:29:50 +0200 Subject: [PATCH 39/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Remove=20o?= =?UTF-8?q?bsolete=20normalization-related=20FIXME?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 81acb41..2889e46 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -87,10 +87,6 @@ def process(gt, ocr, report_prefix, *, metrics=True): gt_text = extract(gt) ocr_text = extract(ocr) - # FIXME - #gt_text = substitute_equivalences(gt_text) - #ocr_text = substitute_equivalences(ocr_text) - cer, n_characters = character_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text) From e972328e51710676e7e02256a1b580ee504b9834 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:43:25 +0200 Subject: [PATCH 40/87] =?UTF-8?q?=E2=9C=A8=20dinglehopper:=20Validate=20re?= =?UTF-8?q?ad=20segment=20ids?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 1 - qurator/dinglehopper/ocr_files.py | 11 +++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 2889e46..9c963c1 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -26,7 +26,6 @@ def 
gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): # Set Bootstrap tooltip to the segment id if id_: html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_) - # XXX must sanitize id_ or do we trust the XML? if css_classes: return '{html_t}'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 1652b71..d3918d1 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -11,6 +11,7 @@ import sys import attr import enum import unicodedata +import re @attr.s(frozen=True) @@ -30,7 +31,7 @@ class ExtractedText: if not self._segment_id_for_pos: segment_id_for_pos = [] for s in self.segments: - segment_id_for_pos.extend(repeat(s.id, len(s.text))) + segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) segment_id_for_pos.extend(repeat(None, len(self.joiner))) # This is frozen, so we have to jump through the hoop: object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) @@ -62,7 +63,13 @@ normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB) @attr.s(frozen=True) class ExtractedTextSegment: - id = attr.ib(type=str) + segment_id = attr.ib(type=str) + @segment_id.validator + def check(self, attribute, value): + if value is None: + return + if not re.match(r'[\w\d_-]+', value): + raise ValueError('Malformed segment id "{}"'.format(value)) text = attr.ib(type=str) @text.validator def check(self, attribute, value): From 5aa74e83831ac9c8b8008fec82a4fa597567e85c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:59:37 +0200 Subject: [PATCH 41/87] =?UTF-8?q?=F0=9F=8E=A8=20dinglehopper:=20Make=20PyC?= =?UTF-8?q?harm=20happier=20with=20the=20type=20hinting,=20newlines=20etc.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/align.py | 8 ++++---- qurator/dinglehopper/edit_distance.py | 1 - 
qurator/dinglehopper/ocr_files.py | 12 ++++++++---- qurator/dinglehopper/tests/test_integ_ocrd_cli.py | 2 -- qurator/dinglehopper/tests/util.py | 4 ++-- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py index ab44760..87febb7 100644 --- a/qurator/dinglehopper/align.py +++ b/qurator/dinglehopper/align.py @@ -28,16 +28,16 @@ def seq_align(s1, s2): if o: if o[0] == 'insert': - yield (None, s2[j]) + yield None, s2[j] j += 1 elif o[0] == 'delete': - yield (s1[i], None) + yield s1[i], None i += 1 elif o[0] == 'replace': - yield (s1[i], s2[j]) + yield s1[i], s2[j] i += 1 j += 1 else: - yield (s1[i], s2[j]) + yield s1[i], s2[j] i += 1 j += 1 diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index a6643c7..284b676 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -8,7 +8,6 @@ import numpy as np from uniseg.graphemecluster import grapheme_clusters - def levenshtein_matrix(seq1: Sequence, seq2: Sequence): """Compute the matrix commonly computed to produce the Levenshtein distance. This is also known as the Wagner-Fischer algorithm. 
The matrix element at the bottom right contains the desired diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index d3918d1..a048b1e 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,5 +1,6 @@ from __future__ import division, print_function +from typing import Optional from warnings import warn from lxml import etree as ET @@ -58,21 +59,24 @@ def normalize(text, normalization): # XXX hack -normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB) +def normalize_sbb(t): + return normalize(t, Normalization.NFC_SBB) @attr.s(frozen=True) class ExtractedTextSegment: - segment_id = attr.ib(type=str) + segment_id = attr.ib(type=Optional[str]) + @segment_id.validator - def check(self, attribute, value): + def check(self, _, value): if value is None: return if not re.match(r'[\w\d_-]+', value): raise ValueError('Malformed segment id "{}"'.format(value)) text = attr.ib(type=str) + @text.validator - def check(self, attribute, value): + def check(self, _, value): if value is not None and normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py index 3d78f57..75bb816 100644 --- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py +++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py @@ -1,11 +1,9 @@ import os -import re import shutil import json from pathlib import Path from click.testing import CliRunner -import pytest from .util import working_directory diff --git a/qurator/dinglehopper/tests/util.py b/qurator/dinglehopper/tests/util.py index 52b7506..1f224e5 100644 --- a/qurator/dinglehopper/tests/util.py +++ b/qurator/dinglehopper/tests/util.py @@ -21,8 +21,8 @@ def diffprint(x, y): _diffprint(x, y) -def unzip(l): - return zip(*l) +def 
unzip(an_iterable_of_tuples): + return zip(*an_iterable_of_tuples) class working_directory: From 84c9e6a9c7258057262f26972a946ac7a666c4f1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 18:29:11 +0200 Subject: [PATCH 42/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 extracted_text.py diff --git a/extracted_text.py b/extracted_text.py new file mode 100644 index 0000000..d1dc6f0 --- /dev/null +++ b/extracted_text.py @@ -0,0 +1,42 @@ +import attr + + +@attr.s(frozen=True) +class ExtractedText: + segments = attr.ib() + joiner = attr.ib(type=str) + # XXX Use type annotations for attr types when support for Python 3.5 is dropped + # XXX Also I think these are not validated? + + @property + def text(self): + return self.joiner.join(s.text for s in self.segments) + + def segment_id_for_pos(self, pos): + i = 0 + for s in self.segments: + if i <= pos < i + len(s.text): + return s.id + i += len(s.text) + if i <= pos < i + len(self.joiner): + return None + i += len(self.joiner) + + +@attr.s(frozen=True) +class ExtractedTextSegment: + id = attr.ib(type=str) + text = attr.ib(type=str) + + +test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') +], ' ') + + +assert test1.text == 'foo bar bazinga' +assert test1.segment_id_for_pos(0) == 's0' +assert test1.segment_id_for_pos(3) == None +assert test1.segment_id_for_pos(10) == 's2' From 2c69e077fe952ddf55d5fb0eb715759cae091efb Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 18:30:34 +0200 Subject: [PATCH 43/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a275fc7..846990b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ numpy colorama MarkupSafe ocrd >= 1.0.0b15 +attrs From bc630233d0d15aff7dabe8bcda83fe23e766a200 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:36:49 +0200 Subject: [PATCH 44/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/extracted_text.py b/extracted_text.py index d1dc6f0..f99c8ac 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -1,4 +1,5 @@ import attr +import unicodedata @attr.s(frozen=True) @@ -23,10 +24,25 @@ class ExtractedText: i += len(self.joiner) +NORM_NFC = 0 + + +def normalize(text, normalization): + if normalization == NORM_NFC: + return unicodedata.normalize('NFC', text) + else: + raise ValueError() + + @attr.s(frozen=True) class ExtractedTextSegment: id = attr.ib(type=str) text = attr.ib(type=str) + @text.validator + def check(self, attribute, value): + if normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + normalization = attr.ib(default=NORM_NFC) test1 = ExtractedText([ @@ -40,3 +56,6 @@ assert test1.text == 'foo bar bazinga' assert test1.segment_id_for_pos(0) == 's0' assert test1.segment_id_for_pos(3) == None assert test1.segment_id_for_pos(10) == 's2' + +# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) +ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 4bd30e6686ebbcba3cb1f7f170825a163b01b53b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:40:57 +0200 Subject: [PATCH 45/87] 
=?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 16 ---------------- extracted_text_test.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 extracted_text_test.py diff --git a/extracted_text.py b/extracted_text.py index f99c8ac..a76f402 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -43,19 +43,3 @@ class ExtractedTextSegment: if normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) normalization = attr.ib(default=NORM_NFC) - - -test1 = ExtractedText([ - ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), - ExtractedTextSegment('s2', 'bazinga') -], ' ') - - -assert test1.text == 'foo bar bazinga' -assert test1.segment_id_for_pos(0) == 's0' -assert test1.segment_id_for_pos(3) == None -assert test1.segment_id_for_pos(10) == 's2' - -# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) -ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) diff --git a/extracted_text_test.py b/extracted_text_test.py new file mode 100644 index 0000000..29fabfe --- /dev/null +++ b/extracted_text_test.py @@ -0,0 +1,19 @@ +from extracted_text import * + +def test_text(): + test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') + ], ' ') + + + assert test1.text == 'foo bar bazinga' + assert test1.segment_id_for_pos(0) == 's0' + assert test1.segment_id_for_pos(3) is None + assert test1.segment_id_for_pos(10) == 's2' + +# TODO handle grapheme cluster positions? 
+ +# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) +ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 89852314dc7cf57aa85574a613b008fee58d3226 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:49:12 +0200 Subject: [PATCH 46/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 3 +++ extracted_text_test.py | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index a76f402..69d836b 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -2,6 +2,9 @@ import attr import unicodedata +# TODO handle grapheme cluster positions? + + @attr.s(frozen=True) class ExtractedText: segments = attr.ib() diff --git a/extracted_text_test.py b/extracted_text_test.py index 29fabfe..b302ca8 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -13,7 +13,5 @@ def test_text(): assert test1.segment_id_for_pos(3) is None assert test1.segment_id_for_pos(10) == 's2' -# TODO handle grapheme cluster positions? 
- # ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 534e042f9e14007121d05ab5716ac838ab52c808 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 20:29:01 +0200 Subject: [PATCH 47/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 9 ++++++--- extracted_text_test.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index 69d836b..23cd519 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -1,5 +1,6 @@ import attr import unicodedata +import enum # TODO handle grapheme cluster positions? @@ -27,11 +28,13 @@ class ExtractedText: i += len(self.joiner) -NORM_NFC = 0 +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 def normalize(text, normalization): - if normalization == NORM_NFC: + if normalization == Normalization.NFC: return unicodedata.normalize('NFC', text) else: raise ValueError() @@ -45,4 +48,4 @@ class ExtractedTextSegment: def check(self, attribute, value): if normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(default=NORM_NFC) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC) diff --git a/extracted_text_test.py b/extracted_text_test.py index b302ca8..2e916cd 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -1,17 +1,22 @@ -from extracted_text import * +import unicodedata +import pytest +from extracted_text import ExtractedText, ExtractedTextSegment + def test_text(): test1 = ExtractedText([ ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment(1, 'bar'), ExtractedTextSegment('s2', 'bazinga') ], ' ') - assert test1.text == 'foo bar 
bazinga' assert test1.segment_id_for_pos(0) == 's0' assert test1.segment_id_for_pos(3) is None assert test1.segment_id_for_pos(10) == 's2' -# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) -ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) + +def test_normalization_check(): + with pytest.raises(ValueError, match=r'.*is not normalized.*'): + ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) + assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 8435d88419c75e9a7551da29294069fd4ad84937 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 20:31:54 +0200 Subject: [PATCH 48/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data?= =?UTF-8?q?=20structure=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 4 ++-- extracted_text_test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index 23cd519..c84c77b 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -4,14 +4,14 @@ import enum # TODO handle grapheme cluster positions? +# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped +# TODO types are not validated (attr does not do this yet) @attr.s(frozen=True) class ExtractedText: segments = attr.ib() joiner = attr.ib(type=str) - # XXX Use type annotations for attr types when support for Python 3.5 is dropped - # XXX Also I think these are not validated? 
@property def text(self): diff --git a/extracted_text_test.py b/extracted_text_test.py index 2e916cd..4919a76 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -6,7 +6,7 @@ from extracted_text import ExtractedText, ExtractedTextSegment def test_text(): test1 = ExtractedText([ ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment(1, 'bar'), + ExtractedTextSegment('s1', 'bar'), ExtractedTextSegment('s2', 'bazinga') ], ' ') From 4cd835ae51348e61a72ade2422c947a528bc4765 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 13:04:36 +0200 Subject: [PATCH 49/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20=20Test=20?= =?UTF-8?q?aligning=20by=20character=20while=20retaining=20segment=20id=20?= =?UTF-8?q?info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text_test.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/extracted_text_test.py b/extracted_text_test.py index 4919a76..890b045 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -1,6 +1,8 @@ import unicodedata import pytest from extracted_text import ExtractedText, ExtractedTextSegment +from uniseg.graphemecluster import grapheme_clusters +from qurator.dinglehopper import seq_align def test_text(): @@ -20,3 +22,33 @@ def test_normalization_check(): with pytest.raises(ValueError, match=r'.*is not normalized.*'): ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) + + +def test_align(): + """ + Test aligning by character while retaining segment id info + + The difficulty here is that aligning should work on grapheme clusters, + not Python characters. 
+ """ + + test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') + ], ' ') + test2 = ExtractedText([ + ExtractedTextSegment('x0', 'foo'), + ExtractedTextSegment('x1', 'bar'), + ExtractedTextSegment('x2', '.'), # extra . + ExtractedTextSegment('x2', 'bazim̃ga'), # different grapheme cluster, m̃ also is two Python characters + ], ' ') + + left_pos = 0; right_pos = 0 + for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): + print(left, right, test1.segment_id_for_pos(left_pos), test2.segment_id_for_pos(right_pos)) + if left is not None: + left_pos += len(left) + if right is not None: + right_pos += len(right) + assert False From 167dad18f4e8b6fe48325881fe0c383032dc9b7c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 13:54:46 +0200 Subject: [PATCH 50/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20=20Test=20?= =?UTF-8?q?aligning=20by=20character=20while=20retaining=20segment=20id=20?= =?UTF-8?q?info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text_test.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/extracted_text_test.py b/extracted_text_test.py index 890b045..60785b7 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -3,6 +3,7 @@ import pytest from extracted_text import ExtractedText, ExtractedTextSegment from uniseg.graphemecluster import grapheme_clusters from qurator.dinglehopper import seq_align +from collections import namedtuple def test_text(): @@ -24,6 +25,9 @@ def test_normalization_check(): assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) +AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id') + + def test_align(): """ Test aligning by character while retaining segment id info @@ -35,20 +39,30 @@ def test_align(): test1 = ExtractedText([ 
ExtractedTextSegment('s0', 'foo'), ExtractedTextSegment('s1', 'bar'), - ExtractedTextSegment('s2', 'bazinga') + ExtractedTextSegment('s2', 'batzinga') ], ' ') test2 = ExtractedText([ ExtractedTextSegment('x0', 'foo'), ExtractedTextSegment('x1', 'bar'), ExtractedTextSegment('x2', '.'), # extra . - ExtractedTextSegment('x2', 'bazim̃ga'), # different grapheme cluster, m̃ also is two Python characters + ExtractedTextSegment('x3', 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters ], ' ') - left_pos = 0; right_pos = 0 + left_pos = 0; right_pos = 0; alignment = [] for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): - print(left, right, test1.segment_id_for_pos(left_pos), test2.segment_id_for_pos(right_pos)) + left_id = test1.segment_id_for_pos(left_pos) if left is not None else None + right_id = test2.segment_id_for_pos(right_pos) if right is not None else None + el = AlignmentElement(left, right, left_id, right_id) + alignment.append(el) if left is not None: left_pos += len(left) if right is not None: right_pos += len(right) - assert False + + print('test1: {}'.format(test1.text)) + print('test2: {}'.format(test2.text)) + + assert alignment[0] == ('f', 'f', 's0', 'x0') + assert alignment[8] == (None, '.', None, 'x2') + assert alignment[12] == ('t', None, 's2', None) + assert alignment[15] == ('n', 'm̃', 's2', 'x3') From 0d569e81c33f3e3a4770f594d0d9c17a7e6e80ed Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 14:50:32 +0200 Subject: [PATCH 51/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Test=20ali?= =?UTF-8?q?gning=20by=20character=20while=20retaining=20segment=20id=20inf?= =?UTF-8?q?o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extracted_text_test.py b/extracted_text_test.py index 60785b7..9eb57b6 100644 --- a/extracted_text_test.py +++ 
b/extracted_text_test.py @@ -52,6 +52,8 @@ def test_align(): for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): left_id = test1.segment_id_for_pos(left_pos) if left is not None else None right_id = test2.segment_id_for_pos(right_pos) if right is not None else None + # XXX note that deletions and inserts only produce one id + None, UI must + # support this, i.e. display for the one id produced el = AlignmentElement(left, right, left_id, right_id) alignment.append(el) if left is not None: From 55db2b713f80db7ea19fa1b7cec5a49d7f70b512 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 14:54:50 +0200 Subject: [PATCH 52/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Test=20ali?= =?UTF-8?q?gning=20by=20character=20while=20retaining=20segment=20id=20inf?= =?UTF-8?q?o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/extracted_text.py b/extracted_text.py index c84c77b..b37f341 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -3,7 +3,6 @@ import unicodedata import enum -# TODO handle grapheme cluster positions? 
# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped # TODO types are not validated (attr does not do this yet) From 1083dcc5b96e7cc1f53789cb993c6b18081aaed9 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 14:56:23 +0200 Subject: [PATCH 53/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Test=20ali?= =?UTF-8?q?gning=20by=20character=20while=20retaining=20segment=20id=20inf?= =?UTF-8?q?o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extracted_text_test.py b/extracted_text_test.py index 9eb57b6..b84df87 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -54,6 +54,7 @@ def test_align(): right_id = test2.segment_id_for_pos(right_pos) if right is not None else None # XXX note that deletions and inserts only produce one id + None, UI must # support this, i.e. display for the one id produced + # XXX otherwise, it should always display for BOTH ids el = AlignmentElement(left, right, left_id, right_id) alignment.append(el) if left is not None: From 9f8bb1d8ea0ec0b314cc7cc67f93ba394cdd5d68 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 15:35:52 +0200 Subject: [PATCH 54/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Extract=20?= =?UTF-8?q?text=20while=20retaining=20segment=20id=20info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 50 ---------------------- extracted_text_test.py | 2 +- qurator/dinglehopper/ocr_files.py | 71 ++++++++++++++++++++++++++++--- 3 files changed, 65 insertions(+), 58 deletions(-) delete mode 100644 extracted_text.py diff --git a/extracted_text.py b/extracted_text.py deleted file mode 100644 index b37f341..0000000 --- a/extracted_text.py +++ /dev/null @@ -1,50 +0,0 @@ -import attr -import unicodedata -import enum - - -# TODO Use type annotations for attr.ib types when support 
for Python 3.5 is dropped -# TODO types are not validated (attr does not do this yet) - - -@attr.s(frozen=True) -class ExtractedText: - segments = attr.ib() - joiner = attr.ib(type=str) - - @property - def text(self): - return self.joiner.join(s.text for s in self.segments) - - def segment_id_for_pos(self, pos): - i = 0 - for s in self.segments: - if i <= pos < i + len(s.text): - return s.id - i += len(s.text) - if i <= pos < i + len(self.joiner): - return None - i += len(self.joiner) - - -class Normalization(enum.Enum): - NFC = 1 - NFC_MUFI = 2 - - -def normalize(text, normalization): - if normalization == Normalization.NFC: - return unicodedata.normalize('NFC', text) - else: - raise ValueError() - - -@attr.s(frozen=True) -class ExtractedTextSegment: - id = attr.ib(type=str) - text = attr.ib(type=str) - @text.validator - def check(self, attribute, value): - if normalize(value, self.normalization) != value: - raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(converter=Normalization, default=Normalization.NFC) diff --git a/extracted_text_test.py b/extracted_text_test.py index b84df87..82c3a53 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -1,6 +1,6 @@ import unicodedata import pytest -from extracted_text import ExtractedText, ExtractedTextSegment +from qurator.dinglehopper import ExtractedText, ExtractedTextSegment from uniseg.graphemecluster import grapheme_clusters from qurator.dinglehopper import seq_align from collections import namedtuple diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index b57a047..7d06dbe 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -3,9 +3,57 @@ from __future__ import division, print_function from warnings import warn from lxml import etree as ET +from lxml.etree import XMLSyntaxError import sys +import attr +import enum +import unicodedata + + +@attr.s(frozen=True) +class ExtractedText: + segments = 
attr.ib() + joiner = attr.ib(type=str) + # TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped + # TODO Types are not validated (attr does not do this yet) + + @property + def text(self): + return self.joiner.join(s.text for s in self.segments) + + def segment_id_for_pos(self, pos): + i = 0 + for s in self.segments: + if i <= pos < i + len(s.text): + return s.id + i += len(s.text) + if i <= pos < i + len(self.joiner): + return None + i += len(self.joiner) + # XXX Cache results + + +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 + + +def normalize(text, normalization): + if normalization == Normalization.NFC: + return unicodedata.normalize('NFC', text) + else: + raise ValueError() -from lxml.etree import XMLSyntaxError + +@attr.s(frozen=True) +class ExtractedTextSegment: + id = attr.ib(type=str) + text = attr.ib(type=str) + @text.validator + def check(self, attribute, value): + if normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC) def alto_namespace(tree): @@ -21,7 +69,7 @@ def alto_namespace(tree): raise ValueError('Not an ALTO tree') -def alto_text(tree): +def alto_extract(tree): """Extract text from the given ALTO ElementTree.""" nsmap = {'alto': alto_namespace(tree)} @@ -29,9 +77,15 @@ def alto_text(tree): lines = ( ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap)) - text_ = '\n'.join(lines) - return text_ + return ExtractedText((ExtractedTextSegment(None, line_text) for line_text in lines), '\n') + # TODO This currently does not extract any segment id, because we are + # clueless about the ALTO format. 
+ # FIXME needs to handle normalization + + +def alto_text(tree): + return alto_extract(tree).text def page_namespace(tree): @@ -47,7 +101,7 @@ def page_namespace(tree): raise ValueError('Not a PAGE tree') -def page_text(tree): +def page_extract(tree): """Extract text from the given PAGE content ElementTree.""" nsmap = {'page': page_namespace(tree)} @@ -80,10 +134,13 @@ def page_text(tree): # XXX Does a file have to have regions etc.? region vs lines etc. # Filter empty region texts region_texts = (t for t in region_texts if t) + return ExtractedText((ExtractedTextSegment(None, region_text) for region_text in region_texts), '\n') + # TODO This currently does not extract any segment id + # FIXME needs to handle normalization - text_ = '\n'.join(region_texts) - return text_ +def page_text(tree): + return page_extract(tree).text def text(filename): From 4e182e0794457da329ee6cdeb0b8c818046fcd4f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 15:37:34 +0200 Subject: [PATCH 55/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Extract=20?= =?UTF-8?q?text=20while=20retaining=20segment=20id=20info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../dinglehopper/tests/extracted_text_test.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename extracted_text_test.py => qurator/dinglehopper/tests/extracted_text_test.py (100%) diff --git a/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py similarity index 100% rename from extracted_text_test.py rename to qurator/dinglehopper/tests/extracted_text_test.py From 275ff32524886df5c14b313b996c65bb3ad2f6a4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 16:54:48 +0200 Subject: [PATCH 56/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Extract=20?= =?UTF-8?q?text=20while=20retaining=20segment=20id=20info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
qurator/dinglehopper/ocr_files.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 7d06dbe..a5187c5 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -4,6 +4,7 @@ from warnings import warn from lxml import etree as ET from lxml.etree import XMLSyntaxError +from contextlib import suppress import sys import attr import enum @@ -51,10 +52,20 @@ class ExtractedTextSegment: text = attr.ib(type=str) @text.validator def check(self, attribute, value): - if normalize(value, self.normalization) != value: + if value is not None and normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) normalization = attr.ib(converter=Normalization, default=Normalization.NFC) + @classmethod + def from_text_segment(cls, text_segment, nsmap): + """Build an ExtractedTextSegment from a PAGE content text element""" + + segment_id = text_segment.attrib['id'] + segment_text = None + with suppress(AttributeError): + segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + return cls(segment_id, segment_text) + def alto_namespace(tree): """Return the ALTO namespace used in the given ElementTree. 
@@ -106,13 +117,7 @@ def page_extract(tree): nsmap = {'page': page_namespace(tree)} - def region_text(region): - try: - return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text - except AttributeError: - return None - - region_texts = [] + regions = [] reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap) if reading_order is not None: for group in reading_order.iterfind('./*', namespaces=nsmap): @@ -122,20 +127,20 @@ def page_extract(tree): region_id = region_ref_indexed.attrib['regionRef'] region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) if region is not None: - region_texts.append(region_text(region)) + regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) else: warn('Not a TextRegion: "%s"' % region_id) else: raise NotImplementedError else: for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): - region_texts.append(region_text(region)) + regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) # XXX Does a file have to have regions etc.? region vs lines etc. 
# Filter empty region texts - region_texts = (t for t in region_texts if t) - return ExtractedText((ExtractedTextSegment(None, region_text) for region_text in region_texts), '\n') - # TODO This currently does not extract any segment id + regions = (r for r in regions if r.text is not None) + + return ExtractedText(regions, '\n') # FIXME needs to handle normalization From 1f6538b44c64d3defc28bbd34de195360c9c53d6 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 17:43:30 +0200 Subject: [PATCH 57/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Extract=20?= =?UTF-8?q?text=20while=20retaining=20segment=20id=20info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index a5187c5..fd89b03 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -148,21 +148,35 @@ def page_text(tree): return page_extract(tree).text -def text(filename): - """Read the text from the given file. +def plain_extract(filename): + with open(filename, 'r') as f: + return ExtractedText( + (ExtractedTextSegment('line %d' % no, line) for no, line in enumerate(f.readlines())), + '\n' + ) + + +def plain_text(filename): + return plain_extract(filename).text + + +def extract(filename): + """Extract the text from the given file. Supports PAGE, ALTO and falls back to plain text. 
""" - try: tree = ET.parse(filename) except XMLSyntaxError: - with open(filename, 'r') as f: - return f.read() + return plain_extract(filename) try: - return page_text(tree) + return page_extract(tree) except ValueError: - return alto_text(tree) + return alto_extract(tree) + + +def text(filename): + return extract(filename).text if __name__ == '__main__': From 48ad340428d277e9ac71c82a8197f69aee890c30 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 13:25:35 +0200 Subject: [PATCH 58/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Display=20?= =?UTF-8?q?segment=20id=20when=20hovering=20over=20a=20character=20differe?= =?UTF-8?q?nce?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/character_error_rate.py | 4 ++ qurator/dinglehopper/cli.py | 64 ++++++++++++++----- qurator/dinglehopper/edit_distance.py | 7 ++ qurator/dinglehopper/ocr_files.py | 2 +- qurator/dinglehopper/templates/report.html.j2 | 11 ++++ qurator/dinglehopper/templates/report.html.js | 13 ++-- qurator/dinglehopper/word_error_rate.py | 15 +++++ 7 files changed, 96 insertions(+), 20 deletions(-) diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 05cc931..e99f391 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -15,6 +15,10 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]: :return: character error rate and length of the reference """ d = distance(reference, compared) + # XXX + from .cli import ExtractedText + if isinstance(reference, ExtractedText): + reference = reference.text n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) if d == 0: diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 63bfd92..8e18b26 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -8,11 +8,11 @@ from markupsafe import escape from 
qurator.dinglehopper import * -def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): +def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): gtx = '' ocrx = '' - def format_thing(t, css_classes=None): + def format_thing(t, css_classes=None, id_=None): if t is None: html_t = none css_classes += ' ellipsis' @@ -21,19 +21,52 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): else: html_t = escape(t) + html_custom_attrs = "" + # XXX must sanitize id_ or do we trust the XML? + if id_: + html_custom_attrs = 'data-segment-id="{}"'.format(id_) + if css_classes: - return '{html_t}'.format(css_classes=css_classes, html_t=html_t) + return '{html_t}'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs) else: return '{html_t}'.format(html_t=html_t) - for k, (g, o) in enumerate(align(gt_things, ocr_things)): - if g == o: - css_classes = None - else: + if isinstance(gt_in, ExtractedText): + print(gt_in.text) + if not isinstance(ocr_in, ExtractedText): + raise TypeError() + # XXX splitting should be done in ExtractedText + gt_things = list(grapheme_clusters(gt_in.text)) + ocr_things = list(grapheme_clusters(ocr_in.text)) + else: + gt_things = gt_in + ocr_things = ocr_in + + + + g_pos = 0 + o_pos = 0 + for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)): + css_classes = None + gt_id = None + ocr_id = None + if g != o: css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k) + if isinstance(gt_in, ExtractedText): + gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None + ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None + # XXX note that deletions and inserts only produce one id + None, UI must + # support this, i.e. 
display for the one id produced + # XXX otherwise, it should always display for BOTH ids + + gtx += joiner + format_thing(g, css_classes, gt_id) + ocrx += joiner + format_thing(o, css_classes, ocr_id) + + if g is not None: + g_pos += len(g) + if o is not None: + o_pos += len(o) - gtx += joiner + format_thing(g, css_classes) - ocrx += joiner + format_thing(o, css_classes) return \ ''' @@ -51,20 +84,21 @@ def process(gt, ocr, report_prefix, *, metrics=True): Click on a wrapper. """ - gt_text = text(gt) - ocr_text = text(ocr) + gt_text = extract(gt) + ocr_text = extract(ocr) - gt_text = substitute_equivalences(gt_text) - ocr_text = substitute_equivalences(ocr_text) + # FIXME + #gt_text = substitute_equivalences(gt_text) + #ocr_text = substitute_equivalences(ocr_text) cer, n_characters = character_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text) - char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align) + char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·') gt_words = words_normalized(gt_text) ocr_words = words_normalized(ocr_text) - word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align) + word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯') def json_float(value): """Convert a float value to an JSON float. diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 8ca24d3..dc1cb24 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -8,6 +8,7 @@ import numpy as np from uniseg.graphemecluster import grapheme_clusters + def levenshtein_matrix(seq1: Sequence, seq2: Sequence): """Compute the matrix commonly computed to produce the Levenshtein distance. This is also known as the Wagner-Fischer algorithm. 
The matrix element at the bottom right contains the desired @@ -75,6 +76,12 @@ def distance(s1, s2): Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme clusters. This should be the correct way to compare two Unicode strings. """ + # XXX + from .cli import ExtractedText + if isinstance(s1, ExtractedText): + s1 = s1.text + if isinstance(s2, ExtractedText): + s2 = s2.text s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) return levenshtein(s1, s2) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index fd89b03..17868a7 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -138,7 +138,7 @@ def page_extract(tree): # XXX Does a file have to have regions etc.? region vs lines etc. # Filter empty region texts - regions = (r for r in regions if r.text is not None) + regions = [r for r in regions if r.text is not None] return ExtractedText(regions, '\n') # FIXME needs to handle normalization diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 index 0c2f464..f7b2efb 100644 --- a/qurator/dinglehopper/templates/report.html.j2 +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -26,12 +26,23 @@ border: 2px solid; border-radius: 5px; } + #status-box { + position: fixed; + background: grey; + color: white; + width: 100%; + height: 2em; + } +
foo
+ + +
{{ gt }}
diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js index ac43676..01f5323 100644 --- a/qurator/dinglehopper/templates/report.html.js +++ b/qurator/dinglehopper/templates/report.html.js @@ -4,11 +4,16 @@ function find_diff_class(classes) { $(document).ready(function() { $('.diff').mouseover(function() { - let c = find_diff_class($(this).attr('class')) - $('.' + c).addClass('diff-highlight') + let c = find_diff_class($(this).attr('class')); + $('.' + c).addClass('diff-highlight'); + + segment_id = $(this).attr('data-segment-id'); + $('#status-box').text(segment_id); }); $('.diff').mouseout(function() { - let c = find_diff_class($(this).attr('class')) - $('.' + c).removeClass('diff-highlight') + let c = find_diff_class($(this).attr('class')); + $('.' + c).removeClass('diff-highlight'); + + $('#status-box').text(''); }); }); diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index 7ed56e4..64eba0a 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -32,6 +32,11 @@ def words(s): cat = subcat[0] return cat in unwanted_categories or subcat in unwanted_subcategories + # XXX + from .cli import ExtractedText + if isinstance(s, ExtractedText): + s = s.text + # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters." 
for word in uniseg.wordbreak.words(s): @@ -42,10 +47,20 @@ def words(s): def words_normalized(s): + # XXX + from .cli import ExtractedText + if isinstance(s, ExtractedText): + s = s.text return words(unicodedata.normalize('NFC', s)) def word_error_rate_n(reference, compared) -> Tuple[float, int]: + # XXX + from .cli import ExtractedText + if isinstance(reference, ExtractedText): + reference = reference.text + if isinstance(compared, ExtractedText): + compared = compared.text if isinstance(reference, str): reference_seq = list(words_normalized(reference)) compared_seq = list(words_normalized(compared)) From d4e39d3d26ec6c975d7e02ac2ebae6549436d537 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 13:46:28 +0200 Subject: [PATCH 59/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Display=20?= =?UTF-8?q?segment=20id=20in=20the=20corresponding=20column?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 4 ++-- qurator/dinglehopper/templates/report.html.j2 | 6 +----- qurator/dinglehopper/templates/report.html.js | 4 ++-- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 8e18b26..2099c57 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -71,8 +71,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): return \ '''
-
{}
-
{}
+
{}
+
{}
'''.format(gtx, ocrx) diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 index f7b2efb..f829ef8 100644 --- a/qurator/dinglehopper/templates/report.html.j2 +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -26,7 +26,7 @@ border: 2px solid; border-radius: 5px; } - #status-box { + .status-box { position: fixed; background: grey; color: white; @@ -39,10 +39,6 @@ -
foo
- - -
{{ gt }}
diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js index 01f5323..0baaa30 100644 --- a/qurator/dinglehopper/templates/report.html.js +++ b/qurator/dinglehopper/templates/report.html.js @@ -8,12 +8,12 @@ $(document).ready(function() { $('.' + c).addClass('diff-highlight'); segment_id = $(this).attr('data-segment-id'); - $('#status-box').text(segment_id); + $(this).closest('div').find('.status-box').text(segment_id); }); $('.diff').mouseout(function() { let c = find_diff_class($(this).attr('class')); $('.' + c).removeClass('diff-highlight'); - $('#status-box').text(''); + $(this).closest('div').find('.status-box').text(''); }); }); From 2579e0220c41a02f0696dbdea41e29f216d30715 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 14:25:11 +0200 Subject: [PATCH 60/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Remove=20d?= =?UTF-8?q?ebug=20output?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 2099c57..13543a5 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -32,7 +32,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): return '{html_t}'.format(html_t=html_t) if isinstance(gt_in, ExtractedText): - print(gt_in.text) if not isinstance(ocr_in, ExtractedText): raise TypeError() # XXX splitting should be done in ExtractedText From a320d5fd8ff211b70095ecdf5b17927d470fd4b3 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 15:53:15 +0200 Subject: [PATCH 61/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Re-introdu?= =?UTF-8?q?ce=20"substitute=5Fequivalences"=20as=20Normalization.NFC=5FSBB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 22 +++++++++++++++++++--- 1 file changed, 19 
insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 17868a7..2d88498 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -5,6 +5,7 @@ from warnings import warn from lxml import etree as ET from lxml.etree import XMLSyntaxError from contextlib import suppress +from .substitute_equivalences import substitute_equivalences import sys import attr import enum @@ -36,16 +37,27 @@ class ExtractedText: class Normalization(enum.Enum): NFC = 1 - NFC_MUFI = 2 + NFC_MUFI = 2 # TODO + NFC_SBB = 3 def normalize(text, normalization): if normalization == Normalization.NFC: return unicodedata.normalize('NFC', text) + if normalization == Normalization.NFC_MUFI: + raise NotImplementedError() + if normalization == Normalization.NFC_SBB: + # XXX This needs to be redone + # https://github.com/qurator-spk/dinglehopper/issues/11 + return substitute_equivalences(text) else: raise ValueError() +# XXX hack +normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB) + + @attr.s(frozen=True) class ExtractedTextSegment: id = attr.ib(type=str) @@ -54,7 +66,7 @@ class ExtractedTextSegment: def check(self, attribute, value): if value is not None and normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(converter=Normalization, default=Normalization.NFC) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) @classmethod def from_text_segment(cls, text_segment, nsmap): @@ -64,6 +76,7 @@ class ExtractedTextSegment: segment_text = None with suppress(AttributeError): segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = normalize_sbb(segment_text) return cls(segment_id, segment_text) @@ -89,7 +102,10 @@ def alto_extract(tree): ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) for line in 
tree.iterfind('.//alto:TextLine', namespaces=nsmap)) - return ExtractedText((ExtractedTextSegment(None, line_text) for line_text in lines), '\n') + return ExtractedText( + (ExtractedTextSegment(None, normalize_sbb(line_text)) for line_text in lines), + '\n' + ) # TODO This currently does not extract any segment id, because we are # clueless about the ALTO format. # FIXME needs to handle normalization From 257e4986cc161934caf9cc1e8d55c2b8eb91f566 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 15:56:01 +0200 Subject: [PATCH 62/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Use=20a=20?= =?UTF-8?q?Bootstrap=20tooltip=20for=20the=20segment=20id?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 10 ++++++---- qurator/dinglehopper/templates/report.html.j2 | 7 ------- qurator/dinglehopper/templates/report.html.js | 16 ++++++---------- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 13543a5..ea0c9bb 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -22,9 +22,11 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): html_t = escape(t) html_custom_attrs = "" - # XXX must sanitize id_ or do we trust the XML? + + # Set Bootstrap tooltip to the segment id if id_: - html_custom_attrs = 'data-segment-id="{}"'.format(id_) + html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_) + # XXX must sanitize id_ or do we trust the XML? if css_classes: return '{html_t}'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs) @@ -70,8 +72,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): return \ '''
-
{}
-
{}
+
{}
+
{}
'''.format(gtx, ocrx) diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 index f829ef8..0c2f464 100644 --- a/qurator/dinglehopper/templates/report.html.j2 +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -26,13 +26,6 @@ border: 2px solid; border-radius: 5px; } - .status-box { - position: fixed; - background: grey; - color: white; - width: 100%; - height: 2em; - } diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js index 0baaa30..4c2ba28 100644 --- a/qurator/dinglehopper/templates/report.html.js +++ b/qurator/dinglehopper/templates/report.html.js @@ -1,19 +1,15 @@ function find_diff_class(classes) { - return classes.split(/\s+/).find(x => x.match(/.diff\d.*/)); + return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/))); } $(document).ready(function() { - $('.diff').mouseover(function() { - let c = find_diff_class($(this).attr('class')); - $('.' + c).addClass('diff-highlight'); + /* Enable Bootstrap tooltips */ + $('[data-toggle="tooltip"]').tooltip(); - segment_id = $(this).attr('data-segment-id'); - $(this).closest('div').find('.status-box').text(segment_id); + $('.diff').mouseover(function() { + find_diff_class($(this).attr('class')).addClass('diff-highlight'); }); $('.diff').mouseout(function() { - let c = find_diff_class($(this).attr('class')); - $('.' + c).removeClass('diff-highlight'); - - $(this).closest('div').find('.status-box').text(''); + find_diff_class($(this).attr('class')).removeClass('diff-highlight'); }); }); From a61c9356244275b79ae98f9eafd67016ebeb8359 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 16:08:56 +0200 Subject: [PATCH 63/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Move=20Pyt?= =?UTF-8?q?hon=203.5=20XXXs=20to=20a=20GitHub=20issue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/qurator-spk/dinglehopper/issues/20. 
--- qurator/dinglehopper/ocr_files.py | 1 - qurator/dinglehopper/tests/test_integ_cli_valid_json.py | 2 -- qurator/dinglehopper/tests/test_integ_ocrd_cli.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 2d88498..2ceebfd 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -16,7 +16,6 @@ import unicodedata class ExtractedText: segments = attr.ib() joiner = attr.ib(type=str) - # TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped # TODO Types are not validated (attr does not do this yet) @property diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py index 5699700..35421bb 100644 --- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -10,7 +10,6 @@ from ..cli import process def test_cli_json(tmp_path): """Test that the cli/process() yields a loadable JSON report""" - # XXX Path.__str__() is necessary for Python 3.5 with working_directory(str(tmp_path)): with open('gt.txt', 'w') as gtf: gtf.write('AAAAA') @@ -26,7 +25,6 @@ def test_cli_json(tmp_path): def test_cli_json_cer_is_infinity(tmp_path): """Test that the cli/process() yields a loadable JSON report when CER == inf""" - # XXX Path.__str__() is necessary for Python 3.5 with working_directory(str(tmp_path)): with open('gt.txt', 'w') as gtf: gtf.write('') # Empty to yield CER == inf diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py index 41da748..3d78f57 100644 --- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py +++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py @@ -17,8 +17,6 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') def test_ocrd_cli(tmp_path): """Test OCR-D interface""" - # XXX Path.str() is necessary for Python 
3.5 - # Copy test workspace test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162' test_workspace_dir = tmp_path / 'test_ocrd_cli' From 0c33e8441587175e9bc0c8ca13877e533917842e Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 17:01:28 +0200 Subject: [PATCH 64/87] =?UTF-8?q?=F0=9F=93=93=20dinglehopper:=20Document?= =?UTF-8?q?=20editops()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/edit_distance.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index dc1cb24..a6643c7 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -123,7 +123,11 @@ def seq_editops(seq1, seq2): def editops(word1, word2): - # XXX Note that this returns indices to the _grapheme clusters_, not characters! + """ + Return sequence of edit operations transforming one string to another. + + Note that this returns indices to the _grapheme clusters_, not characters! 
+ """ word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1))) word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2))) return seq_editops(word1, word2) From c432cb505a62aabe8a0225d644aeba4cde90acea Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 17:01:56 +0200 Subject: [PATCH 65/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Clean=20up?= =?UTF-8?q?=20test=5Flines=5Fsimilar()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/tests/test_align.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py index cc5cb43..23483f8 100644 --- a/qurator/dinglehopper/tests/test_align.py +++ b/qurator/dinglehopper/tests/test_align.py @@ -78,7 +78,8 @@ def test_lines(): def test_lines_similar(): - """Test comparing list of lines while using a "weaker equivalence". + """ + Test comparing list of lines while using a "weaker equivalence". This mainly serves as documentation. """ @@ -88,7 +89,14 @@ def test_lines_similar(): self._string = string def __eq__(self, other): - return distance(self._string, other._string) < 2 # XXX NOT the final version + # Just an example! + min_len = min(len(self._string), len(other._string)) + if min_len > 0: + normalized_distance = distance(self._string, other._string)/min_len + similar = normalized_distance < 0.1 + else: + similar = False + return similar def __ne__(self, other): return not self.__eq__(other) @@ -106,3 +114,6 @@ def test_lines_similar(): left, right = unzip(result) assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')] assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')] + + # Test __eq__ (i.e. is it a substitution or a similar string?) 
+ assert list(left)[0] == list(right)[0] From 0cf7ff472143e24c06477c4d3a1056992a753ed7 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 17:04:07 +0200 Subject: [PATCH 66/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Remove=20o?= =?UTF-8?q?bsolete=20XXX=20about=20the=20PAGE=20hierarchy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 2ceebfd..5ce0bcd 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -151,7 +151,6 @@ def page_extract(tree): for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) - # XXX Does a file have to have regions etc.? region vs lines etc. # Filter empty region texts regions = [r for r in regions if r.text is not None] From c010a7f05e4a8e0909006036c91e42629e0713be Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 18:06:42 +0200 Subject: [PATCH 67/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Calculate?= =?UTF-8?q?=20segment=20ids=20once,=20on=20the=20first=20call?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 5ce0bcd..180ecd3 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -5,6 +5,7 @@ from warnings import warn from lxml import etree as ET from lxml.etree import XMLSyntaxError from contextlib import suppress +from itertools import repeat from .substitute_equivalences import substitute_equivalences import sys import attr @@ -22,16 +23,20 @@ class ExtractedText: def text(self): return 
self.joiner.join(s.text for s in self.segments) + _segment_id_for_pos = None + def segment_id_for_pos(self, pos): - i = 0 - for s in self.segments: - if i <= pos < i + len(s.text): - return s.id - i += len(s.text) - if i <= pos < i + len(self.joiner): - return None - i += len(self.joiner) - # XXX Cache results + # Calculate segment ids once, on the first call + if not self._segment_id_for_pos: + segment_id_for_pos = [] + for s in self.segments: + segment_id_for_pos.extend(repeat(s.id, len(s.text))) + segment_id_for_pos.extend(repeat(None, len(self.joiner))) + # This is frozen, so we have to jump through the hoop: + object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) + assert self._segment_id_for_pos + + return self._segment_id_for_pos[pos] class Normalization(enum.Enum): From 079be203bd66e39a1b0b69ac6609418d6e9fbcb2 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:04:24 +0200 Subject: [PATCH 68/87] =?UTF-8?q?=F0=9F=90=9B=20dinglehopper:=20Fix=20test?= =?UTF-8?q?s=20to=20deal=20with=20new=20normalization=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 5 +- .../dinglehopper/tests/test_integ_align.py | 8 +- .../test_integ_character_error_rate_ocr.py | 8 +- .../tests/test_integ_cli_valid_json.py | 5 +- .../tests/test_integ_edit_distance_ocr.py | 4 +- .../tests/test_integ_word_error_rate_ocr.py | 5 +- qurator/dinglehopper/tests/test_ocr_files.py | 82 +++++++++++++------ 7 files changed, 85 insertions(+), 32 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 180ecd3..e1267f7 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -15,7 +15,7 @@ import unicodedata @attr.s(frozen=True) class ExtractedText: - segments = attr.ib() + segments = attr.ib(converter=list) joiner = attr.ib(type=str) # TODO Types are not validated (attr does not do this yet) @@ -80,6 
+80,7 @@ class ExtractedTextSegment: segment_text = None with suppress(AttributeError): segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = segment_text or '' segment_text = normalize_sbb(segment_text) return cls(segment_id, segment_text) @@ -157,7 +158,7 @@ def page_extract(tree): regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) # Filter empty region texts - regions = [r for r in regions if r.text is not None] + regions = (r for r in regions if r.text is not None) return ExtractedText(regions, '\n') # FIXME needs to handle normalization diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py index df1e230..b35974b 100644 --- a/qurator/dinglehopper/tests/test_integ_align.py +++ b/qurator/dinglehopper/tests/test_integ_align.py @@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_align_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. - # → 4 elements in the alignment should be different. + # → 2 elements in the alignment should be different, the ligature is + # (currently) not counted due to normalization. # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. 
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) result = list(align(gt, ocr)) - assert sum(left != right for left, right in result) == 4 + for left, right in result: + if left != right: + print(left, right) + assert sum(left != right for left, right in result) == 2 diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py index c27cd31..1c3bf52 100644 --- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py @@ -4,6 +4,7 @@ import os import pytest from lxml import etree as ET +from uniseg.graphemecluster import grapheme_clusters from .. import character_error_rate, page_text, alto_text @@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_character_error_rate_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + # The fi ligature does not count. 
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n + + gt_len = len(list(grapheme_clusters(gt))) + expected_cer = 2/gt_len + + assert character_error_rate(gt, ocr) == expected_cer @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py index 35421bb..d71bc14 100644 --- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -1,4 +1,3 @@ -import os import json import pytest @@ -16,7 +15,11 @@ def test_cli_json(tmp_path): with open('ocr.txt', 'w') as ocrf: ocrf.write('AAAAB') + with open('gt.txt', 'r') as gtf: + print(gtf.read()) process('gt.txt', 'ocr.txt', 'report') + with open('report.json', 'r') as jsonf: + print(jsonf.read()) with open('report.json', 'r') as jsonf: j = json.load(jsonf) assert j['cer'] == pytest.approx(0.2) diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py index 2857d56..cbe12f8 100644 --- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_distance_between_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + # Due to normalization, we don't count the ligature. 
+ # → 2 differences gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert distance(gt, ocr) == 4 + assert distance(gt, ocr) == 2 @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py index 1d2dead..f5c922b 100644 --- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py +++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @pytest.mark.integration def test_word_error_rate_between_page_files(): - # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words + # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words, + # the ligature does not count → 2 errors gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line assert len(list(words(gt))) == gt_word_count ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) - assert word_error_rate(gt, ocr) == 3/gt_word_count + assert word_error_rate(gt, ocr) == 2/gt_word_count @pytest.mark.integration diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py index dd9377a..3291152 100644 --- a/qurator/dinglehopper/tests/test_ocr_files.py +++ b/qurator/dinglehopper/tests/test_ocr_files.py @@ -6,7 +6,8 @@ import textwrap import pytest -from .. import alto_namespace, alto_text, page_namespace, page_text, text +from .util import working_directory +from .. 
import alto_namespace, alto_text, page_namespace, page_text, plain_text, text data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') @@ -49,27 +50,51 @@ def test_page_namespace(): def test_page_test(): tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) result = page_text(tree) + + # We are currently normalizing on extraction, so the text is normalized. + # + # expected = textwrap.dedent("""\ + # ber die vielen Sorgen wegen deelben vergaß + # Hartkopf, der Frau Amtmnnin das ver⸗ + # ſproene zu berliefern. — Ein Erpreer + # wurde an ihn abgeſit, um ihn ums Him⸗ + # melswien zu ſagen, daß er das Verſproene + # glei den Augenbli berbringen mte, die + # Frau Amtmnnin htte  auf ihn verlaen, + # und nun wßte e nit, was e anfangen + # ſote. Den Augenbli ſote er kommen, + # ſon vergieng e in ihrer Ang. — Die + # Ge wren ſon angekommen, und es fehlte + # ihr do no an aem. — + # Hartkopf mußte  er bennen, und + # endli na langem Nadenken fiel es ihm er + # wieder ein. — Er langte den Zettel aus dem + # Accisbue heraus, und ſagte ſeiner Frau, daß + # e das, was da wre, herbeyſaffen mte. + # Jndeß mangelten do einige Generalia, die + # alſo wegfielen. — Hartkopf gieng ſelb + # mit und berbrate es. —""") expected = textwrap.dedent("""\ - ber die vielen Sorgen wegen deelben vergaß - Hartkopf, der Frau Amtmnnin das ver⸗ - ſproene zu berliefern. — Ein Erpreer - wurde an ihn abgeſit, um ihn ums Him⸗ - melswien zu ſagen, daß er das Verſproene - glei den Augenbli berbringen mte, die - Frau Amtmnnin htte  auf ihn verlaen, - und nun wßte e nit, was e anfangen - ſote. Den Augenbli ſote er kommen, - ſon vergieng e in ihrer Ang. — Die - Ge wren ſon angekommen, und es fehlte - ihr do no an aem. — - Hartkopf mußte  er bennen, und - endli na langem Nadenken fiel es ihm er - wieder ein. 
— Er langte den Zettel aus dem - Accisbue heraus, und ſagte ſeiner Frau, daß - e das, was da wre, herbeyſaffen mte. - Jndeß mangelten do einige Generalia, die - alſo wegfielen. — Hartkopf gieng ſelb - mit und berbrate es. —""") + über die vielen Sorgen wegen deſſelben vergaß + Hartkopf, der Frau Amtmännin das ver- + ſprochene zu überliefern. – Ein Erpreſſer + wurde an ihn abgeſchickt, um ihn ums Him- + melswillen zu ſagen, daß er das Verſprochene + gleich den Augenblick überbringen möchte, die + Frau Amtmännin hätte ſich auf ihn verlaſſen, + und nun wüßte ſie nicht, was ſie anfangen + ſollte. Den Augenblick ſollte er kommen, + ſonſt vergieng ſie in ihrer Angſt. – Die + Gäſte wären ſchon angekommen, und es fehlte + ihr doch noch an allem. – + Hartkopf mußte ſich erſt beſinnen, und + endlich nach langem Nachdenken fiel es ihm erſt + wieder ein. – Er langte den Zettel aus dem + Accisbuche heraus, und ſagte ſeiner Frau, daß + ſie das, was da wäre, herbeyſchaffen möchte. + Jndeß mangelten doch einige Generalia, die + alſo wegfielen. – Hartkopf gieng ſelbſt + mit und überbrachte es. –""") assert result == expected @@ -92,7 +117,8 @@ def test_page_order(): tree = ET.parse(os.path.join(data_dir, 'order.page.xml')) result = page_text(tree) - assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL) + print(result) + assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL) def test_page_mixed_regions(): @@ -106,5 +132,15 @@ def test_page_mixed_regions(): def test_text(): assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) - assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) + assert "wieder ein. 
– Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt')) + + +def test_plain(tmp_path): + with working_directory(str(tmp_path)): + with open('ocr.txt', 'w') as ocrf: + ocrf.write('AAAAB') + + result = plain_text('ocr.txt') + expected = 'AAAAB' + assert result == expected From 4469af62c8240ff44d4e05014cdff3b691f9c006 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:05:33 +0200 Subject: [PATCH 69/87] =?UTF-8?q?=F0=9F=8E=A8=20dinglehopper:=20Unfuck=20s?= =?UTF-8?q?ubstitutions=20a=20bit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../dinglehopper/substitute_equivalences.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py index 1b7e0cf..39be276 100644 --- a/qurator/dinglehopper/substitute_equivalences.py +++ b/qurator/dinglehopper/substitute_equivalences.py @@ -1,21 +1,15 @@ import unicodedata -def substitute_equivalences(s): +def unjoin_ligatures(s): + """Unjoin ligatures, i.e. 
ff becomes ff.""" - # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR - # It might make sense to use different rules for GT and for the different OCR equivalences = { - '': 'ü', '': 'ſſ', "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I - '': 'ä', '': 'ch', - '==': '–', # → en-dash - '—': '–', # em-dash → en-dash '': 'ck', '': 'll', - '': 'ö', '': 'ſi', '': 'ſt', 'fi': 'fi', @@ -23,12 +17,7 @@ def substitute_equivalences(s): 'fl': 'fl', 'ffi': 'ffi', '': 'ct', - '’': '\'', - '⸗': '-', '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ - 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E - 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E - 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E '\uf532': 'as', # eMOP: Latin small ligature as '\uf533': 'is', # eMOP: Latin small ligature is '\uf534': 'us', # eMOP: Latin small ligature us @@ -37,10 +26,32 @@ def substitute_equivalences(s): '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? 
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST + } + s = unicodedata.normalize('NFC', s) + for fr, to in equivalences.items(): + s = s.replace(fr, to) + return s + + +def substitute_equivalences(s): + # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR + # It might make sense to use different rules for GT and for the different OCR + equivalences = { + '': 'ü', + '': 'ä', + '==': '–', # → en-dash + '—': '–', # em-dash → en-dash + '': 'ö', + '’': '\'', + '⸗': '-', + 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E + 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E + 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT } s = unicodedata.normalize('NFC', s) + s = unjoin_ligatures(s) for fr, to in equivalences.items(): s = s.replace(fr, to) return s From 9f05e6ca4cef16480a2f44649c6d56edab03cb1c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:19:38 +0200 Subject: [PATCH 70/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Remove=20o?= =?UTF-8?q?bsolete=20XXX=20about=20None=20ids?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/tests/extracted_text_test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 82c3a53..2e6a9e6 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -52,9 +52,6 @@ def test_align(): for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): left_id = test1.segment_id_for_pos(left_pos) if left is not None else None right_id = test2.segment_id_for_pos(right_pos) if right is not None else None - # XXX note that deletions and inserts only produce one id + None, UI must - # support 
this, i.e. display for the one id produced - # XXX otherwise, it should always display for BOTH ids el = AlignmentElement(left, right, left_id, right_id) alignment.append(el) if left is not None: From 37edc0336f9d8f654c8311943f2c524d3ced3a59 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:21:18 +0200 Subject: [PATCH 71/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Remove=20o?= =?UTF-8?q?bsolete=20XXX=20that=20has=20a=20GitHub=20issue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index e1267f7..1652b71 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -51,8 +51,6 @@ def normalize(text, normalization): if normalization == Normalization.NFC_MUFI: raise NotImplementedError() if normalization == Normalization.NFC_SBB: - # XXX This needs to be redone - # https://github.com/qurator-spk/dinglehopper/issues/11 return substitute_equivalences(text) else: raise ValueError() From 8c5f7c73d55e10104a7a6733aab5f5ea5f80c07b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:24:58 +0200 Subject: [PATCH 72/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Replace=20?= =?UTF-8?q?XXX=20with=20an=20actual=20comment?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index ea0c9bb..81acb41 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -56,9 +56,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): if isinstance(gt_in, ExtractedText): gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None 
- # XXX note that deletions and inserts only produce one id + None, UI must - # support this, i.e. display for the one id produced - # XXX otherwise, it should always display for BOTH ids + # Deletions and inserts only produce one id + None, UI must + # support this, i.e. display for the one id produced gtx += joiner + format_thing(g, css_classes, gt_id) ocrx += joiner + format_thing(o, css_classes, ocr_id) From d39f74f11a1c329bd53883c010574b95d982c05b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:29:50 +0200 Subject: [PATCH 73/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Remove=20o?= =?UTF-8?q?bsolete=20normalization-related=20FIXME?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 81acb41..2889e46 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -87,10 +87,6 @@ def process(gt, ocr, report_prefix, *, metrics=True): gt_text = extract(gt) ocr_text = extract(ocr) - # FIXME - #gt_text = substitute_equivalences(gt_text) - #ocr_text = substitute_equivalences(ocr_text) - cer, n_characters = character_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text) From d4848100388a5d24983286ef23dca285c78d89f6 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:43:25 +0200 Subject: [PATCH 74/87] =?UTF-8?q?=E2=9C=A8=20dinglehopper:=20Validate=20re?= =?UTF-8?q?ad=20segment=20ids?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 1 - qurator/dinglehopper/ocr_files.py | 11 +++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 2889e46..9c963c1 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -26,7 +26,6 @@ def 
gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): # Set Bootstrap tooltip to the segment id if id_: html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_) - # XXX must sanitize id_ or do we trust the XML? if css_classes: return '{html_t}'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 1652b71..d3918d1 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -11,6 +11,7 @@ import sys import attr import enum import unicodedata +import re @attr.s(frozen=True) @@ -30,7 +31,7 @@ class ExtractedText: if not self._segment_id_for_pos: segment_id_for_pos = [] for s in self.segments: - segment_id_for_pos.extend(repeat(s.id, len(s.text))) + segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) segment_id_for_pos.extend(repeat(None, len(self.joiner))) # This is frozen, so we have to jump through the hoop: object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) @@ -62,7 +63,13 @@ normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB) @attr.s(frozen=True) class ExtractedTextSegment: - id = attr.ib(type=str) + segment_id = attr.ib(type=str) + @segment_id.validator + def check(self, attribute, value): + if value is None: + return + if not re.match(r'[\w\d_-]+', value): + raise ValueError('Malformed segment id "{}"'.format(value)) text = attr.ib(type=str) @text.validator def check(self, attribute, value): From 6ab38f1bda653aa6bcbc7d1cdbd1dd737699ae97 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:59:37 +0200 Subject: [PATCH 75/87] =?UTF-8?q?=F0=9F=8E=A8=20dinglehopper:=20Make=20PyC?= =?UTF-8?q?harm=20happier=20with=20the=20type=20hinting,=20newlines=20etc.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/align.py | 8 ++++---- qurator/dinglehopper/edit_distance.py | 1 - 
qurator/dinglehopper/ocr_files.py | 12 ++++++++---- qurator/dinglehopper/tests/test_integ_ocrd_cli.py | 2 -- qurator/dinglehopper/tests/util.py | 4 ++-- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py index ab44760..87febb7 100644 --- a/qurator/dinglehopper/align.py +++ b/qurator/dinglehopper/align.py @@ -28,16 +28,16 @@ def seq_align(s1, s2): if o: if o[0] == 'insert': - yield (None, s2[j]) + yield None, s2[j] j += 1 elif o[0] == 'delete': - yield (s1[i], None) + yield s1[i], None i += 1 elif o[0] == 'replace': - yield (s1[i], s2[j]) + yield s1[i], s2[j] i += 1 j += 1 else: - yield (s1[i], s2[j]) + yield s1[i], s2[j] i += 1 j += 1 diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index a6643c7..284b676 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -8,7 +8,6 @@ import numpy as np from uniseg.graphemecluster import grapheme_clusters - def levenshtein_matrix(seq1: Sequence, seq2: Sequence): """Compute the matrix commonly computed to produce the Levenshtein distance. This is also known as the Wagner-Fischer algorithm. 
The matrix element at the bottom right contains the desired diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index d3918d1..a048b1e 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,5 +1,6 @@ from __future__ import division, print_function +from typing import Optional from warnings import warn from lxml import etree as ET @@ -58,21 +59,24 @@ def normalize(text, normalization): # XXX hack -normalize_sbb = lambda t: normalize(t, Normalization.NFC_SBB) +def normalize_sbb(t): + return normalize(t, Normalization.NFC_SBB) @attr.s(frozen=True) class ExtractedTextSegment: - segment_id = attr.ib(type=str) + segment_id = attr.ib(type=Optional[str]) + @segment_id.validator - def check(self, attribute, value): + def check(self, _, value): if value is None: return if not re.match(r'[\w\d_-]+', value): raise ValueError('Malformed segment id "{}"'.format(value)) text = attr.ib(type=str) + @text.validator - def check(self, attribute, value): + def check(self, _, value): if value is not None and normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py index 3d78f57..75bb816 100644 --- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py +++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py @@ -1,11 +1,9 @@ import os -import re import shutil import json from pathlib import Path from click.testing import CliRunner -import pytest from .util import working_directory diff --git a/qurator/dinglehopper/tests/util.py b/qurator/dinglehopper/tests/util.py index 52b7506..1f224e5 100644 --- a/qurator/dinglehopper/tests/util.py +++ b/qurator/dinglehopper/tests/util.py @@ -21,8 +21,8 @@ def diffprint(x, y): _diffprint(x, y) -def unzip(l): - return zip(*l) +def 
unzip(an_iterable_of_tuples): + return zip(*an_iterable_of_tuples) class working_directory: From 7e3dafd3bcd6ab5d9c45622d1c233e781324f04c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 10 Aug 2020 18:03:00 +0200 Subject: [PATCH 76/87] =?UTF-8?q?=F0=9F=94=A7=20dinglehopper:=20Add=20PyCh?= =?UTF-8?q?arm=20code=20style=20config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/codeStyles/codeStyleConfig.xml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .idea/codeStyles/codeStyleConfig.xml diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 0000000..a55e7a1 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ -0,0 +1,5 @@ + + + + \ No newline at end of file From db6292611fb739baf20038ac0a7e63847bd6a96f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 7 Oct 2020 16:07:27 +0200 Subject: [PATCH 77/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Remove=20m?= =?UTF-8?q?erged=20text=20extraction=20test=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 51 ------------------------------------------ extracted_text_test.py | 22 ------------------ 2 files changed, 73 deletions(-) delete mode 100644 extracted_text.py delete mode 100644 extracted_text_test.py diff --git a/extracted_text.py b/extracted_text.py deleted file mode 100644 index c84c77b..0000000 --- a/extracted_text.py +++ /dev/null @@ -1,51 +0,0 @@ -import attr -import unicodedata -import enum - - -# TODO handle grapheme cluster positions? 
-# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped -# TODO types are not validated (attr does not do this yet) - - -@attr.s(frozen=True) -class ExtractedText: - segments = attr.ib() - joiner = attr.ib(type=str) - - @property - def text(self): - return self.joiner.join(s.text for s in self.segments) - - def segment_id_for_pos(self, pos): - i = 0 - for s in self.segments: - if i <= pos < i + len(s.text): - return s.id - i += len(s.text) - if i <= pos < i + len(self.joiner): - return None - i += len(self.joiner) - - -class Normalization(enum.Enum): - NFC = 1 - NFC_MUFI = 2 - - -def normalize(text, normalization): - if normalization == Normalization.NFC: - return unicodedata.normalize('NFC', text) - else: - raise ValueError() - - -@attr.s(frozen=True) -class ExtractedTextSegment: - id = attr.ib(type=str) - text = attr.ib(type=str) - @text.validator - def check(self, attribute, value): - if normalize(value, self.normalization) != value: - raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(converter=Normalization, default=Normalization.NFC) diff --git a/extracted_text_test.py b/extracted_text_test.py deleted file mode 100644 index 4919a76..0000000 --- a/extracted_text_test.py +++ /dev/null @@ -1,22 +0,0 @@ -import unicodedata -import pytest -from extracted_text import ExtractedText, ExtractedTextSegment - - -def test_text(): - test1 = ExtractedText([ - ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), - ExtractedTextSegment('s2', 'bazinga') - ], ' ') - - assert test1.text == 'foo bar bazinga' - assert test1.segment_id_for_pos(0) == 's0' - assert test1.segment_id_for_pos(3) is None - assert test1.segment_id_for_pos(10) == 's2' - - -def test_normalization_check(): - with pytest.raises(ValueError, match=r'.*is not normalized.*'): - ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) - assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) From 
96b55f1806fa4467cd20d5baa9beadffb358bba9 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 7 Oct 2020 18:31:52 +0200 Subject: [PATCH 78/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Hierarchic?= =?UTF-8?q?al=20text=20representation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 119 +++++++++++------- .../dinglehopper/tests/extracted_text_test.py | 38 +++--- 2 files changed, 90 insertions(+), 67 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index a048b1e..2b8b0de 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -15,15 +15,53 @@ import unicodedata import re +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 # TODO + NFC_SBB = 3 + + @attr.s(frozen=True) class ExtractedText: - segments = attr.ib(converter=list) - joiner = attr.ib(type=str) - # TODO Types are not validated (attr does not do this yet) + segment_id = attr.ib(type=Optional[str]) + + @segment_id.validator + def check(self, _, value): + if value is None: + return + if not re.match(r'[\w\d_-]+', value): + raise ValueError('Malformed segment id "{}"'.format(value)) + + # An object contains either + # a. _text itself + # b. 
or segments (ExtractedText) and a joiner + # TODO validator + + segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list)) + joiner = attr.ib(type=Optional[str]) + _text = attr.ib(type=Optional[str]) + + @segments.validator + def check(self, _, value): + if value is not None and self._text is not None: + raise ValueError("Can't have both segments and text") + + @_text.validator + def check(self, _, value): + if value is not None and normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + + normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) @property def text(self): - return self.joiner.join(s.text for s in self.segments) + if self._text is not None: + if self._text == '': + return None + else: + return self._text + else: + return self.joiner.join(s.text for s in self.segments) _segment_id_for_pos = None @@ -34,17 +72,30 @@ class ExtractedText: for s in self.segments: segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) segment_id_for_pos.extend(repeat(None, len(self.joiner))) + segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)] # This is frozen, so we have to jump through the hoop: object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) assert self._segment_id_for_pos return self._segment_id_for_pos[pos] + @classmethod + def from_text_segment(cls, text_segment, nsmap): + """Build an ExtractedText from a PAGE content text element""" + + segment_id = text_segment.attrib['id'] + segment_text = None + with suppress(AttributeError): + segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = segment_text or '' + segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization + segment_text = segment_text or '' + return cls(segment_id, None, None, segment_text) + + @classmethod + def from_text(cls, text): + return cls(None, None, None, text) -class 
Normalization(enum.Enum): - NFC = 1 - NFC_MUFI = 2 # TODO - NFC_SBB = 3 def normalize(text, normalization): @@ -63,37 +114,6 @@ def normalize_sbb(t): return normalize(t, Normalization.NFC_SBB) -@attr.s(frozen=True) -class ExtractedTextSegment: - segment_id = attr.ib(type=Optional[str]) - - @segment_id.validator - def check(self, _, value): - if value is None: - return - if not re.match(r'[\w\d_-]+', value): - raise ValueError('Malformed segment id "{}"'.format(value)) - text = attr.ib(type=str) - - @text.validator - def check(self, _, value): - if value is not None and normalize(value, self.normalization) != value: - raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) - - @classmethod - def from_text_segment(cls, text_segment, nsmap): - """Build an ExtractedTextSegment from a PAGE content text element""" - - segment_id = text_segment.attrib['id'] - segment_text = None - with suppress(AttributeError): - segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text - segment_text = segment_text or '' - segment_text = normalize_sbb(segment_text) - return cls(segment_id, segment_text) - - def alto_namespace(tree): """Return the ALTO namespace used in the given ElementTree. @@ -117,12 +137,14 @@ def alto_extract(tree): for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap)) return ExtractedText( - (ExtractedTextSegment(None, normalize_sbb(line_text)) for line_text in lines), - '\n' + None, + (ExtractedText.from_text(normalize_sbb(line_text)) for line_text in lines), + '\n', + None ) + # FIXME hardcoded SBB normalization # TODO This currently does not extract any segment id, because we are # clueless about the ALTO format. 
- # FIXME needs to handle normalization def alto_text(tree): @@ -157,20 +179,19 @@ def page_extract(tree): region_id = region_ref_indexed.attrib['regionRef'] region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) if region is not None: - regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) + regions.append(ExtractedText.from_text_segment(region, nsmap)) else: warn('Not a TextRegion: "%s"' % region_id) else: raise NotImplementedError else: for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): - regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) + regions.append(ExtractedText.from_text_segment(region, nsmap)) # Filter empty region texts regions = (r for r in regions if r.text is not None) - return ExtractedText(regions, '\n') - # FIXME needs to handle normalization + return ExtractedText(None, regions, '\n', None) def page_text(tree): @@ -180,8 +201,10 @@ def page_text(tree): def plain_extract(filename): with open(filename, 'r') as f: return ExtractedText( - (ExtractedTextSegment('line %d' % no, line) for no, line in enumerate(f.readlines())), - '\n' + None, + [ExtractedText('line %d' % no, None, None, line) for no, line in enumerate(f.readlines())], + '\n', + None ) diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 2e6a9e6..8cac4c1 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -1,17 +1,17 @@ import unicodedata import pytest -from qurator.dinglehopper import ExtractedText, ExtractedTextSegment +from qurator.dinglehopper import ExtractedText from uniseg.graphemecluster import grapheme_clusters from qurator.dinglehopper import seq_align from collections import namedtuple def test_text(): - test1 = ExtractedText([ - ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), - ExtractedTextSegment('s2', 'bazinga') - ], ' ') + test1 = 
ExtractedText(None, [ + ExtractedText('s0', None, None, 'foo'), + ExtractedText('s1', None, None, 'bar'), + ExtractedText('s2', None, None, 'bazinga') + ], ' ', None) assert test1.text == 'foo bar bazinga' assert test1.segment_id_for_pos(0) == 's0' @@ -21,8 +21,8 @@ def test_text(): def test_normalization_check(): with pytest.raises(ValueError, match=r'.*is not normalized.*'): - ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) - assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) + ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ')) + assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ')) AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id') @@ -36,17 +36,17 @@ def test_align(): not Python characters. """ - test1 = ExtractedText([ - ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), - ExtractedTextSegment('s2', 'batzinga') - ], ' ') - test2 = ExtractedText([ - ExtractedTextSegment('x0', 'foo'), - ExtractedTextSegment('x1', 'bar'), - ExtractedTextSegment('x2', '.'), # extra . - ExtractedTextSegment('x3', 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters - ], ' ') + test1 = ExtractedText(None, [ + ExtractedText('s0', None, None, 'foo'), + ExtractedText('s1', None, None, 'bar'), + ExtractedText('s2', None, None, 'batzinga') + ], ' ', None) + test2 = ExtractedText(None, [ + ExtractedText('x0', None, None, 'foo'), + ExtractedText('x1', None, None, 'bar'), + ExtractedText('x2', None, None, '.'), # extra . 
+ ExtractedText('x3', None, None, 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters + ], ' ', None) left_pos = 0; right_pos = 0; alignment = [] for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): From 5bee55c8965612c8ddf04393c206cb8d4554c843 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 25 Sep 2020 14:53:19 +0200 Subject: [PATCH 79/87] =?UTF-8?q?=F0=9F=92=A9=20dinglehopper:=20Fix=20OCR-?= =?UTF-8?q?D=20CLI=20test=20by=20working=20around=20ocrd=5Fcli=5Fwrap=5Fpr?= =?UTF-8?q?ocessor()=20check=20for=20arguments?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/tests/test_integ_ocrd_cli.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py index 75bb816..5e535b5 100644 --- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py +++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py @@ -1,6 +1,7 @@ import os import shutil import json +import sys from pathlib import Path from click.testing import CliRunner @@ -23,11 +24,13 @@ def test_ocrd_cli(tmp_path): # Run through the OCR-D interface with working_directory(str(test_workspace_dir)): runner = CliRunner() - result = runner.invoke(ocrd_dinglehopper, [ + args = [ '-m', 'mets.xml', '-I', 'OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI', '-O', 'OCR-D-OCR-CALAMARI-EVAL' - ]) + ] + sys.argv[1:] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments + result = runner.invoke(ocrd_dinglehopper, args) assert result.exit_code == 0 result_json = list((test_workspace_dir / 'OCR-D-OCR-CALAMARI-EVAL').glob('*.json')) assert json.load(open(str(result_json[0])))['cer'] < 0.03 From 7843824eafb5581a3ccdbc24284d049525fdc2f1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Oct 2020 10:47:20 +0200 Subject: [PATCH 80/87] 
=?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Support=20?= =?UTF-8?q?str=20&=20ExtractedText=20in=20CER=20and=20distance=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/character_error_rate.py | 12 +++++++----- qurator/dinglehopper/edit_distance.py | 5 +++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index e99f391..29826e3 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -6,6 +6,7 @@ from typing import Tuple from uniseg.graphemecluster import grapheme_clusters from qurator.dinglehopper.edit_distance import distance +from qurator.dinglehopper.ocr_files import ExtractedText def character_error_rate_n(reference, compared) -> Tuple[float, int]: @@ -14,12 +15,13 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]: :return: character error rate and length of the reference """ + if isinstance(reference, str): + return character_error_rate_n( + ExtractedText.from_text(reference), + compared) + d = distance(reference, compared) - # XXX - from .cli import ExtractedText - if isinstance(reference, ExtractedText): - reference = reference.text - n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) + n = len(list(grapheme_clusters(reference.text))) if d == 0: return 0, n diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 284b676..bc607a9 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -7,6 +7,7 @@ from typing import Sequence, Tuple import numpy as np from uniseg.graphemecluster import grapheme_clusters +from .ocr_files import ExtractedText def levenshtein_matrix(seq1: Sequence, seq2: Sequence): """Compute the matrix commonly computed to produce the Levenshtein distance. 
@@ -75,12 +76,12 @@ def distance(s1, s2): Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme clusters. This should be the correct way to compare two Unicode strings. """ - # XXX - from .cli import ExtractedText + if isinstance(s1, ExtractedText): s1 = s1.text if isinstance(s2, ExtractedText): s2 = s2.text + s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) return levenshtein(s1, s2) From a17ee2afeccc305f0c00cf5adad4b0ccf341b67b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Oct 2020 11:25:01 +0200 Subject: [PATCH 81/87] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Guarantee?= =?UTF-8?q?=20NFC=20+=20rename=20from=5Ftext=20=E2=86=92=20from=5Fstr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/vcs.xml | 6 ++++++ qurator/dinglehopper/character_error_rate.py | 2 +- qurator/dinglehopper/edit_distance.py | 18 ++++++++++-------- qurator/dinglehopper/ocr_files.py | 16 ++++++++++++---- .../dinglehopper/tests/extracted_text_test.py | 2 +- 5 files changed, 30 insertions(+), 14 deletions(-) create mode 100644 .idea/vcs.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 29826e3..9f5fda0 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -17,7 +17,7 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]: """ if isinstance(reference, str): return character_error_rate_n( - ExtractedText.from_text(reference), + ExtractedText.from_str(reference), compared) d = distance(reference, compared) diff --git a/qurator/dinglehopper/edit_distance.py 
b/qurator/dinglehopper/edit_distance.py index bc607a9..88d3127 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -77,14 +77,16 @@ def distance(s1, s2): clusters. This should be the correct way to compare two Unicode strings. """ - if isinstance(s1, ExtractedText): - s1 = s1.text - if isinstance(s2, ExtractedText): - s2 = s2.text - - s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) - s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) - return levenshtein(s1, s2) + # XXX Implicit normalization + if isinstance(s1, str): + s1 = ExtractedText.from_str(s1) + if isinstance(s2, str): + s2 = ExtractedText.from_str(s2) + # s1 and s2 are now guaranteed (by ExtractedText) to be in NFC + + seq1 = list(grapheme_clusters(s1.text)) + seq2 = list(grapheme_clusters(s2.text)) + return levenshtein(seq1, seq2) def seq_editops(seq1, seq2): diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 2b8b0de..5824dda 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -23,6 +23,12 @@ class Normalization(enum.Enum): @attr.s(frozen=True) class ExtractedText: + """ + Extracted text + + Objects of this class are guaranteed to be a. always in their normalization and + b. in NFC. 
+ """ segment_id = attr.ib(type=Optional[str]) @segment_id.validator @@ -48,6 +54,8 @@ class ExtractedText: @_text.validator def check(self, _, value): + if value is not None and unicodedata.normalize('NFC', value) != value: + raise ValueError('String "{}" is not in NFC.'.format(value)) if value is not None and normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) @@ -93,9 +101,9 @@ class ExtractedText: return cls(segment_id, None, None, segment_text) @classmethod - def from_text(cls, text): - return cls(None, None, None, text) - + def from_str(cls, text, normalization=Normalization.NFC_SBB): + normalized_text = normalize(text, normalization) + return cls(None, None, None, normalized_text, normalization=normalization) def normalize(text, normalization): @@ -138,7 +146,7 @@ def alto_extract(tree): return ExtractedText( None, - (ExtractedText.from_text(normalize_sbb(line_text)) for line_text in lines), + (ExtractedText.from_str(normalize_sbb(line_text)) for line_text in lines), '\n', None ) diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 8cac4c1..ef2776c 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -20,7 +20,7 @@ def test_text(): def test_normalization_check(): - with pytest.raises(ValueError, match=r'.*is not normalized.*'): + with pytest.raises(ValueError, match=r'.*is not in NFC.*'): ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ')) assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ')) From b14c35e14761f604bdaacf73181b3c4d6da03511 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Oct 2020 12:15:58 +0200 Subject: [PATCH 82/87] =?UTF-8?q?=F0=9F=8E=A8=20dinglehopper:=20Use=20mult?= =?UTF-8?q?imethod=20to=20handle=20str=20vs=20ExtractedText?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/character_error_rate.py | 16 +++--- qurator/dinglehopper/edit_distance.py | 19 +++---- qurator/dinglehopper/word_error_rate.py | 60 ++++++++++++-------- requirements.txt | 1 + 4 files changed, 54 insertions(+), 42 deletions(-) diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 9f5fda0..998a3c2 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -3,25 +3,22 @@ from __future__ import division import unicodedata from typing import Tuple +from multimethod import multimethod from uniseg.graphemecluster import grapheme_clusters from qurator.dinglehopper.edit_distance import distance from qurator.dinglehopper.ocr_files import ExtractedText - -def character_error_rate_n(reference, compared) -> Tuple[float, int]: +@multimethod +def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]: """ Compute character error rate. :return: character error rate and length of the reference """ - if isinstance(reference, str): - return character_error_rate_n( - ExtractedText.from_str(reference), - compared) d = distance(reference, compared) - n = len(list(grapheme_clusters(reference.text))) + n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) if d == 0: return 0, n @@ -32,6 +29,11 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]: # XXX Should we really count newlines here? +@multimethod +def character_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]: + return character_error_rate_n(reference.text, compared.text) + + def character_error_rate(reference, compared) -> float: """ Compute character error rate. 
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 88d3127..ed91443 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -5,6 +5,7 @@ from functools import partial, lru_cache from typing import Sequence, Tuple import numpy as np +from multimethod import multimethod from uniseg.graphemecluster import grapheme_clusters from .ocr_files import ExtractedText @@ -70,23 +71,21 @@ def levenshtein_matrix_cache_clear(): _levenshtein_matrix.cache_clear() -def distance(s1, s2): +@multimethod +def distance(s1: str, s2: str): """Compute the Levenshtein edit distance between two Unicode strings Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme clusters. This should be the correct way to compare two Unicode strings. """ + seq1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) + seq2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) + return levenshtein(seq1, seq2) - # XXX Implicit normalization - if isinstance(s1, str): - s1 = ExtractedText.from_str(s1) - if isinstance(s2, str): - s2 = ExtractedText.from_str(s2) - # s1 and s2 are now guaranteed (by ExtractedText) to be in NFC - seq1 = list(grapheme_clusters(s1.text)) - seq2 = list(grapheme_clusters(s2.text)) - return levenshtein(seq1, seq2) +@multimethod +def distance(s1: ExtractedText, s2: ExtractedText): + return distance(s1.text, s2.text) def seq_editops(seq1, seq2): diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index 64eba0a..95ea7f8 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -1,14 +1,19 @@ from __future__ import division import unicodedata -from typing import Tuple +from typing import Tuple, Iterable +from multimethod import multimethod import uniseg.wordbreak from .edit_distance import levenshtein +from .ocr_files import ExtractedText -def words(s): 
+@multimethod +def words(s: str): + """Extract words from a string""" + # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt old_word_break = uniseg.wordbreak.word_break @@ -32,11 +37,6 @@ def words(s): cat = subcat[0] return cat in unwanted_categories or subcat in unwanted_subcategories - # XXX - from .cli import ExtractedText - if isinstance(s, ExtractedText): - s = s.text - # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters." for word in uniseg.wordbreak.words(s): @@ -46,27 +46,37 @@ def words(s): yield word -def words_normalized(s): - # XXX - from .cli import ExtractedText - if isinstance(s, ExtractedText): - s = s.text +@multimethod +def words(s: ExtractedText): + return words(s.text) + + +@multimethod +def words_normalized(s: str): return words(unicodedata.normalize('NFC', s)) -def word_error_rate_n(reference, compared) -> Tuple[float, int]: - # XXX - from .cli import ExtractedText - if isinstance(reference, ExtractedText): - reference = reference.text - if isinstance(compared, ExtractedText): - compared = compared.text - if isinstance(reference, str): - reference_seq = list(words_normalized(reference)) - compared_seq = list(words_normalized(compared)) - else: - reference_seq = list(reference) - compared_seq = list(compared) +@multimethod +def words_normalized(s: ExtractedText): + return words_normalized(s.text) + + +@multimethod +def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]: + reference_seq = list(words_normalized(reference)) + compared_seq = list(words_normalized(compared)) + return word_error_rate_n(reference_seq, compared_seq) + + +@multimethod +def word_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]: + return 
word_error_rate_n(reference.text, compared.text) + + +@multimethod +def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]: + reference_seq = list(reference) + compared_seq = list(compared) d = levenshtein(reference_seq, compared_seq) n = len(reference_seq) diff --git a/requirements.txt b/requirements.txt index 846990b..287c959 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ colorama MarkupSafe ocrd >= 1.0.0b15 attrs +multimethod == 1.3 # latest version to officially support Python 3.5 \ No newline at end of file From 1f9a680fe7e47fb9a860e8e3b4d85d40ec46e38e Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Oct 2020 12:16:42 +0200 Subject: [PATCH 83/87] =?UTF-8?q?=E2=9A=99=EF=B8=8F=20dinglehopper:=20PyCh?= =?UTF-8?q?arm=20should=20use=20dinglehopper-github=20virtualenv?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/dinglehopper.iml | 3 +-- .idea/misc.xml | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.idea/dinglehopper.iml b/.idea/dinglehopper.iml index 7c9d48f..0f3d9e5 100644 --- a/.idea/dinglehopper.iml +++ b/.idea/dinglehopper.iml @@ -2,11 +2,10 @@ - + - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 2b68f30..88565d3 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file From f3aafb6fdfc86a35315e5ededdac69e01d6ac8b4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Oct 2020 12:20:27 +0200 Subject: [PATCH 84/87] =?UTF-8?q?=E2=9C=A8=20dinglehopper:=20Validate=20Ex?= =?UTF-8?q?tractedText.{segments,=5Ftext}=20in=20both=20directions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 5824dda..6cda10c 100644 --- 
a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -41,7 +41,6 @@ class ExtractedText: # An object contains either # a. _text itself # b. or segments (ExtractedText) and a joiner - # TODO validator segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list)) joiner = attr.ib(type=Optional[str]) @@ -54,6 +53,8 @@ class ExtractedText: @_text.validator def check(self, _, value): + if value is not None and self.segments is not None: + raise ValueError("Can't have both segments and text") if value is not None and unicodedata.normalize('NFC', value) != value: raise ValueError('String "{}" is not in NFC.'.format(value)) if value is not None and normalize(value, self.normalization) != value: From 9dd4ff0aae5cb86b3cdcfb9a0e0ed85b881c1301 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Oct 2020 12:54:28 +0200 Subject: [PATCH 85/87] =?UTF-8?q?=E2=9C=A8=20dinglehopper:=20Extract=20lin?= =?UTF-8?q?e=20IDs=20for=20ALTO?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 6cda10c..11a9836 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,6 +1,6 @@ from __future__ import division, print_function -from typing import Optional +from typing import Optional, Generator from warnings import warn from lxml import etree as ET @@ -123,7 +123,7 @@ def normalize_sbb(t): return normalize(t, Normalization.NFC_SBB) -def alto_namespace(tree): +def alto_namespace(tree: ET.ElementTree) -> str: """Return the ALTO namespace used in the given ElementTree. This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". 
We do not @@ -136,24 +136,18 @@ def alto_namespace(tree): raise ValueError('Not an ALTO tree') -def alto_extract(tree): - """Extract text from the given ALTO ElementTree.""" - +def alto_extract_lines(tree: ET.ElementTree) -> Generator[ExtractedText, None, None]: nsmap = {'alto': alto_namespace(tree)} + for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap): + line_id = line.attrib.get('ID') + line_text = ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) + yield ExtractedText(line_id, None, None, normalize_sbb(line_text)) + # FIXME hardcoded SBB normalization - lines = ( - ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) - for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap)) - - return ExtractedText( - None, - (ExtractedText.from_str(normalize_sbb(line_text)) for line_text in lines), - '\n', - None - ) - # FIXME hardcoded SBB normalization - # TODO This currently does not extract any segment id, because we are - # clueless about the ALTO format. 
+ +def alto_extract(tree: ET.ElementTree) -> ExtractedText: +    """Extract text from the given ALTO ElementTree.""" +    return ExtractedText(None, list(alto_extract_lines(tree)), '\n', None) def alto_text(tree): From 1077dc64ce918315ccafa49a26196296d86f32ab Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Oct 2020 13:25:20 +0200 Subject: [PATCH 86/87] =?UTF-8?q?=E2=9E=A1=EF=B8=8F=20dinglehopper:=20Move?= =?UTF-8?q?=20ExtractedText=20to=20its=20own=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/character_error_rate.py | 3 +- qurator/dinglehopper/cli.py | 1 + qurator/dinglehopper/edit_distance.py | 3 +- qurator/dinglehopper/extracted_text.py | 118 ++++++++++++++++++ qurator/dinglehopper/ocr_files.py | 121 +------------------ qurator/dinglehopper/word_error_rate.py | 2 +- 6 files changed, 128 insertions(+), 120 deletions(-) create mode 100644 qurator/dinglehopper/extracted_text.py diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 998a3c2..055d6de 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -7,7 +7,8 @@ from multimethod import multimethod from uniseg.graphemecluster import grapheme_clusters from qurator.dinglehopper.edit_distance import distance -from qurator.dinglehopper.ocr_files import ExtractedText +from qurator.dinglehopper.extracted_text import ExtractedText + @multimethod def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]: diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 9c963c1..f568399 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -6,6 +6,7 @@ from markupsafe import escape from qurator.dinglehopper import * +from qurator.dinglehopper import ExtractedText def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): diff --git 
a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index ed91443..e91d063 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -8,7 +8,8 @@ import numpy as np from multimethod import multimethod from uniseg.graphemecluster import grapheme_clusters -from .ocr_files import ExtractedText +from . import ExtractedText + def levenshtein_matrix(seq1: Sequence, seq2: Sequence): """Compute the matrix commonly computed to produce the Levenshtein distance. diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py new file mode 100644 index 0000000..6dcd921 --- /dev/null +++ b/qurator/dinglehopper/extracted_text.py @@ -0,0 +1,118 @@ +import enum +import re +import unicodedata +from contextlib import suppress +from itertools import repeat +from typing import Optional + +import attr + +from .substitute_equivalences import substitute_equivalences + + +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 # TODO + NFC_SBB = 3 + + +def normalize(text, normalization): + if normalization == Normalization.NFC: + return unicodedata.normalize('NFC', text) + if normalization == Normalization.NFC_MUFI: + raise NotImplementedError() + if normalization == Normalization.NFC_SBB: + return substitute_equivalences(text) + else: + raise ValueError() + + +# XXX hack +def normalize_sbb(t): + return normalize(t, Normalization.NFC_SBB) + + +@attr.s(frozen=True) +class ExtractedText: + """ + Extracted text + + Objects of this class are guaranteed to be a. always in their normalization and + b. in NFC. + """ + segment_id = attr.ib(type=Optional[str]) + + @segment_id.validator + def check(self, _, value): + if value is None: + return + if not re.match(r'[\w\d_-]+', value): + raise ValueError('Malformed segment id "{}"'.format(value)) + + # An object contains either + # a. _text itself + # b. 
or segments (ExtractedText) and a joiner + + segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list)) + joiner = attr.ib(type=Optional[str]) + _text = attr.ib(type=Optional[str]) + + @segments.validator + def check(self, _, value): + if value is not None and self._text is not None: + raise ValueError("Can't have both segments and text") + + @_text.validator + def check(self, _, value): + if value is not None and self.segments is not None: + raise ValueError("Can't have both segments and text") + if value is not None and unicodedata.normalize('NFC', value) != value: + raise ValueError('String "{}" is not in NFC.'.format(value)) + if value is not None and normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + + normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) + + @property + def text(self): + if self._text is not None: + if self._text == '': + return None + else: + return self._text + else: + return self.joiner.join(s.text for s in self.segments) + + _segment_id_for_pos = None + + def segment_id_for_pos(self, pos): + # Calculate segment ids once, on the first call + if not self._segment_id_for_pos: + segment_id_for_pos = [] + for s in self.segments: + segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) + segment_id_for_pos.extend(repeat(None, len(self.joiner))) + segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)] + # This is frozen, so we have to jump through the hoop: + object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) + assert self._segment_id_for_pos + + return self._segment_id_for_pos[pos] + + @classmethod + def from_text_segment(cls, text_segment, nsmap): + """Build an ExtractedText from a PAGE content text element""" + + segment_id = text_segment.attrib['id'] + segment_text = None + with suppress(AttributeError): + segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + 
segment_text = segment_text or '' + segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization + segment_text = segment_text or '' + return cls(segment_id, None, None, segment_text) + + @classmethod + def from_str(cls, text, normalization=Normalization.NFC_SBB): + normalized_text = normalize(text, normalization) + return cls(None, None, None, normalized_text, normalization=normalization) \ No newline at end of file diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 11a9836..78648eb 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,126 +1,13 @@ from __future__ import division, print_function -from typing import Optional, Generator +from typing import Generator from warnings import warn +import sys from lxml import etree as ET from lxml.etree import XMLSyntaxError -from contextlib import suppress -from itertools import repeat -from .substitute_equivalences import substitute_equivalences -import sys -import attr -import enum -import unicodedata -import re - - -class Normalization(enum.Enum): - NFC = 1 - NFC_MUFI = 2 # TODO - NFC_SBB = 3 - - -@attr.s(frozen=True) -class ExtractedText: - """ - Extracted text - - Objects of this class are guaranteed to be a. always in their normalization and - b. in NFC. - """ - segment_id = attr.ib(type=Optional[str]) - - @segment_id.validator - def check(self, _, value): - if value is None: - return - if not re.match(r'[\w\d_-]+', value): - raise ValueError('Malformed segment id "{}"'.format(value)) - - # An object contains either - # a. _text itself - # b. 
or segments (ExtractedText) and a joiner - - segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list)) - joiner = attr.ib(type=Optional[str]) - _text = attr.ib(type=Optional[str]) - - @segments.validator - def check(self, _, value): - if value is not None and self._text is not None: - raise ValueError("Can't have both segments and text") - - @_text.validator - def check(self, _, value): - if value is not None and self.segments is not None: - raise ValueError("Can't have both segments and text") - if value is not None and unicodedata.normalize('NFC', value) != value: - raise ValueError('String "{}" is not in NFC.'.format(value)) - if value is not None and normalize(value, self.normalization) != value: - raise ValueError('String "{}" is not normalized.'.format(value)) - - normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) - - @property - def text(self): - if self._text is not None: - if self._text == '': - return None - else: - return self._text - else: - return self.joiner.join(s.text for s in self.segments) - - _segment_id_for_pos = None - - def segment_id_for_pos(self, pos): - # Calculate segment ids once, on the first call - if not self._segment_id_for_pos: - segment_id_for_pos = [] - for s in self.segments: - segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) - segment_id_for_pos.extend(repeat(None, len(self.joiner))) - segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)] - # This is frozen, so we have to jump through the hoop: - object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) - assert self._segment_id_for_pos - - return self._segment_id_for_pos[pos] - - @classmethod - def from_text_segment(cls, text_segment, nsmap): - """Build an ExtractedText from a PAGE content text element""" - - segment_id = text_segment.attrib['id'] - segment_text = None - with suppress(AttributeError): - segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text - 
segment_text = segment_text or '' - segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization - segment_text = segment_text or '' - return cls(segment_id, None, None, segment_text) - - @classmethod - def from_str(cls, text, normalization=Normalization.NFC_SBB): - normalized_text = normalize(text, normalization) - return cls(None, None, None, normalized_text, normalization=normalization) - - -def normalize(text, normalization): - if normalization == Normalization.NFC: - return unicodedata.normalize('NFC', text) - if normalization == Normalization.NFC_MUFI: - raise NotImplementedError() - if normalization == Normalization.NFC_SBB: - return substitute_equivalences(text) - else: - raise ValueError() - -# XXX hack -def normalize_sbb(t): - return normalize(t, Normalization.NFC_SBB) +from .extracted_text import ExtractedText, normalize_sbb def alto_namespace(tree: ET.ElementTree) -> str: @@ -192,7 +79,7 @@ def page_extract(tree): regions.append(ExtractedText.from_text_segment(region, nsmap)) # Filter empty region texts - regions = (r for r in regions if r.text is not None) + regions = [r for r in regions if r.text is not None] return ExtractedText(None, regions, '\n', None) diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index 95ea7f8..2f5a1f6 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -7,7 +7,7 @@ from multimethod import multimethod import uniseg.wordbreak from .edit_distance import levenshtein -from .ocr_files import ExtractedText +from . 
import ExtractedText @multimethod From c514abfb9f8fbb8de96ffc789e39c3d1e85843dd Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Oct 2020 13:33:19 +0200 Subject: [PATCH 87/87] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Sanitize?= =?UTF-8?q?=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/character_error_rate.py | 4 ++-- qurator/dinglehopper/cli.py | 9 ++++++--- qurator/dinglehopper/edit_distance.py | 2 +- qurator/dinglehopper/ocrd_cli.py | 4 ++-- qurator/dinglehopper/tests/extracted_text_test.py | 4 ++-- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 055d6de..2b13f55 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -6,8 +6,8 @@ from typing import Tuple from multimethod import multimethod from uniseg.graphemecluster import grapheme_clusters -from qurator.dinglehopper.edit_distance import distance -from qurator.dinglehopper.extracted_text import ExtractedText +from .edit_distance import distance +from .extracted_text import ExtractedText @multimethod diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index f568399..87485bc 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -3,10 +3,13 @@ import os import click from jinja2 import Environment, FileSystemLoader from markupsafe import escape +from uniseg.graphemecluster import grapheme_clusters - -from qurator.dinglehopper import * -from qurator.dinglehopper import ExtractedText +from .character_error_rate import character_error_rate_n +from .word_error_rate import word_error_rate_n, words_normalized +from .align import seq_align +from .extracted_text import ExtractedText +from .ocr_files import extract def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): diff --git a/qurator/dinglehopper/edit_distance.py 
b/qurator/dinglehopper/edit_distance.py index e91d063..ec49338 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -8,7 +8,7 @@ import numpy as np from multimethod import multimethod from uniseg.graphemecluster import grapheme_clusters -from . import ExtractedText +from .extracted_text import ExtractedText def levenshtein_matrix(seq1: Sequence, seq2: Sequence): diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py index d98c21c..ed5ecd3 100644 --- a/qurator/dinglehopper/ocrd_cli.py +++ b/qurator/dinglehopper/ocrd_cli.py @@ -7,8 +7,8 @@ from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import concat_padded, getLogger from pkg_resources import resource_string -from qurator.dinglehopper.cli import process as cli_process -from qurator.dinglehopper.edit_distance import levenshtein_matrix_cache_clear +from .cli import process as cli_process +from .edit_distance import levenshtein_matrix_cache_clear log = getLogger('processor.OcrdDinglehopperEvaluate') diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index ef2776c..98788f6 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -1,10 +1,10 @@ import unicodedata import pytest -from qurator.dinglehopper import ExtractedText from uniseg.graphemecluster import grapheme_clusters -from qurator.dinglehopper import seq_align from collections import namedtuple +from .. import seq_align, ExtractedText + def test_text(): test1 = ExtractedText(None, [