From 5dbf563d6a510398ca370f809495915da274195e Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 11 Jun 2020 15:35:52 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Extract=20text?= =?UTF-8?q?=20while=20retaining=20segment=20id=20info?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 50 ---------------------- extracted_text_test.py | 2 +- qurator/dinglehopper/ocr_files.py | 71 ++++++++++++++++++++++++++++--- 3 files changed, 65 insertions(+), 58 deletions(-) delete mode 100644 extracted_text.py diff --git a/extracted_text.py b/extracted_text.py deleted file mode 100644 index b37f341..0000000 --- a/extracted_text.py +++ /dev/null @@ -1,50 +0,0 @@ -import attr -import unicodedata -import enum - - -# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped -# TODO types are not validated (attr does not do this yet) - - -@attr.s(frozen=True) -class ExtractedText: - segments = attr.ib() - joiner = attr.ib(type=str) - - @property - def text(self): - return self.joiner.join(s.text for s in self.segments) - - def segment_id_for_pos(self, pos): - i = 0 - for s in self.segments: - if i <= pos < i + len(s.text): - return s.id - i += len(s.text) - if i <= pos < i + len(self.joiner): - return None - i += len(self.joiner) - - -class Normalization(enum.Enum): - NFC = 1 - NFC_MUFI = 2 - - -def normalize(text, normalization): - if normalization == Normalization.NFC: - return unicodedata.normalize('NFC', text) - else: - raise ValueError() - - -@attr.s(frozen=True) -class ExtractedTextSegment: - id = attr.ib(type=str) - text = attr.ib(type=str) - @text.validator - def check(self, attribute, value): - if normalize(value, self.normalization) != value: - raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(converter=Normalization, default=Normalization.NFC) diff --git a/extracted_text_test.py b/extracted_text_test.py index b84df87..82c3a53 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -1,6 +1,6 @@ import unicodedata import pytest -from extracted_text import ExtractedText, ExtractedTextSegment +from qurator.dinglehopper import ExtractedText, ExtractedTextSegment from uniseg.graphemecluster import grapheme_clusters from qurator.dinglehopper import seq_align from collections import namedtuple diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index b57a047..7d06dbe 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -3,9 +3,57 @@ from __future__ import division, print_function from warnings import warn from lxml import etree as ET +from lxml.etree import XMLSyntaxError import sys +import attr +import enum +import unicodedata + + +@attr.s(frozen=True) +class ExtractedText: + segments = attr.ib() + joiner = attr.ib(type=str) + # TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped + # TODO Types are not validated (attr does not do this yet) + + @property + def text(self): + return self.joiner.join(s.text for s in self.segments) + + def segment_id_for_pos(self, pos): + i = 0 + for s in self.segments: + if i <= pos < i + len(s.text): + return s.id + i += len(s.text) + if i <= pos < i + len(self.joiner): + return None + i += len(self.joiner) + # XXX Cache results + + +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 + + +def normalize(text, normalization): + if normalization == Normalization.NFC: + return unicodedata.normalize('NFC', text) + else: + raise ValueError() -from lxml.etree import XMLSyntaxError + +@attr.s(frozen=True) +class ExtractedTextSegment: + id = attr.ib(type=str) + text = attr.ib(type=str) + @text.validator + def check(self, attribute, value): + if normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC) def alto_namespace(tree): @@ -21,7 +69,7 @@ def alto_namespace(tree): raise ValueError('Not an ALTO tree') -def alto_text(tree): +def alto_extract(tree): """Extract text from the given ALTO ElementTree.""" nsmap = {'alto': alto_namespace(tree)} @@ -29,9 +77,15 @@ def alto_text(tree): lines = ( ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap)) - text_ = '\n'.join(lines) - return text_ + return ExtractedText((ExtractedTextSegment(None, line_text) for line_text in lines), '\n') + # TODO This currently does not extract any segment id, because we are + # clueless about the ALTO format. + # FIXME needs to handle normalization + + +def alto_text(tree): + return alto_extract(tree).text def page_namespace(tree): @@ -47,7 +101,7 @@ def page_namespace(tree): raise ValueError('Not a PAGE tree') -def page_text(tree): +def page_extract(tree): """Extract text from the given PAGE content ElementTree.""" nsmap = {'page': page_namespace(tree)} @@ -80,10 +134,13 @@ def page_text(tree): # XXX Does a file have to have regions etc.? region vs lines etc. # Filter empty region texts region_texts = (t for t in region_texts if t) + return ExtractedText((ExtractedTextSegment(None, region_text) for region_text in region_texts), '\n') + # TODO This currently does not extract any segment id + # FIXME needs to handle normalization - text_ = '\n'.join(region_texts) - return text_ +def page_text(tree): + return page_extract(tree).text def text(filename):