🚧 dinglehopper: Extract text while retaining segment id info

pull/38/head
Gerber, Mike 5 years ago
parent 5b353a2232
commit 5dbf563d6a

@ -1,50 +0,0 @@
import attr
import unicodedata
import enum
# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped
# TODO types are not validated (attr does not do this yet)
@attr.s(frozen=True)
class ExtractedText:
    """Text assembled from segments, keeping track of each segment's id.

    Positional attributes: ``segments``, then ``joiner``.
    """

    # Iterable of segment objects, each providing ``.text`` and ``.id``.
    # It is materialized into a list on construction: a generator passed in
    # by the caller would otherwise be exhausted by the first access to
    # ``text`` and leave every later call with nothing to iterate over.
    segments = attr.ib(converter=list)
    # String placed between the texts of adjacent segments.
    joiner = attr.ib(type=str)

    @property
    def text(self):
        """Return the texts of all segments, joined by ``joiner``."""
        return self.joiner.join(s.text for s in self.segments)

    def segment_id_for_pos(self, pos):
        """Return the id of the segment covering position ``pos`` of ``text``.

        Returns None when ``pos`` falls on a joiner or is not covered by any
        segment (negative, or past the end of the text).
        """
        i = 0
        for s in self.segments:
            if i <= pos < i + len(s.text):
                return s.id
            i += len(s.text)
            if i <= pos < i + len(self.joiner):
                return None
            i += len(self.joiner)
        # No segment or joiner covers ``pos``.
        return None
class Normalization(enum.Enum):
    """Text normalization schemes that normalize() is asked to apply."""

    # Unicode NFC; the only member normalize() currently handles.
    NFC = 1
    # Declared, but normalize() raises ValueError for it.
    NFC_MUFI = 2
def normalize(text, normalization):
    """Return ``text`` normalized according to ``normalization``.

    Only ``Normalization.NFC`` is implemented; it applies Unicode NFC
    normalization through ``unicodedata.normalize``.

    Raises:
        ValueError: for every other value, including
            ``Normalization.NFC_MUFI``.
    """
    if normalization == Normalization.NFC:
        return unicodedata.normalize('NFC', text)
    # Name the offending value so that the failure can be diagnosed.
    raise ValueError('Unsupported normalization: {!r}'.format(normalization))
@attr.s(frozen=True)
class ExtractedTextSegment:
    """A piece of extracted text together with the id of its segment.

    Positional attributes: ``id``, ``text``, ``normalization``.
    """

    # Segment identifier, as handed out by ExtractedText.segment_id_for_pos().
    id = attr.ib(type=str)
    # Segment text; it must already be in normalized form (see ``check``).
    text = attr.ib(type=str)
    # Coerced through Normalization(...); NFC unless stated otherwise.
    normalization = attr.ib(converter=Normalization, default=Normalization.NFC)

    @text.validator
    def check(self, attribute, value):
        """Reject text that differs from its own normalized form."""
        already_normalized = normalize(value, self.normalization) == value
        if not already_normalized:
            raise ValueError('String "{}" is not normalized.'.format(value))

@ -1,6 +1,6 @@
import unicodedata import unicodedata
import pytest import pytest
from extracted_text import ExtractedText, ExtractedTextSegment from qurator.dinglehopper import ExtractedText, ExtractedTextSegment
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from qurator.dinglehopper import seq_align from qurator.dinglehopper import seq_align
from collections import namedtuple from collections import namedtuple

@ -3,9 +3,57 @@ from __future__ import division, print_function
from warnings import warn from warnings import warn
from lxml import etree as ET from lxml import etree as ET
from lxml.etree import XMLSyntaxError
import sys import sys
import attr
import enum
import unicodedata
@attr.s(frozen=True)
class ExtractedText:
    """Text assembled from segments, keeping track of each segment's id.

    Positional attributes: ``segments``, then ``joiner``.
    """

    # Iterable of segment objects, each providing ``.text`` and ``.id``.
    # It is materialized into a list on construction: a generator passed in
    # by the caller would otherwise be exhausted by the first access to
    # ``text`` and leave every later call with nothing to iterate over.
    segments = attr.ib(converter=list)
    # String placed between the texts of adjacent segments.
    joiner = attr.ib(type=str)
    # TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped
    # TODO Types are not validated (attr does not do this yet)

    @property
    def text(self):
        """Return the texts of all segments, joined by ``joiner``."""
        return self.joiner.join(s.text for s in self.segments)

    def segment_id_for_pos(self, pos):
        """Return the id of the segment covering position ``pos`` of ``text``.

        Returns None when ``pos`` falls on a joiner or is not covered by any
        segment (negative, or past the end of the text).
        """
        i = 0
        for s in self.segments:
            if i <= pos < i + len(s.text):
                return s.id
            i += len(s.text)
            if i <= pos < i + len(self.joiner):
                return None
            i += len(self.joiner)
        # No segment or joiner covers ``pos``.
        return None
        # XXX Cache results
class Normalization(enum.Enum):
    """Text normalization schemes that normalize() is asked to apply."""

    # Unicode NFC; the only member normalize() currently handles.
    NFC = 1
    # Declared, but normalize() raises ValueError for it.
    NFC_MUFI = 2
def normalize(text, normalization):
    """Return ``text`` normalized according to ``normalization``.

    Only ``Normalization.NFC`` is implemented; it applies Unicode NFC
    normalization through ``unicodedata.normalize``.

    Raises:
        ValueError: for every other value, including
            ``Normalization.NFC_MUFI``.
    """
    if normalization == Normalization.NFC:
        return unicodedata.normalize('NFC', text)
    # Name the offending value so that the failure can be diagnosed.
    raise ValueError('Unsupported normalization: {!r}'.format(normalization))
from lxml.etree import XMLSyntaxError
@attr.s(frozen=True)
class ExtractedTextSegment:
    """A piece of extracted text together with the id of its segment.

    Positional attributes: ``id``, ``text``, ``normalization``.
    """

    # Segment identifier, as handed out by ExtractedText.segment_id_for_pos().
    id = attr.ib(type=str)
    # Segment text; it must already be in normalized form (see ``check``).
    text = attr.ib(type=str)
    # Coerced through Normalization(...); NFC unless stated otherwise.
    normalization = attr.ib(converter=Normalization, default=Normalization.NFC)

    @text.validator
    def check(self, attribute, value):
        """Reject text that differs from its own normalized form."""
        already_normalized = normalize(value, self.normalization) == value
        if not already_normalized:
            raise ValueError('String "{}" is not normalized.'.format(value))
def alto_namespace(tree): def alto_namespace(tree):
@ -21,7 +69,7 @@ def alto_namespace(tree):
raise ValueError('Not an ALTO tree') raise ValueError('Not an ALTO tree')
def alto_text(tree): def alto_extract(tree):
"""Extract text from the given ALTO ElementTree.""" """Extract text from the given ALTO ElementTree."""
nsmap = {'alto': alto_namespace(tree)} nsmap = {'alto': alto_namespace(tree)}
@ -29,9 +77,15 @@ def alto_text(tree):
lines = ( lines = (
' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap)) for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
text_ = '\n'.join(lines)
return text_ return ExtractedText((ExtractedTextSegment(None, line_text) for line_text in lines), '\n')
# TODO This currently does not extract any segment id, because we are
# clueless about the ALTO format.
# FIXME needs to handle normalization
def alto_text(tree):
    """Return the ``text`` of what alto_extract() produces for ``tree``."""
    extracted = alto_extract(tree)
    return extracted.text
def page_namespace(tree): def page_namespace(tree):
@ -47,7 +101,7 @@ def page_namespace(tree):
raise ValueError('Not a PAGE tree') raise ValueError('Not a PAGE tree')
def page_text(tree): def page_extract(tree):
"""Extract text from the given PAGE content ElementTree.""" """Extract text from the given PAGE content ElementTree."""
nsmap = {'page': page_namespace(tree)} nsmap = {'page': page_namespace(tree)}
@ -80,10 +134,13 @@ def page_text(tree):
# XXX Does a file have to have regions etc.? region vs lines etc. # XXX Does a file have to have regions etc.? region vs lines etc.
# Filter empty region texts # Filter empty region texts
region_texts = (t for t in region_texts if t) region_texts = (t for t in region_texts if t)
return ExtractedText((ExtractedTextSegment(None, region_text) for region_text in region_texts), '\n')
# TODO This currently does not extract any segment id
# FIXME needs to handle normalization
text_ = '\n'.join(region_texts)
return text_ def page_text(tree):
return page_extract(tree).text
def text(filename): def text(filename):

Loading…
Cancel
Save