mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 20:00:01 +02:00
🚧 dinglehopper: Extract text while retaining segment id info
This commit is contained in:
parent
1083dcc5b9
commit
9f8bb1d8ea
3 changed files with 66 additions and 59 deletions
|
@ -1,50 +0,0 @@
|
||||||
import attr
|
|
||||||
import unicodedata
|
|
||||||
import enum
|
|
||||||
|
|
||||||
|
|
||||||
# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped
|
|
||||||
# TODO types are not validated (attr does not do this yet)
|
|
||||||
|
|
||||||
|
|
||||||
@attr.s(frozen=True)
class ExtractedText:
    """Text extracted from a document, kept as id-bearing segments.

    ``segments`` is an iterable of ExtractedTextSegment objects;
    ``joiner`` is the string inserted between consecutive segment
    texts (e.g. a newline).
    """
    # TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped
    # TODO types are not validated (attr does not do this yet)
    segments = attr.ib()
    joiner = attr.ib(type=str)

    @property
    def text(self):
        """The complete text: every segment's text joined by ``joiner``."""
        return self.joiner.join(segment.text for segment in self.segments)

    def segment_id_for_pos(self, pos):
        """Return the id of the segment containing character position ``pos``.

        A position that falls inside a joiner (between two segments)
        returns ``None``; a position past the end of the text also
        yields ``None`` (implicitly, by falling off the loop).
        """
        offset = 0
        for segment in self.segments:
            # Position lies within this segment's text?
            if offset <= pos < offset + len(segment.text):
                return segment.id
            offset += len(segment.text)
            # Position lies within the joiner that follows this segment?
            if offset <= pos < offset + len(self.joiner):
                return None
            offset += len(self.joiner)
|
|
||||||
|
|
||||||
|
|
||||||
class Normalization(enum.Enum):
    """Unicode normalization schemes for extracted text."""

    NFC = 1
    # NOTE(review): NFC_MUFI is declared but normalize() in this file only
    # handles NFC — anything else raises ValueError; confirm intent.
    NFC_MUFI = 2
|
|
||||||
|
|
||||||
|
|
||||||
def normalize(text, normalization):
    """Return ``text`` normalized according to ``normalization``.

    Only ``Normalization.NFC`` is currently supported; any other scheme
    raises ``ValueError``.
    """
    if normalization == Normalization.NFC:
        return unicodedata.normalize('NFC', text)
    # Fix: the original raised a bare ValueError() with no message,
    # which made the failure hard to diagnose. Name the offending scheme.
    raise ValueError('Unsupported normalization: {}'.format(normalization))
|
|
||||||
|
|
||||||
|
|
||||||
@attr.s(frozen=True)
class ExtractedTextSegment:
    """One piece of extracted text plus the id of its source segment.

    ``text`` must already be normalized according to ``normalization``;
    the validator enforces this at construction time.
    """
    # NOTE: 'id' shadows the builtin name but is kept for API compatibility.
    id = attr.ib(type=str)
    text = attr.ib(type=str)

    @text.validator
    def check(self, attribute, value):
        # NOTE(review): this validator reads self.normalization, an attribute
        # declared *below* — confirm attrs has assigned it before validating.
        if normalize(value, self.normalization) != value:
            raise ValueError('String "{}" is not normalized.'.format(value))

    normalization = attr.ib(converter=Normalization, default=Normalization.NFC)
|
|
|
@ -1,6 +1,6 @@
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import pytest
|
import pytest
|
||||||
from extracted_text import ExtractedText, ExtractedTextSegment
|
from qurator.dinglehopper import ExtractedText, ExtractedTextSegment
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
from qurator.dinglehopper import seq_align
|
from qurator.dinglehopper import seq_align
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
|
|
|
@ -3,9 +3,57 @@ from __future__ import division, print_function
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
|
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
import sys
|
|
||||||
|
|
||||||
from lxml.etree import XMLSyntaxError
|
from lxml.etree import XMLSyntaxError
|
||||||
|
import sys
|
||||||
|
import attr
|
||||||
|
import enum
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
|
||||||
|
@attr.s(frozen=True)
class ExtractedText:
    """Document text represented as segments that retain their segment ids.

    ``segments`` is an iterable of ExtractedTextSegment; ``joiner`` is
    inserted between the texts of consecutive segments.
    """
    segments = attr.ib()
    joiner = attr.ib(type=str)
    # TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped
    # TODO Types are not validated (attr does not do this yet)

    @property
    def text(self):
        """Full text, i.e. all segment texts glued together with ``joiner``."""
        return self.joiner.join(segment.text for segment in self.segments)

    def segment_id_for_pos(self, pos):
        """Map character position ``pos`` back to the id of its segment.

        Returns ``None`` when ``pos`` falls inside a joiner, and also
        (implicitly) when ``pos`` is beyond the end of the text.
        """
        cursor = 0
        for segment in self.segments:
            end_of_segment = cursor + len(segment.text)
            if cursor <= pos < end_of_segment:
                return segment.id
            cursor = end_of_segment
            end_of_joiner = cursor + len(self.joiner)
            if cursor <= pos < end_of_joiner:
                return None
            cursor = end_of_joiner
        # XXX Cache results
|
||||||
|
|
||||||
|
|
||||||
|
class Normalization(enum.Enum):
    """Supported Unicode normalization schemes."""

    NFC = 1
    # NOTE(review): only NFC is handled by normalize() below; NFC_MUFI
    # currently raises ValueError there — confirm it is a planned scheme.
    NFC_MUFI = 2
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(text, normalization):
    """Return ``text`` normalized according to ``normalization``.

    Only ``Normalization.NFC`` is currently supported; any other scheme
    raises ``ValueError``.
    """
    if normalization == Normalization.NFC:
        return unicodedata.normalize('NFC', text)
    # Fix: the original raised a bare ValueError() with no message,
    # which made the failure hard to diagnose. Name the offending scheme.
    raise ValueError('Unsupported normalization: {}'.format(normalization))
|
||||||
|
|
||||||
|
|
||||||
|
@attr.s(frozen=True)
class ExtractedTextSegment:
    """A normalized chunk of text together with the id of its segment.

    The ``text`` validator rejects any value that is not already in the
    form required by ``normalization``.
    """
    # NOTE: 'id' shadows the builtin name but is kept for API compatibility.
    id = attr.ib(type=str)
    text = attr.ib(type=str)

    @text.validator
    def check(self, attribute, value):
        # NOTE(review): reads self.normalization, declared below — confirm
        # attrs assigns it before this validator runs.
        if normalize(value, self.normalization) != value:
            raise ValueError('String "{}" is not normalized.'.format(value))

    normalization = attr.ib(converter=Normalization, default=Normalization.NFC)
|
||||||
|
|
||||||
|
|
||||||
def alto_namespace(tree):
|
def alto_namespace(tree):
|
||||||
|
@ -21,7 +69,7 @@ def alto_namespace(tree):
|
||||||
raise ValueError('Not an ALTO tree')
|
raise ValueError('Not an ALTO tree')
|
||||||
|
|
||||||
|
|
||||||
def alto_text(tree):
|
def alto_extract(tree):
|
||||||
"""Extract text from the given ALTO ElementTree."""
|
"""Extract text from the given ALTO ElementTree."""
|
||||||
|
|
||||||
nsmap = {'alto': alto_namespace(tree)}
|
nsmap = {'alto': alto_namespace(tree)}
|
||||||
|
@ -29,9 +77,15 @@ def alto_text(tree):
|
||||||
lines = (
|
lines = (
|
||||||
' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
|
' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
|
||||||
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
|
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
|
||||||
text_ = '\n'.join(lines)
|
|
||||||
|
|
||||||
return text_
|
return ExtractedText((ExtractedTextSegment(None, line_text) for line_text in lines), '\n')
|
||||||
|
# TODO This currently does not extract any segment id, because we are
|
||||||
|
# clueless about the ALTO format.
|
||||||
|
# FIXME needs to handle normalization
|
||||||
|
|
||||||
|
|
||||||
|
def alto_text(tree):
    """Extract the plain text of the given ALTO ElementTree.

    Convenience wrapper around alto_extract() that drops segment ids.
    """
    extracted = alto_extract(tree)
    return extracted.text
|
||||||
|
|
||||||
|
|
||||||
def page_namespace(tree):
|
def page_namespace(tree):
|
||||||
|
@ -47,7 +101,7 @@ def page_namespace(tree):
|
||||||
raise ValueError('Not a PAGE tree')
|
raise ValueError('Not a PAGE tree')
|
||||||
|
|
||||||
|
|
||||||
def page_text(tree):
|
def page_extract(tree):
|
||||||
"""Extract text from the given PAGE content ElementTree."""
|
"""Extract text from the given PAGE content ElementTree."""
|
||||||
|
|
||||||
nsmap = {'page': page_namespace(tree)}
|
nsmap = {'page': page_namespace(tree)}
|
||||||
|
@ -80,10 +134,13 @@ def page_text(tree):
|
||||||
# XXX Does a file have to have regions etc.? region vs lines etc.
|
# XXX Does a file have to have regions etc.? region vs lines etc.
|
||||||
# Filter empty region texts
|
# Filter empty region texts
|
||||||
region_texts = (t for t in region_texts if t)
|
region_texts = (t for t in region_texts if t)
|
||||||
|
return ExtractedText((ExtractedTextSegment(None, region_text) for region_text in region_texts), '\n')
|
||||||
|
# TODO This currently does not extract any segment id
|
||||||
|
# FIXME needs to handle normalization
|
||||||
|
|
||||||
text_ = '\n'.join(region_texts)
|
|
||||||
|
|
||||||
return text_
|
def page_text(tree):
    """Extract the plain text of the given PAGE content ElementTree.

    Convenience wrapper around page_extract() that drops segment ids.
    """
    extracted = page_extract(tree)
    return extracted.text
|
||||||
|
|
||||||
|
|
||||||
def text(filename):
|
def text(filename):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue