@@ -1,16 +1,14 @@
import enum
import functools
import re
import unicodedata
from contextlib import suppress
from itertools import repeat
from typing import Any, Dict, List, Optional
from typing import Optional
import attr
import numpy as np
from lxml import etree as ET
from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters
class Normalization(enum.Enum):
@@ -122,7 +120,7 @@ class ExtractedText:
    segment_id = attr.ib(type=Optional[str])
    @segment_id.validator
    def is_valid_segment_id(self, _, value):
    def check(self, _, value):
        if value is None:
            return
        if not re.match(r"[\w\d_-]+", value):
@@ -132,85 +130,33 @@ class ExtractedText:
    # a. _text itself
    # b. or segments (ExtractedText) and a joiner
    segments = attr.ib(type=Optional[List["ExtractedText"]])
    segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
    joiner = attr.ib(type=Optional[str])
    _text = attr.ib(type=Optional[str])
    _grapheme_clusters = attr.ib(type=Optional[List[str]])
    @segments.validator
    def cant_set_both_segments_and_text(self, _, value):
    def check(self, _, value):
        if value is not None and self._text is not None:
            raise ValueError("Can't have both segments and text")
    @joiner.validator
    def is_valid_joiner(self, _, value):
        if self.segments is None:
            if value is not None:
                raise ValueError("Can't have joiner without segments to join")
        if self.segments is not None:
            if value not in ("", " ", "\n"):
                raise ValueError(f"Unexpected segment joiner value {repr(value)}")
    @_text.validator
    def is_valid_text(self, _, value):
        if value is None:
            return
        if self.segments is not None:
    def check(self, _, value):
        if value is not None and self.segments is not None:
            raise ValueError("Can't have both segments and text")
        if unicodedata.normalize("NFC", value) != value:
        if value is not None and unicodedata.normalize("NFC", value) != value:
            raise ValueError('String "{}" is not in NFC.'.format(value))
        if normalize(value, self.normalization) != value:
        if value is not None and normalize(value, self.normalization) != value:
            raise ValueError('String "{}" is not normalized.'.format(value))
        if self._grapheme_clusters is None:
            raise ValueError("Requires both text and grapheme clusters to be set")
    @_grapheme_clusters.validator
    def are_valid_grapheme_clusters(self, _, value):
        if value is not None and self._text is None:
            raise ValueError("Requires both text and grapheme clusters to be set")
    normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
    @property
    def text(self) -> str:
    def text(self):
        if self._text is not None:
            return self._text
        else:
            assert self.joiner is not None and self.segments is not None
            return self.joiner.join(s.text for s in self.segments)
    @functools.cached_property
    def _joiner_grapheme_cluster(self):
        """We need the joiner as a list of 0 or 1 grapheme clusters.
        This property is cached.
        """
        assert self.joiner is not None
        if len(self.joiner) > 0:
            joiner_grapheme_cluster = list(grapheme_clusters(self.joiner))
            assert len(joiner_grapheme_cluster) == 1  # see joiner's check above
        elif len(self.joiner) == 0:
            joiner_grapheme_cluster = []
        else:
            joiner_grapheme_cluster = None
        return joiner_grapheme_cluster
    @property
    def grapheme_clusters(self):
        if self._text is not None:
            return self._grapheme_clusters
        else:
            # TODO Test with text extracted at glyph level (joiner == "")
            clusters = []
            assert self.segments is not None
            for seg in self.segments:
                clusters += seg.grapheme_clusters + self._joiner_grapheme_cluster
            clusters = clusters[:-1]
            return clusters
    _segment_id_for_pos = None
    def segment_id_for_pos(self, pos):
@@ -221,7 +167,6 @@ class ExtractedText:
            else:
                # Recurse
                segment_id_for_pos = []
                assert self.joiner is not None and self.segments is not None
                for s in self.segments:
                    seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
                    segment_id_for_pos.extend(seg_ids)
@@ -235,7 +180,7 @@ class ExtractedText:
        return self._segment_id_for_pos[pos]
    @classmethod
    def from_text_segment(cls, text_segment, nsmap, *, textequiv_level="region"):
    def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"):
        """Build an ExtractedText from a PAGE content text element"""
        localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}
@@ -252,8 +197,7 @@ class ExtractedText:
            # FIXME hardcoded SBB normalization
            segment_text = normalize_sbb(segment_text)
            segment_text = segment_text or ""
            clusters = list(grapheme_clusters(segment_text))
            return cls(segment_id, None, None, segment_text, clusters)
            return cls(segment_id, None, None, segment_text)
        else:
            # Recurse
            sub_localname = children_for_localname[localname]
@@ -268,15 +212,12 @@ class ExtractedText:
                    )
                )
            joiner = joiner_for_textequiv_level[sub_textequiv_level]
            return cls(segment_id, segments, joiner, None, None)
            return cls(segment_id, segments, joiner, None)
    @classmethod
    def from_str(cls, text, normalization=Normalization.NFC_SBB):
        normalized_text = normalize(text, normalization)
        clusters = list(grapheme_clusters(normalized_text))
        return cls(
            None, None, None, normalized_text, clusters, normalization=normalization
        )
        return cls(None, None, None, normalized_text, normalization=normalization)
def invert_dict(d):
@@ -284,7 +225,7 @@ def invert_dict(d):
    return {v: k for k, v in d.items()}
def get_textequiv_unicode(text_segment: Any, nsmap: Dict[str, str]) -> str:
def get_textequiv_unicode(text_segment, nsmap) -> str:
    """Get the TextEquiv/Unicode text of the given PAGE text element."""
    segment_id = text_segment.attrib["id"]
    textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)
@@ -308,7 +249,7 @@ def get_first_textequiv(textequivs, segment_id):
    if np.any(~nan_mask):
        if np.any(nan_mask):
            log.warning("TextEquiv without index in %s.", segment_id)
        index = int(np.nanargmin(indices))
        index = np.nanargmin(indices)
    else:
        # try ordering by conf
        confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
@@ -317,7 +258,7 @@ def get_first_textequiv(textequivs, segment_id):
                "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
                segment_id,
            )
            index = int(np.nanargmax(confidences))
            index = np.nanargmax(confidences)
        else:
            # fallback to first entry in case of neither index or conf present
            log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
@@ -325,11 +266,11 @@ def get_first_textequiv(textequivs, segment_id):
    return textequivs[index]
def get_attr(te: Any, attr_name: str) -> float:
def get_attr(te, attr_name) -> float:
    """Extract the attribute for the given name.
    Note: currently only handles numeric values!
    Other or non existent values are encoded as np.nan.
    Other or non existend values are encoded as np.nan.
    """
    attr_value = te.attrib.get(attr_name)
    try: