@ -1,16 +1,14 @@
import enum
import enum
import functools
import re
import re
import unicodedata
import unicodedata
from contextlib import suppress
from contextlib import suppress
from itertools import repeat
from itertools import repeat
from typing import Any, Dict , List , Optional
from typing import Optional
import attr
import attr
import numpy as np
import numpy as np
from lxml import etree as ET
from lxml import etree as ET
from ocrd_utils import getLogger
from ocrd_utils import getLogger
from uniseg . graphemecluster import grapheme_clusters
class Normalization ( enum . Enum ) :
class Normalization ( enum . Enum ) :
@ -122,7 +120,7 @@ class ExtractedText:
segment_id = attr . ib ( type = Optional [ str ] )
segment_id = attr . ib ( type = Optional [ str ] )
@segment_id.validator
@segment_id.validator
def is_valid_segment_id ( self , _ , value ) :
def check ( self , _ , value ) :
if value is None :
if value is None :
return
return
if not re . match ( r " [ \ w \ d_-]+ " , value ) :
if not re . match ( r " [ \ w \ d_-]+ " , value ) :
@ -132,85 +130,33 @@ class ExtractedText:
# a. _text itself
# a. _text itself
# b. or segments (ExtractedText) and a joiner
# b. or segments (ExtractedText) and a joiner
segments = attr . ib ( type = Optional [ List [ " ExtractedText " ] ] )
segments = attr . ib ( type = Optional [ list ] , converter = attr . converters . optional ( list ) )
joiner = attr . ib ( type = Optional [ str ] )
joiner = attr . ib ( type = Optional [ str ] )
_text = attr . ib ( type = Optional [ str ] )
_text = attr . ib ( type = Optional [ str ] )
_grapheme_clusters = attr . ib ( type = Optional [ List [ str ] ] )
@segments.validator
@segments.validator
def c ant_set_both_segments_and_text ( self , _ , value ) :
def c heck ( self , _ , value ) :
if value is not None and self . _text is not None :
if value is not None and self . _text is not None :
raise ValueError ( " Can ' t have both segments and text " )
raise ValueError ( " Can ' t have both segments and text " )
@joiner.validator
def is_valid_joiner ( self , _ , value ) :
if self . segments is None :
if value is not None :
raise ValueError ( " Can ' t have joiner without segments to join " )
if self . segments is not None :
if value not in ( " " , " " , " \n " ) :
raise ValueError ( f " Unexpected segment joiner value { repr ( value ) } " )
@_text.validator
@_text.validator
def is_valid_text ( self , _ , value ) :
def check ( self , _ , value ) :
if value is None :
if value is not None and self . segments is not None :
return
if self . segments is not None :
raise ValueError ( " Can ' t have both segments and text " )
raise ValueError ( " Can ' t have both segments and text " )
if unicodedata. normalize ( " NFC " , value ) != value :
if value is not None and unicodedata . normalize ( " NFC " , value ) != value :
raise ValueError ( ' String " {} " is not in NFC. ' . format ( value ) )
raise ValueError ( ' String " {} " is not in NFC. ' . format ( value ) )
if normalize( value , self . normalization ) != value :
if value is not None and normalize ( value , self . normalization ) != value :
raise ValueError ( ' String " {} " is not normalized. ' . format ( value ) )
raise ValueError ( ' String " {} " is not normalized. ' . format ( value ) )
if self . _grapheme_clusters is None :
raise ValueError ( " Requires both text and grapheme clusters to be set " )
@_grapheme_clusters.validator
def are_valid_grapheme_clusters ( self , _ , value ) :
if value is not None and self . _text is None :
raise ValueError ( " Requires both text and grapheme clusters to be set " )
normalization = attr . ib ( converter = Normalization , default = Normalization . NFC_SBB )
normalization = attr . ib ( converter = Normalization , default = Normalization . NFC_SBB )
@property
@property
def text ( self ) - > str :
def text ( self ) :
if self . _text is not None :
if self . _text is not None :
return self . _text
return self . _text
else :
else :
assert self . joiner is not None and self . segments is not None
return self . joiner . join ( s . text for s in self . segments )
return self . joiner . join ( s . text for s in self . segments )
@functools.cached_property
def _joiner_grapheme_cluster ( self ) :
""" We need the joiner as a list of 0 or 1 grapheme clusters.
This property is cached .
"""
assert self . joiner is not None
if len ( self . joiner ) > 0 :
joiner_grapheme_cluster = list ( grapheme_clusters ( self . joiner ) )
assert len ( joiner_grapheme_cluster ) == 1 # see joiner's check above
elif len ( self . joiner ) == 0 :
joiner_grapheme_cluster = [ ]
else :
joiner_grapheme_cluster = None
return joiner_grapheme_cluster
@property
def grapheme_clusters ( self ) :
if self . _text is not None :
return self . _grapheme_clusters
else :
# TODO Test with text extracted at glyph level (joiner == "")
clusters = [ ]
assert self . segments is not None
for seg in self . segments :
clusters + = seg . grapheme_clusters + self . _joiner_grapheme_cluster
clusters = clusters [ : - 1 ]
return clusters
_segment_id_for_pos = None
_segment_id_for_pos = None
def segment_id_for_pos ( self , pos ) :
def segment_id_for_pos ( self , pos ) :
@ -221,7 +167,6 @@ class ExtractedText:
else :
else :
# Recurse
# Recurse
segment_id_for_pos = [ ]
segment_id_for_pos = [ ]
assert self . joiner is not None and self . segments is not None
for s in self . segments :
for s in self . segments :
seg_ids = [ s . segment_id_for_pos ( i ) for i in range ( len ( s . text ) ) ]
seg_ids = [ s . segment_id_for_pos ( i ) for i in range ( len ( s . text ) ) ]
segment_id_for_pos . extend ( seg_ids )
segment_id_for_pos . extend ( seg_ids )
@ -235,7 +180,7 @@ class ExtractedText:
return self . _segment_id_for_pos [ pos ]
return self . _segment_id_for_pos [ pos ]
@classmethod
@classmethod
def from_text_segment ( cls , text_segment , nsmap , * , textequiv_level = " region " ) :
def from_text_segment ( cls , text_segment , nsmap , textequiv_level = " region " ) :
""" Build an ExtractedText from a PAGE content text element """
""" Build an ExtractedText from a PAGE content text element """
localname_for_textequiv_level = { " region " : " TextRegion " , " line " : " TextLine " }
localname_for_textequiv_level = { " region " : " TextRegion " , " line " : " TextLine " }
@ -252,8 +197,7 @@ class ExtractedText:
# FIXME hardcoded SBB normalization
# FIXME hardcoded SBB normalization
segment_text = normalize_sbb ( segment_text )
segment_text = normalize_sbb ( segment_text )
segment_text = segment_text or " "
segment_text = segment_text or " "
clusters = list ( grapheme_clusters ( segment_text ) )
return cls ( segment_id , None , None , segment_text )
return cls ( segment_id , None , None , segment_text , clusters )
else :
else :
# Recurse
# Recurse
sub_localname = children_for_localname [ localname ]
sub_localname = children_for_localname [ localname ]
@ -268,15 +212,12 @@ class ExtractedText:
)
)
)
)
joiner = joiner_for_textequiv_level [ sub_textequiv_level ]
joiner = joiner_for_textequiv_level [ sub_textequiv_level ]
return cls ( segment_id , segments , joiner , None , None )
return cls ( segment_id , segments , joiner , None )
@classmethod
@classmethod
def from_str ( cls , text , normalization = Normalization . NFC_SBB ) :
def from_str ( cls , text , normalization = Normalization . NFC_SBB ) :
normalized_text = normalize ( text , normalization )
normalized_text = normalize ( text , normalization )
clusters = list ( grapheme_clusters ( normalized_text ) )
return cls ( None , None , None , normalized_text , normalization = normalization )
return cls (
None , None , None , normalized_text , clusters , normalization = normalization
)
def invert_dict ( d ) :
def invert_dict ( d ) :
@ -284,7 +225,7 @@ def invert_dict(d):
return { v : k for k , v in d . items ( ) }
return { v : k for k , v in d . items ( ) }
def get_textequiv_unicode ( text_segment : Any , nsmap : Dict [ str , str ] ) - > str :
def get_textequiv_unicode ( text_segment , nsmap ) - > str :
""" Get the TextEquiv/Unicode text of the given PAGE text element. """
""" Get the TextEquiv/Unicode text of the given PAGE text element. """
segment_id = text_segment . attrib [ " id " ]
segment_id = text_segment . attrib [ " id " ]
textequivs = text_segment . findall ( " ./page:TextEquiv " , namespaces = nsmap )
textequivs = text_segment . findall ( " ./page:TextEquiv " , namespaces = nsmap )
@ -308,7 +249,7 @@ def get_first_textequiv(textequivs, segment_id):
if np . any ( ~ nan_mask ) :
if np . any ( ~ nan_mask ) :
if np . any ( nan_mask ) :
if np . any ( nan_mask ) :
log . warning ( " TextEquiv without index in %s . " , segment_id )
log . warning ( " TextEquiv without index in %s . " , segment_id )
index = int ( np . nanargmin ( indices ) )
index = np . nanargmin ( indices )
else :
else :
# try ordering by conf
# try ordering by conf
confidences = np . array ( [ get_attr ( te , " conf " ) for te in textequivs ] , dtype = float )
confidences = np . array ( [ get_attr ( te , " conf " ) for te in textequivs ] , dtype = float )
@ -317,7 +258,7 @@ def get_first_textequiv(textequivs, segment_id):
" No index attributes, use ' conf ' attribute to sort TextEquiv in %s . " ,
" No index attributes, use ' conf ' attribute to sort TextEquiv in %s . " ,
segment_id ,
segment_id ,
)
)
index = int ( np . nanargmax ( confidences ) )
index = np . nanargmax ( confidences )
else :
else :
# fallback to first entry in case of neither index or conf present
# fallback to first entry in case of neither index or conf present
log . warning ( " No index attributes, use first TextEquiv in %s . " , segment_id )
log . warning ( " No index attributes, use first TextEquiv in %s . " , segment_id )
@ -325,11 +266,11 @@ def get_first_textequiv(textequivs, segment_id):
return textequivs [ index ]
return textequivs [ index ]
def get_attr ( te : Any , attr_name : str ) - > float :
def get_attr ( te , attr_name ) - > float :
""" Extract the attribute for the given name.
""" Extract the attribute for the given name.
Note : currently only handles numeric values !
Note : currently only handles numeric values !
Other or non existen t values are encoded as np . nan .
Other or non existen d values are encoded as np . nan .
"""
"""
attr_value = te . attrib . get ( attr_name )
attr_value = te . attrib . get ( attr_name )
try :
try :