mirror of
https://github.com/mikegerber/ocrd_calamari.git
synced 2025-06-10 04:09:53 +02:00
🚧 Update higher TextEquiv levels
This commit is contained in:
parent
2561b67891
commit
0498f9551e
1 changed files with 37 additions and 0 deletions
|
@ -75,6 +75,8 @@ class CalamariRecognize(Processor):
|
|||
|
||||
line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf))
|
||||
|
||||
_page_update_higher_textequiv_levels('line', pcgts)
|
||||
|
||||
file_id = self._make_file_id(input_file, n)
|
||||
self.workspace.add_file(
|
||||
ID=file_id,
|
||||
|
@ -83,3 +85,38 @@ class CalamariRecognize(Processor):
|
|||
mimetype=MIMETYPE_PAGE,
|
||||
local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
|
||||
content=to_xml(pcgts))
|
||||
|
||||
|
||||
# TODO: This is a copy of ocrd_tesserocr's function and should probably be moved to an OCR-D library
|
||||
def _first_textequiv_unicode(element):
    """Return the text of the first TextEquiv of `element`, or u'' if unavailable.

    Both an element without any TextEquiv and a first TextEquiv whose
    ``Unicode`` is None yield the empty string, so the result is always safe
    to pass to ``str.join``.
    """
    textequivs = element.get_TextEquiv()
    if not textequivs:
        return u''
    # A TextEquiv may exist without text; joining None would raise TypeError.
    return textequivs[0].Unicode or u''


def _page_update_higher_textequiv_levels(level, pcgts):
    """Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.

    Starting with the hierarchy level chosen for processing,
    join all first TextEquiv (by the rules governing the respective level)
    into TextEquiv of the next higher level, replacing them.

    Joining rules: glyphs are concatenated without separator into the word,
    words are joined with a single space into the line, and lines are joined
    with a newline into the region. Elements lacking a TextEquiv (or lacking
    text in their first TextEquiv) contribute an empty string.

    Args:
        level: Name of the level whose TextEquivs are authoritative.
            'region' leaves everything untouched; 'line' rebuilds regions
            only; 'word' rebuilds lines and regions; any other value
            (e.g. 'glyph') rebuilds words, lines and regions.
        pcgts: PAGE document object providing ``get_Page().get_TextRegion()``.
            The region/line/word objects below it are modified in place.

    Returns:
        None.
    """
    regions = pcgts.get_Page().get_TextRegion()
    if level == 'region':
        # Regions are the top of the text hierarchy; nothing above to update.
        return

    for region in regions:
        lines = region.get_TextLine()
        if level != 'line':
            for line in lines:
                words = line.get_Word()
                if level != 'word':
                    for word in words:
                        word_unicode = u''.join(
                            _first_textequiv_unicode(glyph)
                            for glyph in word.get_Glyph())
                        # set_TextEquiv replaces the whole list: remove old
                        word.set_TextEquiv(
                            [TextEquivType(Unicode=word_unicode)])
                line_unicode = u' '.join(
                    _first_textequiv_unicode(word) for word in words)
                line.set_TextEquiv(
                    [TextEquivType(Unicode=line_unicode)])  # remove old
        region_unicode = u'\n'.join(
            _first_textequiv_unicode(line) for line in lines)
        region.set_TextEquiv(
            [TextEquivType(Unicode=region_unicode)])  # remove old
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue