🚧 Update higher TextEquiv levels

2025-07-28 03:19:54 +02:00 · 2019-08-08 16:28:08 +02:00 · 2019-08-08 16:28:08 +02:00 · 0498f9551e
commit 0498f9551e
parent 2561b67891
1 changed files with 37 additions and 0 deletions
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@ -75,6 +75,8 @@ class CalamariRecognize(Processor):
                    line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf))
            _page_update_higher_textequiv_levels('line', pcgts)
            file_id = self._make_file_id(input_file, n)
            self.workspace.add_file(
                ID=file_id,
@ -83,3 +85,38 @@ class CalamariRecognize(Processor):
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
 # TODO: This is a copy of ocrd_tesserocr's function and should probably be moved to an OCR-D library.
 def _page_update_higher_textequiv_levels(level, pcgts):
    """Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.

    Starting with the hierarchy level chosen for processing,
    join all first TextEquiv (by the rules governing the respective level)
    into TextEquiv of the next higher level, replacing them.

    Joining rules, as implemented below:
      * glyphs -> word:  concatenated without a separator
      * words  -> line:  joined with a single space
      * lines  -> region: joined with a newline

    An element without any TextEquiv contributes an empty string, and so
    does an element whose first TextEquiv carries no text (Unicode is None).

    Args:
        level: the hierarchy level that was just processed. 'region' leaves
            everything untouched, 'line' rebuilds regions only, 'word'
            rebuilds lines and regions; any other value (e.g. 'glyph')
            rebuilds words, lines and regions.
        pcgts: the PAGE document object; only the text regions returned
            directly by ``pcgts.get_Page().get_TextRegion()`` are visited.

    Returns:
        None. The TextEquiv lists of the affected elements are replaced
        in place by a single freshly built TextEquiv.
    """
    regions = pcgts.get_Page().get_TextRegion()
    if level == 'region':
        # Region is the topmost textual level handled here; nothing above it to update.
        return
    for region in regions:
        lines = region.get_TextLine()
        if level != 'line':
            for line in lines:
                words = line.get_Word()
                if level != 'word':
                    for word in words:
                        word_unicode = u''.join(
                            _first_textequiv_unicode(glyph)
                            for glyph in word.get_Glyph())
                        word.set_TextEquiv(
                            [TextEquivType(Unicode=word_unicode)])  # remove old
                line_unicode = u' '.join(
                    _first_textequiv_unicode(word) for word in words)
                line.set_TextEquiv(
                    [TextEquivType(Unicode=line_unicode)])  # remove old
        region_unicode = u'\n'.join(
            _first_textequiv_unicode(line) for line in lines)
        region.set_TextEquiv(
            [TextEquivType(Unicode=region_unicode)])  # remove old


 def _first_textequiv_unicode(element):
    """Return the text of the first TextEquiv of `element`, or u'' if there is none.

    Also maps a missing text value (Unicode is None) to u'': str.join()
    raises TypeError on non-string items, which would otherwise abort the
    update of the whole page because of a single empty annotation.
    """
    textequivs = element.get_TextEquiv()
    if not textequivs:
        return u''
    return textequivs[0].Unicode or u''