mirror of
https://github.com/mikegerber/ocrd_calamari.git
synced 2025-06-08 19:29:53 +02:00
✨ Do word segmentation as expected by OCR-D PAGE specs
This commit is contained in:
parent
0f9c94e7dc
commit
6f4736f8e4
3 changed files with 28 additions and 25 deletions
12
README.md
12
README.md
|
@ -14,12 +14,12 @@ This processor only operates on the text line level and so needs a line segmenta
|
||||||
image) as its input.
|
image) as its input.
|
||||||
|
|
||||||
In addition to the line text it also outputs glyph segmentation including
|
In addition to the line text it also outputs glyph segmentation including
|
||||||
per-glyph confidence values and per-glyph alternative predictions as provided
|
per-glyph confidence values and per-glyph alternative predictions as provided by
|
||||||
by the Calamari OCR engine. Note that while Calamari does not provide word
|
the Calamari OCR engine. Note that while Calamari does not provide word
|
||||||
segmentation, this processor produces word segmentation inferred from Unicode
|
segmentation, this processor produces word segmentation inferred from text
|
||||||
text segmentation and the glyph positions. The provided glyph and word
|
segmentation and the glyph positions. The provided glyph and word segmentation
|
||||||
segmentation can be used for text extraction and highlighting, but is probably
|
can be used for text extraction and highlighting, but is probably not useful for
|
||||||
not useful for further image-based processing.
|
further image-based processing.
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,6 @@ import os
|
||||||
from glob import glob
|
from glob import glob
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import uniseg.wordbreak
|
|
||||||
from calamari_ocr.ocr import MultiPredictor
|
from calamari_ocr.ocr import MultiPredictor
|
||||||
from calamari_ocr.ocr.voting import voter_from_proto
|
from calamari_ocr.ocr.voting import voter_from_proto
|
||||||
from calamari_ocr.proto import VoterParams
|
from calamari_ocr.proto import VoterParams
|
||||||
|
@ -101,26 +100,32 @@ class CalamariRecognize(Processor):
|
||||||
|
|
||||||
# Save word results
|
# Save word results
|
||||||
#
|
#
|
||||||
# Calamari OCR does not provide word positions, so we infer word positions from a. Unicode text
|
# Calamari OCR does not provide word positions, so we infer word positions from (a) text segmentation
|
||||||
# segmentation and b. the glyph positions. This is necessary because the PAGE XML format enforces
|
# and (b) the glyph positions. This is necessary because the PAGE XML format enforces a strict
|
||||||
# a strict hierarchy of lines > words > glyphs.
|
# hierarchy of lines > words > glyphs.
|
||||||
|
|
||||||
def unwanted(c):
|
def _words(s):
|
||||||
"""
|
"""Split words based on spaces and include spaces as 'words'"""
|
||||||
Define unwanted characters
|
spaces = None
|
||||||
|
word = ''
|
||||||
Words only containing these e.g. whitespace characters are not considered as words.
|
for c in s:
|
||||||
"""
|
if c == ' ' and spaces is True:
|
||||||
return c == " "
|
word += c
|
||||||
|
elif c != ' ' and spaces is False:
|
||||||
|
word += c
|
||||||
|
else:
|
||||||
|
if word:
|
||||||
|
yield word
|
||||||
|
word = c
|
||||||
|
spaces = (c == ' ')
|
||||||
|
yield word
|
||||||
|
|
||||||
word_no = 0
|
word_no = 0
|
||||||
i = 0
|
i = 0
|
||||||
for word_text in uniseg.wordbreak.words(prediction.sentence):
|
|
||||||
# XXX Re-use word segmentation from dinglehopper, i.e. support private use characters
|
|
||||||
word_length = len(word_text)
|
|
||||||
do_not_include = all(unwanted(c) for c in word_text)
|
|
||||||
|
|
||||||
if not do_not_include:
|
for word_text in _words(prediction.sentence):
|
||||||
|
word_length = len(word_text)
|
||||||
|
if not all(c == ' ' for c in word_text):
|
||||||
word_positions = prediction.positions[i:i+word_length]
|
word_positions = prediction.positions[i:i+word_length]
|
||||||
word_start = word_positions[0].global_start
|
word_start = word_positions[0].global_start
|
||||||
word_end = word_positions[-1].global_end
|
word_end = word_positions[-1].global_end
|
||||||
|
@ -152,10 +157,9 @@ class CalamariRecognize(Processor):
|
||||||
word.add_Glyph(glyph)
|
word.add_Glyph(glyph)
|
||||||
|
|
||||||
line.add_Word(word)
|
line.add_Word(word)
|
||||||
|
word_no += 1
|
||||||
|
|
||||||
i += word_length
|
i += word_length
|
||||||
word_no += 1
|
|
||||||
|
|
||||||
|
|
||||||
_page_update_higher_textequiv_levels('line', pcgts)
|
_page_update_higher_textequiv_levels('line', pcgts)
|
||||||
|
|
|
@ -4,4 +4,3 @@ calamari-ocr == 0.3.5
|
||||||
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
|
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
|
||||||
click
|
click
|
||||||
ocrd >= 2.2.1
|
ocrd >= 2.2.1
|
||||||
uniseg
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue