1
0
Fork 0
mirror of https://github.com/mikegerber/ocrd_calamari.git synced 2025-06-09 11:49:53 +02:00

📝 Document why we are using Unicode text segmentation to produce word results

This commit is contained in:
Gerber, Mike 2020-02-03 15:33:11 +01:00
parent 0a572df0ba
commit 91cca1e1b8

View file

@ -100,6 +100,11 @@ class CalamariRecognize(Processor):
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
# Save word results
#
# Calamari OCR does not provide word positions, so we infer them from (a) Unicode text
# segmentation and (b) the glyph positions. This is necessary because the PAGE XML format
# enforces a strict hierarchy of lines > words > glyphs.
def unwanted(c):
    """Return True when ``c`` equals a single ASCII space (U+0020).

    Only the plain space character matches; other whitespace (tab, newline,
    no-break space) and the empty string yield False.

    NOTE(review): presumably used to drop the spaces between words when
    glyph positions are mapped onto words -- the caller is not visible
    here, confirm against the surrounding method.
    """
    return c == " "