1
0
Fork 0
mirror of https://github.com/mikegerber/ocrd_calamari.git synced 2025-06-09 11:49:53 +02:00

Allow controlling of output hierarchy level, e.g. only line, not words+glyphs

This commit is contained in:
Gerber, Mike 2020-02-05 13:02:10 +01:00
parent 0f0bae18ba
commit ef3fb44fb5
4 changed files with 73 additions and 37 deletions

View file

@ -87,6 +87,7 @@ def test_word_segmentation(workspace):
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"textequiv_level": "word", # Note that we're going down to word level here
}
).process()
workspace.save_mets()
@ -106,5 +107,30 @@ def test_word_segmentation(workspace):
line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
assert words_text == line_text
# For extra measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word"
glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
assert len(glyphs) == 0
def test_glyphs(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"textequiv_level": "glyph", # Note that we're going down to glyph level here
}
).process()
workspace.save_mets()
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
tree = etree.parse(page1)
# The result should contain a lot of glyphs
glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
assert len(glyphs) >= 100
# vim:tw=120: