mirror of
https://github.com/mikegerber/ocrd_calamari.git
synced 2025-06-09 11:49:53 +02:00
✨ Allow controlling of output hierarchy level, e.g. only line, not words+glyphs
This commit is contained in:
parent
0f0bae18ba
commit
ef3fb44fb5
4 changed files with 73 additions and 37 deletions
|
@ -87,6 +87,7 @@ def test_word_segmentation(workspace):
|
|||
output_file_grp="OCR-D-OCR-CALAMARI",
|
||||
parameter={
|
||||
"checkpoint": CHECKPOINT,
|
||||
"textequiv_level": "word", # Note that we're going down to word level here
|
||||
}
|
||||
).process()
|
||||
workspace.save_mets()
|
||||
|
@ -106,5 +107,30 @@ def test_word_segmentation(workspace):
|
|||
line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
|
||||
assert words_text == line_text
|
||||
|
||||
# For extra measure, check that we're not seeing any glyphs, as we asked for textequiv_level == "word"
|
||||
glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
|
||||
assert len(glyphs) == 0
|
||||
|
||||
|
||||
def test_glyphs(workspace):
    """Run recognition with ``textequiv_level`` set to ``"glyph"`` and check that many glyphs are written.

    The processor reads from ``OCR-D-GT-SEG-LINE`` and writes to ``OCR-D-OCR-CALAMARI``; the first
    output page is then parsed and its ``pc:Glyph`` elements are counted.
    """
    recognizer_parameters = {
        "checkpoint": CHECKPOINT,
        "textequiv_level": "glyph",  # Descend all the way to glyph level here
    }
    recognizer = CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter=recognizer_parameters,
    )
    recognizer.process()
    workspace.save_mets()

    # The first output page must have been written to the workspace directory.
    first_page_path = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(first_page_path)
    page_tree = etree.parse(first_page_path)

    # Glyph-level output was requested, so the page should contain a lot of glyphs.
    glyph_elements = page_tree.xpath("//pc:Glyph", namespaces=NSMAP)
    assert len(glyph_elements) >= 100
|
||||
|
||||
|
||||
# vim:tw=120:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue