🎨 dinglehopper: Reformat using black

pull/46/head
Gerber, Mike 4 years ago
parent 31c63f9e4c
commit 14421c8e53
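
Black rewrites formatting only, not behaviour: string literals are normalized to double quotes, calls that exceed the default 88-character line length are exploded with one argument per line plus a trailing comma, and wrapped operators move to the start of the continuation line. A minimal sketch of reproducing one hunk of this commit through black's Python API (assumes the black package is installed; black.format_str and black.FileMode are its public entry points):

import black

# One pre-commit statement from cli.py (shown unindented), as it looked before this change.
OLD = (
    "char_diff_report = gen_diff_report(gt_text, ocr_text, "
    "css_prefix='c', joiner='', none='·')\n"
)

# Prints the double-quoted, 88-column-wrapped form that appears in the diff below.
print(black.format_str(OLD, mode=black.FileMode()))

The same style is usually verified with black --check, which exits non-zero as long as any file would still be reformatted.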

@ -1,2 +1 @@
__import__('pkg_resources').declare_namespace(__name__) __import__("pkg_resources").declare_namespace(__name__)

@ -3,8 +3,8 @@ from .edit_distance import *
def align(t1, t2): def align(t1, t2):
"""Align text.""" """Align text."""
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1))) s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2))) s2 = list(grapheme_clusters(unicodedata.normalize("NFC", t2)))
return seq_align(s1, s2) return seq_align(s1, s2)
@ -27,13 +27,13 @@ def seq_align(s1, s2):
pass pass
if o: if o:
if o[0] == 'insert': if o[0] == "insert":
yield None, s2[j] yield None, s2[j]
j += 1 j += 1
elif o[0] == 'delete': elif o[0] == "delete":
yield s1[i], None yield s1[i], None
i += 1 i += 1
elif o[0] == 'replace': elif o[0] == "replace":
yield s1[i], s2[j] yield s1[i], s2[j]
i += 1 i += 1
j += 1 j += 1

@ -19,19 +19,21 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
""" """
d = distance(reference, compared) d = distance(reference, compared)
n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
if d == 0: if d == 0:
return 0, n return 0, n
if n == 0: if n == 0:
return float('inf'), n return float("inf"), n
return d/n, n return d / n, n
# XXX Should we really count newlines here? # XXX Should we really count newlines here?
@multimethod @multimethod
-def character_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
+def character_error_rate_n(
+    reference: ExtractedText, compared: ExtractedText
+) -> Tuple[float, int]:
return character_error_rate_n(reference.text, compared.text) return character_error_rate_n(reference.text, compared.text)

@ -12,16 +12,17 @@ from .extracted_text import ExtractedText
from .ocr_files import extract from .ocr_files import extract
from .config import Config from .config import Config
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
gtx = '' gtx = ""
ocrx = '' ocrx = ""
def format_thing(t, css_classes=None, id_=None): def format_thing(t, css_classes=None, id_=None):
if t is None: if t is None:
html_t = none html_t = none
css_classes += ' ellipsis' css_classes += " ellipsis"
elif t == '\n': elif t == "\n":
html_t = '<br>' html_t = "<br>"
else: else:
html_t = escape(t) html_t = escape(t)
@ -32,9 +33,13 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_) html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
if css_classes: if css_classes:
-            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
+            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
+                css_classes=css_classes,
+                html_t=html_t,
+                html_custom_attrs=html_custom_attrs,
+            )
else: else:
return '{html_t}'.format(html_t=html_t) return "{html_t}".format(html_t=html_t)
if isinstance(gt_in, ExtractedText): if isinstance(gt_in, ExtractedText):
if not isinstance(ocr_in, ExtractedText): if not isinstance(ocr_in, ExtractedText):
@ -46,8 +51,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
gt_things = gt_in gt_things = gt_in
ocr_things = ocr_in ocr_things = ocr_in
g_pos = 0 g_pos = 0
o_pos = 0 o_pos = 0
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)): for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
@ -55,7 +58,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
gt_id = None gt_id = None
ocr_id = None ocr_id = None
if g != o: if g != o:
css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k) css_classes = "{css_prefix}diff{k} diff".format(css_prefix=css_prefix, k=k)
if isinstance(gt_in, ExtractedText): if isinstance(gt_in, ExtractedText):
gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
@ -70,17 +73,17 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
if o is not None: if o is not None:
o_pos += len(o) o_pos += len(o)
return """
return \
'''
<div class="row"> <div class="row">
<div class="col-md-6 gt">{}</div> <div class="col-md-6 gt">{}</div>
<div class="col-md-6 ocr">{}</div> <div class="col-md-6 ocr">{}</div>
</div> </div>
'''.format(gtx, ocrx) """.format(
gtx, ocrx
)
def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'): def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
"""Check OCR result against GT. """Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
@ -93,36 +96,47 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
cer, n_characters = character_error_rate_n(gt_text, ocr_text) cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text)
-    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
+    char_diff_report = gen_diff_report(
+        gt_text, ocr_text, css_prefix="c", joiner="", none="·"
+    )
     gt_words = words_normalized(gt_text)
     ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='')
+    word_diff_report = gen_diff_report(
+        gt_words, ocr_words, css_prefix="w", joiner=" ", none=""
+    )
def json_float(value): def json_float(value):
"""Convert a float value to an JSON float. """Convert a float value to an JSON float.
This is here so that float('inf') yields "Infinity", not "inf". This is here so that float('inf') yields "Infinity", not "inf".
""" """
if value == float('inf'): if value == float("inf"):
return 'Infinity' return "Infinity"
elif value == float('-inf'): elif value == float("-inf"):
return '-Infinity' return "-Infinity"
else: else:
return str(value) return str(value)
-    env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
-    env.filters['json_float'] = json_float
+    env = Environment(
+        loader=FileSystemLoader(
+            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
+        )
+    )
+    env.filters["json_float"] = json_float
for report_suffix in ('.html', '.json'): for report_suffix in (".html", ".json"):
template_fn = 'report' + report_suffix + '.j2' template_fn = "report" + report_suffix + ".j2"
out_fn = report_prefix + report_suffix out_fn = report_prefix + report_suffix
template = env.get_template(template_fn) template = env.get_template(template_fn)
template.stream( template.stream(
-            gt=gt, ocr=ocr,
-            cer=cer, n_characters=n_characters,
-            wer=wer, n_words=n_words,
+            gt=gt,
+            ocr=ocr,
+            cer=cer,
+            n_characters=n_characters,
+            wer=wer,
+            n_words=n_words,
char_diff_report=char_diff_report, char_diff_report=char_diff_report,
word_diff_report=word_diff_report, word_diff_report=word_diff_report,
metrics=metrics, metrics=metrics,
@ -130,12 +144,19 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
@click.command() @click.command()
@click.argument('gt', type=click.Path(exists=True)) @click.argument("gt", type=click.Path(exists=True))
@click.argument('ocr', type=click.Path(exists=True)) @click.argument("ocr", type=click.Path(exists=True))
@click.argument('report_prefix', type=click.Path(), default='report') @click.argument("report_prefix", type=click.Path(), default="report")
-@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
-@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
-@click.option('--progress', default=False, is_flag=True, help='Show progress bar')
+@click.option(
+    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
+)
+@click.option(
+    "--textequiv-level",
+    default="region",
+    help="PAGE TextEquiv level to extract text from",
+    metavar="LEVEL",
+)
+@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
""" """
Compare the PAGE/ALTO/text document GT against the document OCR. Compare the PAGE/ALTO/text document GT against the document OCR.
@ -159,5 +180,5 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level) process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
if __name__ == '__main__': if __name__ == "__main__":
main() main()

@ -7,8 +7,13 @@ from .ocr_files import extract
@click.command() @click.command()
@click.argument('input_file', type=click.Path(exists=True)) @click.argument("input_file", type=click.Path(exists=True))
-@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
+@click.option(
+    "--textequiv-level",
+    default="region",
+    help="PAGE TextEquiv level to extract text from",
+    metavar="LEVEL",
+)
def main(input_file, textequiv_level): def main(input_file, textequiv_level):
""" """
Extract the text of the given INPUT_FILE. Extract the text of the given INPUT_FILE.
@ -23,5 +28,5 @@ def main(input_file, textequiv_level):
print(input_text) print(input_text)
if __name__ == '__main__': if __name__ == "__main__":
main() main()

@ -48,9 +48,10 @@ def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
for i in tqdm(from_to(1, m), disable=not Config.progress): for i in tqdm(from_to(1, m), disable=not Config.progress):
for j in from_to(1, n): for j in from_to(1, n):
D[i, j] = min( D[i, j] = min(
-                D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
+                D[i - 1, j - 1]
+                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
                 D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1  # Deletion
+                D[i - 1, j] + 1,  # Deletion
) )
return D return D
@ -81,8 +82,8 @@ def distance(s1: str, s2: str):
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
clusters. This should be the correct way to compare two Unicode strings. clusters. This should be the correct way to compare two Unicode strings.
""" """
seq1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
seq2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
return levenshtein(seq1, seq2) return levenshtein(seq1, seq2)
@ -106,11 +107,17 @@ def seq_editops(seq1, seq2):
def _tail_backtrace(i, j, accumulator): def _tail_backtrace(i, j, accumulator):
if i > 0 and D[i - 1, j] + 1 == D[i, j]: if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)
+            return partial(
+                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
+            )
         if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)
+            return partial(
+                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
+            )
         if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)
+            return partial(
+                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
+            )
if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]: if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP
return accumulator return accumulator
@ -132,6 +139,6 @@ def editops(word1, word2):
Note that this returns indices to the _grapheme clusters_, not characters! Note that this returns indices to the _grapheme clusters_, not characters!
""" """
word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1))) word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2))) word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
return seq_editops(word1, word2) return seq_editops(word1, word2)

@ -10,6 +10,7 @@ import numpy as np
from lxml import etree as ET from lxml import etree as ET
from ocrd_utils import getLogger from ocrd_utils import getLogger
class Normalization(enum.Enum): class Normalization(enum.Enum):
NFC = 1 NFC = 1
NFC_MUFI = 2 # TODO NFC_MUFI = 2 # TODO
@ -18,7 +19,7 @@ class Normalization(enum.Enum):
def normalize(text, normalization): def normalize(text, normalization):
if normalization == Normalization.NFC: if normalization == Normalization.NFC:
return unicodedata.normalize('NFC', text) return unicodedata.normalize("NFC", text)
if normalization == Normalization.NFC_MUFI: if normalization == Normalization.NFC_MUFI:
raise NotImplementedError() raise NotImplementedError()
if normalization == Normalization.NFC_SBB: if normalization == Normalization.NFC_SBB:
@ -36,31 +37,31 @@ def unjoin_ligatures(s):
"""Unjoin ligatures, i.e. ff becomes ff.""" """Unjoin ligatures, i.e. ff becomes ff."""
equivalences = { equivalences = {
'': 'ſſ', "": "ſſ",
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I "\ueba7": "ſſi", # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
'': 'ch', "": "ch",
'': 'ck', "": "ck",
'': 'll', "": "ll",
'': 'ſi', "": "ſi",
'': 'ſt', "": "ſt",
'': 'fi', "": "fi",
'': 'ff', "": "ff",
'': 'fl', "": "fl",
'': 'ffi', "": "ffi",
'': 'ct', "": "ct",
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ "": "tz", # MUFI: LATIN SMALL LIGATURE TZ
'\uf532': 'as', # eMOP: Latin small ligature as "\uf532": "as", # eMOP: Latin small ligature as
'\uf533': 'is', # eMOP: Latin small ligature is "\uf533": "is", # eMOP: Latin small ligature is
'\uf534': 'us', # eMOP: Latin small ligature us "\uf534": "us", # eMOP: Latin small ligature us
'\uf535': 'Qu', # eMOP: Latin ligature capital Q small u "\uf535": "Qu", # eMOP: Latin ligature capital Q small u
'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ "ij": "ij", # U+0133 LATIN SMALL LIGATURE IJ
'\uE8BF': 'q&', "\uE8BF": "q&",
# MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
# XXX How to replace this correctly? # XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P "\uEBA5": "ſp", # MUFI: LATIN SMALL LIGATURE LONG S P
'': 'st', # U+FB06 LATIN SMALL LIGATURE ST "": "st", # U+FB06 LATIN SMALL LIGATURE ST
} }
s = unicodedata.normalize('NFC', s) s = unicodedata.normalize("NFC", s)
for fr, to in equivalences.items(): for fr, to in equivalences.items():
s = s.replace(fr, to) s = s.replace(fr, to)
return s return s
@ -70,20 +71,20 @@ def substitute_equivalences(s):
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR # It might make sense to use different rules for GT and for the different OCR
equivalences = { equivalences = {
'': 'ü', "": "ü",
'': 'ä', "": "ä",
'==': '', # → en-dash "==": "", # → en-dash
'': '', # em-dash → en-dash "": "", # em-dash → en-dash
'': 'ö', "": "ö",
'': '\'', "": "'",
'': '-', "": "-",
'': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E "": "ä", # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
'': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E "": "ö", # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
'': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E "": "ü", # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uF50E': '' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT "\uF50E": "", # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
} }
s = unicodedata.normalize('NFC', s) s = unicodedata.normalize("NFC", s)
s = unjoin_ligatures(s) s = unjoin_ligatures(s)
for fr, to in equivalences.items(): for fr, to in equivalences.items():
s = s.replace(fr, to) s = s.replace(fr, to)
@ -115,13 +116,14 @@ class ExtractedText:
Objects of this class are guaranteed to be a. always in their normalization Objects of this class are guaranteed to be a. always in their normalization
and b. in NFC. and b. in NFC.
""" """
segment_id = attr.ib(type=Optional[str]) segment_id = attr.ib(type=Optional[str])
@segment_id.validator @segment_id.validator
def check(self, _, value): def check(self, _, value):
if value is None: if value is None:
return return
if not re.match(r'[\w\d_-]+', value): if not re.match(r"[\w\d_-]+", value):
raise ValueError('Malformed segment id "{}"'.format(value)) raise ValueError('Malformed segment id "{}"'.format(value))
# An object contains either # An object contains either
@ -141,7 +143,7 @@ class ExtractedText:
def check(self, _, value): def check(self, _, value):
if value is not None and self.segments is not None: if value is not None and self.segments is not None:
raise ValueError("Can't have both segments and text") raise ValueError("Can't have both segments and text")
if value is not None and unicodedata.normalize('NFC', value) != value: if value is not None and unicodedata.normalize("NFC", value) != value:
raise ValueError('String "{}" is not in NFC.'.format(value)) raise ValueError('String "{}" is not in NFC.'.format(value))
if value is not None and normalize(value, self.normalization) != value: if value is not None and normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value)) raise ValueError('String "{}" is not normalized.'.format(value))
@ -169,31 +171,24 @@ class ExtractedText:
seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))] seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
segment_id_for_pos.extend(seg_ids) segment_id_for_pos.extend(seg_ids)
segment_id_for_pos.extend(repeat(None, len(self.joiner))) segment_id_for_pos.extend(repeat(None, len(self.joiner)))
segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)] segment_id_for_pos = segment_id_for_pos[: -len(self.joiner)]
# This is frozen, so we have to jump through the hoop: # This is frozen, so we have to jump through the hoop:
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) object.__setattr__(self, "_segment_id_for_pos", segment_id_for_pos)
assert self._segment_id_for_pos assert self._segment_id_for_pos
return self._segment_id_for_pos[pos] return self._segment_id_for_pos[pos]
@classmethod @classmethod
def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'): def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"):
"""Build an ExtractedText from a PAGE content text element""" """Build an ExtractedText from a PAGE content text element"""
-        localname_for_textequiv_level = {
-            'region': 'TextRegion',
-            'line': 'TextLine'
-        }
+        localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}
         textequiv_level_for_localname = invert_dict(localname_for_textequiv_level)
-        children_for_localname = {
-            'TextRegion': 'TextLine'
-        }
-        joiner_for_textequiv_level = {
-            'line': '\n'
-        }
+        children_for_localname = {"TextRegion": "TextLine"}
+        joiner_for_textequiv_level = {"line": "\n"}
segment_id = text_segment.attrib['id'] segment_id = text_segment.attrib["id"]
localname = ET.QName(text_segment).localname localname = ET.QName(text_segment).localname
if localname == localname_for_textequiv_level[textequiv_level]: if localname == localname_for_textequiv_level[textequiv_level]:
segment_text = None segment_text = None
@ -201,19 +196,20 @@ class ExtractedText:
segment_text = get_textequiv_unicode(text_segment, nsmap) segment_text = get_textequiv_unicode(text_segment, nsmap)
# FIXME hardcoded SBB normalization # FIXME hardcoded SBB normalization
segment_text = normalize_sbb(segment_text) segment_text = normalize_sbb(segment_text)
segment_text = segment_text or '' segment_text = segment_text or ""
return cls(segment_id, None, None, segment_text) return cls(segment_id, None, None, segment_text)
else: else:
# Recurse # Recurse
sub_localname = children_for_localname[localname] sub_localname = children_for_localname[localname]
sub_textequiv_level = textequiv_level_for_localname[sub_localname] sub_textequiv_level = textequiv_level_for_localname[sub_localname]
segments = [] segments = []
-            for sub_segment in text_segment.iterfind('./page:%s' % sub_localname,
-                                                     namespaces=nsmap):
+            for sub_segment in text_segment.iterfind(
+                "./page:%s" % sub_localname, namespaces=nsmap
+            ):
                 segments.append(
                     ExtractedText.from_text_segment(
-                        sub_segment, nsmap,
-                        textequiv_level=sub_textequiv_level)
+                        sub_segment, nsmap, textequiv_level=sub_textequiv_level
+                    )
                 )
joiner = joiner_for_textequiv_level[sub_textequiv_level] joiner = joiner_for_textequiv_level[sub_textequiv_level]
return cls(segment_id, segments, joiner, None) return cls(segment_id, segments, joiner, None)
@ -231,24 +227,24 @@ def invert_dict(d):
def get_textequiv_unicode(text_segment, nsmap) -> str: def get_textequiv_unicode(text_segment, nsmap) -> str:
"""Get the TextEquiv/Unicode text of the given PAGE text element.""" """Get the TextEquiv/Unicode text of the given PAGE text element."""
segment_id = text_segment.attrib['id'] segment_id = text_segment.attrib["id"]
textequivs = text_segment.findall('./page:TextEquiv', namespaces=nsmap) textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)
if not textequivs: if not textequivs:
return '' return ""
textequiv = get_first_textequiv(textequivs, segment_id) textequiv = get_first_textequiv(textequivs, segment_id)
return textequiv.find('./page:Unicode', namespaces=nsmap).text or '' return textequiv.find("./page:Unicode", namespaces=nsmap).text or ""
def get_first_textequiv(textequivs, segment_id): def get_first_textequiv(textequivs, segment_id):
"""Get the first TextEquiv based on index or conf order if index is not present.""" """Get the first TextEquiv based on index or conf order if index is not present."""
log = getLogger('processor.OcrdDinglehopperEvaluate') log = getLogger("processor.OcrdDinglehopperEvaluate")
if len(textequivs) == 1: if len(textequivs) == 1:
return textequivs[0] return textequivs[0]
# try ordering by index # try ordering by index
indices = np.array([get_attr(te, 'index') for te in textequivs], dtype=float) indices = np.array([get_attr(te, "index") for te in textequivs], dtype=float)
nan_mask = np.isnan(indices) nan_mask = np.isnan(indices)
if np.any(~nan_mask): if np.any(~nan_mask):
if np.any(nan_mask): if np.any(nan_mask):
@ -256,10 +252,12 @@ def get_first_textequiv(textequivs, segment_id):
index = np.nanargmin(indices) index = np.nanargmin(indices)
else: else:
# try ordering by conf # try ordering by conf
confidences = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float) confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
if np.any(~np.isnan(confidences)): if np.any(~np.isnan(confidences)):
log.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.", log.info(
segment_id) "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
segment_id,
)
index = np.nanargmax(confidences) index = np.nanargmax(confidences)
else: else:
# fallback to first entry in case of neither index or conf present # fallback to first entry in case of neither index or conf present

@ -17,24 +17,27 @@ def alto_namespace(tree: ET.ElementTree) -> str:
check if the files uses any valid ALTO namespace. check if the files uses any valid ALTO namespace.
""" """
root_name = ET.QName(tree.getroot().tag) root_name = ET.QName(tree.getroot().tag)
if root_name.localname == 'alto': if root_name.localname == "alto":
return root_name.namespace return root_name.namespace
else: else:
raise ValueError('Not an ALTO tree') raise ValueError("Not an ALTO tree")
def alto_extract_lines(tree: ET.ElementTree) -> Generator[ExtractedText, None, None]: def alto_extract_lines(tree: ET.ElementTree) -> Generator[ExtractedText, None, None]:
nsmap = {'alto': alto_namespace(tree)} nsmap = {"alto": alto_namespace(tree)}
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap): for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
line_id = line.attrib.get('ID') line_id = line.attrib.get("ID")
-        line_text = ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
+        line_text = " ".join(
+            string.attrib.get("CONTENT")
+            for string in line.iterfind("alto:String", namespaces=nsmap)
+        )
yield ExtractedText(line_id, None, None, normalize_sbb(line_text)) yield ExtractedText(line_id, None, None, normalize_sbb(line_text))
# FIXME hardcoded SBB normalization # FIXME hardcoded SBB normalization
def alto_extract(tree: ET.ElementTree()) -> ExtractedText: def alto_extract(tree: ET.ElementTree()) -> ExtractedText:
"""Extract text from the given ALTO ElementTree.""" """Extract text from the given ALTO ElementTree."""
return ExtractedText(None, list(alto_extract_lines(tree)), '\n', None) return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None)
def alto_text(tree): def alto_text(tree):
@ -48,56 +51,73 @@ def page_namespace(tree):
do not check if the files uses any valid PAGE namespace. do not check if the files uses any valid PAGE namespace.
""" """
root_name = ET.QName(tree.getroot().tag) root_name = ET.QName(tree.getroot().tag)
if root_name.localname == 'PcGts': if root_name.localname == "PcGts":
return root_name.namespace return root_name.namespace
else: else:
raise ValueError('Not a PAGE tree') raise ValueError("Not a PAGE tree")
def page_extract(tree, *, textequiv_level='region'): def page_extract(tree, *, textequiv_level="region"):
"""Extract text from the given PAGE content ElementTree.""" """Extract text from the given PAGE content ElementTree."""
# Internally, this is just parsing the Reading Order (if it exists) and # Internally, this is just parsing the Reading Order (if it exists) and
# and leaves reading the TextRegions to ExtractedText.from_text_segment(). # and leaves reading the TextRegions to ExtractedText.from_text_segment().
nsmap = {'page': page_namespace(tree)} nsmap = {"page": page_namespace(tree)}
regions = [] regions = []
reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap) reading_order = tree.find(".//page:ReadingOrder", namespaces=nsmap)
if reading_order is not None: if reading_order is not None:
for group in reading_order.iterfind('./*', namespaces=nsmap): for group in reading_order.iterfind("./*", namespaces=nsmap):
if ET.QName(group.tag).localname == 'OrderedGroup': if ET.QName(group.tag).localname == "OrderedGroup":
-                region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap)
-                for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: int(r.attrib['index'])):
-                    region_id = region_ref_indexed.attrib['regionRef']
-                    region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
+                region_ref_indexeds = group.findall(
+                    "./page:RegionRefIndexed", namespaces=nsmap
+                )
+                for region_ref_indexed in sorted(
+                    region_ref_indexeds, key=lambda r: int(r.attrib["index"])
+                ):
+                    region_id = region_ref_indexed.attrib["regionRef"]
+                    region = tree.find(
+                        './/page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap
+                    )
                     if region is not None:
-                        regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
+                        regions.append(
+                            ExtractedText.from_text_segment(
+                                region, nsmap, textequiv_level=textequiv_level
+                            )
+                        )
else: else:
pass # Not a TextRegion pass # Not a TextRegion
else: else:
raise NotImplementedError raise NotImplementedError
else: else:
for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): for region in tree.iterfind(".//page:TextRegion", namespaces=nsmap):
-            regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
+            regions.append(
+                ExtractedText.from_text_segment(
+                    region, nsmap, textequiv_level=textequiv_level
+                )
+            )
# Filter empty region texts # Filter empty region texts
regions = [r for r in regions if r.text != ''] regions = [r for r in regions if r.text != ""]
return ExtractedText(None, regions, '\n', None) return ExtractedText(None, regions, "\n", None)
def page_text(tree, *, textequiv_level='region'): def page_text(tree, *, textequiv_level="region"):
return page_extract(tree, textequiv_level=textequiv_level).text return page_extract(tree, textequiv_level=textequiv_level).text
def plain_extract(filename): def plain_extract(filename):
with open(filename, 'r') as f: with open(filename, "r") as f:
return ExtractedText( return ExtractedText(
None, None,
-            [ExtractedText('line %d' % no, None, None, line) for no, line in enumerate(f.readlines())],
-            '\n',
-            None
+            [
+                ExtractedText("line %d" % no, None, None, line)
+                for no, line in enumerate(f.readlines())
+            ],
+            "\n",
+            None,
         )
@ -105,7 +125,7 @@ def plain_text(filename):
return plain_extract(filename).text return plain_extract(filename).text
def extract(filename, *, textequiv_level='region'): def extract(filename, *, textequiv_level="region"):
"""Extract the text from the given file. """Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text. Supports PAGE, ALTO and falls back to plain text.
@ -124,5 +144,5 @@ def text(filename):
return extract(filename).text return extract(filename).text
if __name__ == '__main__': if __name__ == "__main__":
print(text(sys.argv[1])) print(text(sys.argv[1]))

@ -10,7 +10,7 @@ from pkg_resources import resource_string
from .cli import process as cli_process from .cli import process as cli_process
from .edit_distance import levenshtein_matrix_cache_clear from .edit_distance import levenshtein_matrix_cache_clear
OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@click.command() @click.command()
@ -20,20 +20,19 @@ def ocrd_dinglehopper(*args, **kwargs):
class OcrdDinglehopperEvaluate(Processor): class OcrdDinglehopperEvaluate(Processor):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper'] kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
def process(self): def process(self):
assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR') assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
assert_file_grp_cardinality(self.output_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1)
log = getLogger('processor.OcrdDinglehopperEvaluate') log = getLogger("processor.OcrdDinglehopperEvaluate")
metrics = self.parameter['metrics'] metrics = self.parameter["metrics"]
textequiv_level = self.parameter['textequiv_level'] textequiv_level = self.parameter["textequiv_level"]
gt_grp, ocr_grp = self.input_file_grp.split(',') gt_grp, ocr_grp = self.input_file_grp.split(",")
input_file_tuples = self._zip_input_files([gt_grp, ocr_grp]) input_file_tuples = self._zip_input_files([gt_grp, ocr_grp])
for n, (gt_file, ocr_file) in enumerate(input_file_tuples): for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
@ -59,36 +58,43 @@ class OcrdDinglehopperEvaluate(Processor):
ocr_file.local_filename, ocr_file.local_filename,
report_prefix, report_prefix,
metrics=metrics, metrics=metrics,
textequiv_level=textequiv_level textequiv_level=textequiv_level,
) )
# Add reports to the workspace # Add reports to the workspace
-            for report_suffix, mimetype in \
-                    [
-                        ['.html', 'text/html'],
-                        ['.json', 'application/json']
-                    ]:
+            for report_suffix, mimetype in [
+                [".html", "text/html"],
+                [".json", "application/json"],
+            ]:
self.workspace.add_file( self.workspace.add_file(
ID=file_id + report_suffix, ID=file_id + report_suffix,
file_grp=self.output_file_grp, file_grp=self.output_file_grp,
pageId=page_id, pageId=page_id,
mimetype=mimetype, mimetype=mimetype,
-                    local_filename=report_prefix + report_suffix)
+                    local_filename=report_prefix + report_suffix,
+                )
# Clear cache between files # Clear cache between files
levenshtein_matrix_cache_clear() levenshtein_matrix_cache_clear()
def _zip_input_files(self, input_file_grps): def _zip_input_files(self, input_file_grps):
log = getLogger('processor.OcrdDinglehopperEvaluate') log = getLogger("processor.OcrdDinglehopperEvaluate")
input_file_tuples = list() input_file_tuples = list()
-        for page_id in ([self.page_id] if self.page_id else
-                        self.workspace.mets.physical_pages):
+        for page_id in (
+            [self.page_id] if self.page_id else self.workspace.mets.physical_pages
+        ):
ifiles = list() ifiles = list()
for input_file_grp in input_file_grps: for input_file_grp in input_file_grps:
log.debug("Adding input file group %s to page %s", input_file_grp, page_id) log.debug(
files = self.workspace.mets.find_all_files(pageId=page_id, fileGrp=input_file_grp) "Adding input file group %s to page %s", input_file_grp, page_id
)
files = self.workspace.mets.find_all_files(
pageId=page_id, fileGrp=input_file_grp
)
if not files: if not files:
log.error('Found no page "%s" in file group %s', page_id, input_file_grp) log.error(
'Found no page "%s" in file group %s', page_id, input_file_grp
)
ifiles.append(None) ifiles.append(None)
else: else:
ifiles.append(files[0]) ifiles.append(files[0])
@ -97,5 +103,5 @@ class OcrdDinglehopperEvaluate(Processor):
return input_file_tuples return input_file_tuples
if __name__ == '__main__': if __name__ == "__main__":
ocrd_dinglehopper() ocrd_dinglehopper()

@ -10,25 +10,30 @@ from .. import seq_align, ExtractedText
def test_text(): def test_text():
-    test1 = ExtractedText(None, [
-        ExtractedText('s0', None, None, 'foo'),
-        ExtractedText('s1', None, None, 'bar'),
-        ExtractedText('s2', None, None, 'bazinga')
-    ], ' ', None)
-    assert test1.text == 'foo bar bazinga'
-    assert test1.segment_id_for_pos(0) == 's0'
+    test1 = ExtractedText(
+        None,
+        [
+            ExtractedText("s0", None, None, "foo"),
+            ExtractedText("s1", None, None, "bar"),
+            ExtractedText("s2", None, None, "bazinga"),
+        ],
+        " ",
+        None,
+    )
+    assert test1.text == "foo bar bazinga"
+    assert test1.segment_id_for_pos(0) == "s0"
assert test1.segment_id_for_pos(3) is None assert test1.segment_id_for_pos(3) is None
assert test1.segment_id_for_pos(10) == 's2' assert test1.segment_id_for_pos(10) == "s2"
def test_normalization_check(): def test_normalization_check():
with pytest.raises(ValueError, match=r'.*is not in NFC.*'): with pytest.raises(ValueError, match=r".*is not in NFC.*"):
ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ')) ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ')) assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))
AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id') AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
def test_align(): def test_align():
@ -39,25 +44,36 @@ def test_align():
not Python characters. not Python characters.
""" """
-    test1 = ExtractedText(None, [
-        ExtractedText('s0', None, None, 'foo'),
-        ExtractedText('s1', None, None, 'bar'),
-        ExtractedText('s2', None, None, 'batzinga')
-    ], ' ', None)
-    test2 = ExtractedText(None, [
-        ExtractedText('x0', None, None, 'foo'),
-        ExtractedText('x1', None, None, 'bar'),
-        # extra .
-        ExtractedText('x2', None, None, '.'),
-        # deletion + different grapheme cluster, m̃ also is two Python characters
-        ExtractedText('x3', None, None, 'bazim̃ga'),
-    ], ' ', None)
+    test1 = ExtractedText(
+        None,
+        [
+            ExtractedText("s0", None, None, "foo"),
+            ExtractedText("s1", None, None, "bar"),
+            ExtractedText("s2", None, None, "batzinga"),
+        ],
+        " ",
+        None,
+    )
+    test2 = ExtractedText(
+        None,
+        [
+            ExtractedText("x0", None, None, "foo"),
+            ExtractedText("x1", None, None, "bar"),
+            # extra .
+            ExtractedText("x2", None, None, "."),
+            # deletion + different grapheme cluster, m̃ also is two Python characters
+            ExtractedText("x3", None, None, "bazim̃ga"),
+        ],
+        " ",
+        None,
+    )
left_pos = 0 left_pos = 0
right_pos = 0 right_pos = 0
alignment = [] alignment = []
-    for left, right in seq_align(grapheme_clusters(test1.text),
-                                 grapheme_clusters(test2.text)):
+    for left, right in seq_align(
+        grapheme_clusters(test1.text), grapheme_clusters(test2.text)
+    ):
left_id = test1.segment_id_for_pos(left_pos) if left is not None else None left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
right_id = test2.segment_id_for_pos(right_pos) if right is not None else None right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
el = AlignmentElement(left, right, left_id, right_id) el = AlignmentElement(left, right, left_id, right_id)
@ -67,46 +83,57 @@ def test_align():
if right is not None: if right is not None:
right_pos += len(right) right_pos += len(right)
print('test1: {}'.format(test1.text)) print("test1: {}".format(test1.text))
print('test2: {}'.format(test2.text)) print("test2: {}".format(test2.text))
assert alignment[0] == ('f', 'f', 's0', 'x0') assert alignment[0] == ("f", "f", "s0", "x0")
assert alignment[8] == (None, '.', None, 'x2') assert alignment[8] == (None, ".", None, "x2")
assert alignment[12] == ('t', None, 's2', None) assert alignment[12] == ("t", None, "s2", None)
assert alignment[15] == ('n', '', 's2', 'x3') assert alignment[15] == ("n", "", "s2", "x3")
@pytest.mark.parametrize("attributes,expected_index,expected_log", [ @pytest.mark.parametrize(
"attributes,expected_index,expected_log",
[
([], None, None), ([], None, None),
(['index="0"'], 0, None), (['index="0"'], 0, None),
([''], 0, None), ([""], 0, None),
(['conf="0.5"'], 0, None), (['conf="0.5"'], 0, None),
(['index="1"', 'index="0"'], 1, None), (['index="1"', 'index="0"'], 1, None),
(['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"), (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
(['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2, (
"No index attributes, use 'conf' attribute to sort TextEquiv"), ['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
(['index="0"', ''], 0, "TextEquiv without index"), 2,
(['', 'conf="0.4"'], 1, "No index attributes, use 'conf' attribute to sort TextEquiv",
"No index attributes, use 'conf' attribute to sort TextEquiv"), ),
(['', ''], 0, "No index attributes, use first TextEquiv"), (['index="0"', ""], 0, "TextEquiv without index"),
]) (
["", 'conf="0.4"'],
1,
"No index attributes, use 'conf' attribute to sort TextEquiv",
),
(["", ""], 0, "No index attributes, use first TextEquiv"),
],
)
def test_textequiv(attributes, expected_index, expected_log, caplog): def test_textequiv(attributes, expected_index, expected_log, caplog):
"""Test that extracting text from a PAGE TextEquiv is working without index attr.""" """Test that extracting text from a PAGE TextEquiv is working without index attr."""
caplog.set_level(logging.INFO) caplog.set_level(logging.INFO)
xml = "<?xml version=\"1.0\"?>" xml = '<?xml version="1.0"?>'
ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
text = ["Text {0}".format(i) for i in range(len(attributes) + 1)] text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]
equiv = ["<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i]) equiv = [
for i, attr in enumerate(attributes)] "<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
for i, attr in enumerate(attributes)
]
textline = "{0}<TextLine id=\"l3\" xmlns=\"{1}\">{2}</TextLine>" textline = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'
textline = textline.format(xml, ns, ''.join(equiv)) textline = textline.format(xml, ns, "".join(equiv))
root = ET.fromstring(textline) root = ET.fromstring(textline)
-    result = ExtractedText.from_text_segment(root,
-                                             {'page': ns},
-                                             textequiv_level='line').text
+    result = ExtractedText.from_text_segment(
+        root, {"page": ns}, textequiv_level="line"
+    ).text
if expected_index is None: if expected_index is None:
assert not result assert not result
else: else:

@ -3,64 +3,85 @@ from .. import align, seq_align, distance
def test_left_empty(): def test_left_empty():
result = list(align('', 'foo')) result = list(align("", "foo"))
expected = [(None, 'f'), (None, 'o'), (None, 'o')] expected = [(None, "f"), (None, "o"), (None, "o")]
assert result == expected assert result == expected
def test_right_empty(): def test_right_empty():
result = list(align('foo', '')) result = list(align("foo", ""))
expected = [('f', None), ('o', None), ('o', None)] expected = [("f", None), ("o", None), ("o", None)]
assert result == expected assert result == expected
def test_left_longer(): def test_left_longer():
result = list(align('food', 'foo')) result = list(align("food", "foo"))
expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)] expected = [("f", "f"), ("o", "o"), ("o", "o"), ("d", None)]
assert result == expected assert result == expected
def test_right_longer(): def test_right_longer():
result = list(align('foo', 'food')) result = list(align("foo", "food"))
expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')] expected = [("f", "f"), ("o", "o"), ("o", "o"), (None, "d")]
assert result == expected assert result == expected
def test_some_diff(): def test_some_diff():
result = list(align('abcde', 'aaadef')) result = list(align("abcde", "aaadef"))
left, right = unzip(result) left, right = unzip(result)
assert list(left) == ['a', 'b', 'c', 'd', 'e', None] assert list(left) == ["a", "b", "c", "d", "e", None]
assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f'] assert list(right) == ["a", "a", "a", "d", "e", "f"]
def test_longer(): def test_longer():
s1 = 'Dies ist eine Tst!' s1 = "Dies ist eine Tst!"
s2 = 'Dies ist ein Test.' s2 = "Dies ist ein Test."
result = list(align(s1, s2)) # ; diffprint(*unzip(result)) result = list(align(s1, s2)) # ; diffprint(*unzip(result))
-    expected = [('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '),
-                ('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '),
-                ('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '),
-                ('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.')]
+    expected = [
+        ("D", "D"),
+        ("i", "i"),
+        ("e", "e"),
+        ("s", "s"),
+        (" ", " "),
+        ("i", "i"),
+        ("s", "s"),
+        ("t", "t"),
+        (" ", " "),
+        ("e", "e"),
+        ("i", "i"),
+        ("n", "n"),
+        ("e", None),
+        (" ", " "),
+        ("T", "T"),
+        (None, "e"),
+        ("s", "s"),
+        ("t", "t"),
+        ("!", "."),
+    ]
assert result == expected assert result == expected
def test_completely_different(): def test_completely_different():
assert len(list(align('abcde', 'fghij'))) == 5 assert len(list(align("abcde", "fghij"))) == 5
def test_with_some_fake_ocr_errors(): def test_with_some_fake_ocr_errors():
-    result = list(align('Über die vielen Sorgen wegen desselben vergaß',
-                        'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab'))
+    result = list(
+        align(
+            "Über die vielen Sorgen wegen desselben vergaß",
+            "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
+        )
+    )
left, right = unzip(result) left, right = unzip(result)
# Beginning # Beginning
assert list(left[:18]) == [None]*18 assert list(left[:18]) == [None] * 18
assert list(right[:18]) == list('SomeJunk MoreJunk ') assert list(right[:18]) == list("SomeJunk MoreJunk ")
# End # End
assert list(left[-1:]) == ['ß'] assert list(left[-1:]) == ["ß"]
assert list(right[-1:]) == ['b'] assert list(right[-1:]) == ["b"]
def test_lines(): def test_lines():
@ -68,13 +89,30 @@ def test_lines():
This mainly serves as documentation for comparing lists of lines. This mainly serves as documentation for comparing lists of lines.
""" """
-    result = list(seq_align(
-        ['This is a line.', 'This is another', 'And the last line'],
-        ['This is a line.', 'This is another', 'J u n k', 'And the last line']
-    ))
+    result = list(
+        seq_align(
+            ["This is a line.", "This is another", "And the last line"],
+            [
+                "This is a line.",
+                "This is another",
+                "J u n k",
+                "And the last line",
+            ],
+        )
+    )
left, right = unzip(result) left, right = unzip(result)
-    assert list(left) == ['This is a line.', 'This is another', None, 'And the last line']
-    assert list(right) == ['This is a line.', 'This is another', 'J u n k', 'And the last line']
+    assert list(left) == [
+        "This is a line.",
+        "This is another",
+        None,
+        "And the last line",
+    ]
+    assert list(right) == [
+        "This is a line.",
+        "This is another",
+        "J u n k",
+        "And the last line",
+    ]
def test_lines_similar(): def test_lines_similar():
@ -92,7 +130,7 @@ def test_lines_similar():
# Just an example! # Just an example!
min_len = min(len(self._string), len(other._string)) min_len = min(len(self._string), len(other._string))
if min_len > 0: if min_len > 0:
normalized_distance = distance(self._string, other._string)/min_len normalized_distance = distance(self._string, other._string) / min_len
similar = normalized_distance < 0.1 similar = normalized_distance < 0.1
else: else:
similar = False similar = False
@ -102,18 +140,39 @@ def test_lines_similar():
return not self.__eq__(other) return not self.__eq__(other)
def __repr__(self): def __repr__(self):
return 'SimilarString(\'%s\')' % self._string return "SimilarString('%s')" % self._string
def __hash__(self): def __hash__(self):
return hash(self._string) return hash(self._string)
-    result = list(seq_align(
-        [SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')],
-        [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
-    ))
+    result = list(
+        seq_align(
+            [
+                SimilarString("This is a line."),
+                SimilarString("This is another"),
+                SimilarString("And the last line"),
+            ],
+            [
+                SimilarString("This is a ljne."),
+                SimilarString("This is another"),
+                SimilarString("J u n k"),
+                SimilarString("And the last line"),
+            ],
+        )
+    )
left, right = unzip(result) left, right = unzip(result)
-    assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')]
-    assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
+    assert list(left) == [
+        SimilarString("This is a line."),
+        SimilarString("This is another"),
+        None,
+        SimilarString("And the last line"),
+    ]
+    assert list(right) == [
+        SimilarString("This is a ljne."),
+        SimilarString("This is another"),
+        SimilarString("J u n k"),
+        SimilarString("And the last line"),
+    ]
# Test __eq__ (i.e. is it a substitution or a similar string?) # Test __eq__ (i.e. is it a substitution or a similar string?)
assert list(left)[0] == list(right)[0] assert list(left)[0] == list(right)[0]

@ -7,31 +7,35 @@ from .. import character_error_rate
def test_character_error_rate(): def test_character_error_rate():
assert character_error_rate('a', 'a') == 0 assert character_error_rate("a", "a") == 0
assert character_error_rate('a', 'b') == 1/1 assert character_error_rate("a", "b") == 1 / 1
assert character_error_rate('Foo', 'Bar') == 3/3 assert character_error_rate("Foo", "Bar") == 3 / 3
assert character_error_rate('Foo', '') == 3/3 assert character_error_rate("Foo", "") == 3 / 3
assert character_error_rate('', '') == 0 assert character_error_rate("", "") == 0
assert math.isinf(character_error_rate('', 'Foo')) assert math.isinf(character_error_rate("", "Foo"))
assert character_error_rate('Foo', 'Food') == 1/3 assert character_error_rate("Foo", "Food") == 1 / 3
assert character_error_rate('Fnord', 'Food') == 2/5 assert character_error_rate("Fnord", "Food") == 2 / 5
assert character_error_rate('Müll', 'Mull') == 1/4 assert character_error_rate("Müll", "Mull") == 1 / 4
assert character_error_rate('Abstand', 'Sand') == 4/7 assert character_error_rate("Abstand", "Sand") == 4 / 7
def test_character_error_rate_hard(): def test_character_error_rate_hard():
s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.') s1 = unicodedata.normalize("NFC", "Schlyñ lorem ipsum.")
s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!') # Different, decomposed! s2 = unicodedata.normalize("NFD", "Schlyñ lorem ipsum!") # Different, decomposed!
assert character_error_rate(s1, s2) == 1/19 assert character_error_rate(s1, s2) == 1 / 19
s1 = 'Schlyñ' s1 = "Schlyñ"
-    assert len(s1) == 6  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    s2 = 'Schlym̃'
-    assert len(s2) == 7  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+    assert (
+        len(s1) == 6
+    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+    s2 = "Schlym̃"
+    assert (
+        len(s2) == 7
+    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
# Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical. # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
assert character_error_rate(s2, s1) == 1/6 assert character_error_rate(s2, s1) == 1 / 6
assert character_error_rate(s1, s2) == 1/6 assert character_error_rate(s1, s2) == 1 / 6

@ -6,35 +6,39 @@ from .. import levenshtein, distance
def test_levenshtein(): def test_levenshtein():
assert levenshtein('a', 'a') == 0 assert levenshtein("a", "a") == 0
assert levenshtein('a', 'b') == 1 assert levenshtein("a", "b") == 1
assert levenshtein('Foo', 'Bar') == 3 assert levenshtein("Foo", "Bar") == 3
assert levenshtein('', '') == 0 assert levenshtein("", "") == 0
assert levenshtein('Foo', '') == 3 assert levenshtein("Foo", "") == 3
assert levenshtein('', 'Foo') == 3 assert levenshtein("", "Foo") == 3
assert levenshtein('Foo', 'Food') == 1 assert levenshtein("Foo", "Food") == 1
assert levenshtein('Fnord', 'Food') == 2 assert levenshtein("Fnord", "Food") == 2
assert levenshtein('Müll', 'Mull') == 1 assert levenshtein("Müll", "Mull") == 1
assert levenshtein('Abstand', 'Sand') == 4 assert levenshtein("Abstand", "Sand") == 4
def test_levenshtein_other_sequences(): def test_levenshtein_other_sequences():
assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1 assert levenshtein(["a", "ab"], ["a", "ab", "c"]) == 1
assert levenshtein(['a', 'ab'], ['a', 'c']) == 1 assert levenshtein(["a", "ab"], ["a", "c"]) == 1
def test_distance(): def test_distance():
assert distance('Fnord', 'Food') == 2 assert distance("Fnord", "Food") == 2
assert distance('Müll', 'Mull') == 1 assert distance("Müll", "Mull") == 1
word1 = unicodedata.normalize('NFC', 'Schlyñ') word1 = unicodedata.normalize("NFC", "Schlyñ")
word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed! word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
assert distance(word1, word2) == 0 assert distance(word1, word2) == 0
word1 = 'Schlyñ' word1 = "Schlyñ"
-    assert len(word1) == 6  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    word2 = 'Schlym̃'
-    assert len(word2) == 7  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+    assert (
+        len(word1) == 6
+    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+    word2 = "Schlym̃"
+    assert (
+        len(word2) == 7
+    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
assert distance(word1, word2) == 1 assert distance(word1, word2) == 1

@ -4,45 +4,60 @@ from .. import seq_editops, editops
def test_trivial(): def test_trivial():
assert seq_editops('abc', 'abc') == [] assert seq_editops("abc", "abc") == []
assert seq_editops('', '') == [] assert seq_editops("", "") == []
def test_insert(): def test_insert():
assert seq_editops('bc', 'abc') == [('insert', 0, 0)] assert seq_editops("bc", "abc") == [("insert", 0, 0)]
assert seq_editops('ac', 'abc') == [('insert', 1, 1)] assert seq_editops("ac", "abc") == [("insert", 1, 1)]
assert seq_editops('ab', 'abc') == [('insert', 2, 2)] assert seq_editops("ab", "abc") == [("insert", 2, 2)]
assert seq_editops('', 'a') == [('insert', 0, 0)] assert seq_editops("", "a") == [("insert", 0, 0)]
def test_multiple(): def test_multiple():
assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)] assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
def test_delete(): def test_delete():
assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)] assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)] assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)] assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)] assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops('Foo', '') == [('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0)]
-    assert seq_editops('Foolish', 'Foo') == [('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3)]
+    assert seq_editops("Foo", "") == [
+        ("delete", 0, 0),
+        ("delete", 1, 0),
+        ("delete", 2, 0),
+    ]
+    assert seq_editops("Foolish", "Foo") == [
+        ("delete", 3, 3),
+        ("delete", 4, 3),
+        ("delete", 5, 3),
+        ("delete", 6, 3),
+    ]
def test_ambiguous(): def test_ambiguous():
-    assert seq_editops('bcd', 'abcef') == [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)]
+    assert seq_editops("bcd", "abcef") == [
+        ("insert", 0, 0),
+        ("replace", 2, 3),
+        ("insert", 3, 4),
+    ]
def test_editops(): def test_editops():
"""Test editops() in cases where dealing with grapheme clusters matters""" """Test editops() in cases where dealing with grapheme clusters matters"""
# In these cases, one of the words has a composed form, the other one does not. # In these cases, one of the words has a composed form, the other one does not.
assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)] assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
assert editops('oͤde', 'öde') == [('replace', 0, 0)] assert editops("oͤde", "öde") == [("replace", 0, 0)]
def test_editops_canonically_equivalent(): def test_editops_canonically_equivalent():
-    left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE')
-    right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')
+    left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
+        "COMBINING TILDE"
+    )
+    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
assert left != right assert left != right
assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right) assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
assert editops(left, right) == [] assert editops(left, right) == []

@ -7,7 +7,7 @@ from lxml import etree as ET
from .. import align, page_text from .. import align, page_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration @pytest.mark.integration
@ -17,8 +17,8 @@ def test_align_page_files():
# (currently) not counted due to normalization. # (currently) not counted due to normalization.
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
result = list(align(gt, ocr)) result = list(align(gt, ocr))
for left, right in result: for left, right in result:

@ -8,26 +8,34 @@ from uniseg.graphemecluster import grapheme_clusters
from .. import character_error_rate, page_text, alto_text from .. import character_error_rate, page_text, alto_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration @pytest.mark.integration
def test_character_error_rate_between_page_files(): def test_character_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# The fi ligature does not count. # The fi ligature does not count.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
gt_len = len(list(grapheme_clusters(gt))) gt_len = len(list(grapheme_clusters(gt)))
expected_cer = 2/gt_len expected_cer = 2 / gt_len
assert character_error_rate(gt, ocr) == expected_cer assert character_error_rate(gt, ocr) == expected_cer
@pytest.mark.integration @pytest.mark.integration
def test_character_error_rate_between_page_alto(): def test_character_error_rate_between_page_alto():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) gt = page_text(
ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
)
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
)
)
)
assert gt == ocr assert gt == ocr
assert character_error_rate(gt, ocr) == 0 assert character_error_rate(gt, ocr) == 0
@ -35,7 +43,17 @@ def test_character_error_rate_between_page_alto():
@pytest.mark.integration @pytest.mark.integration
def test_character_error_rate_between_page_alto_2(): def test_character_error_rate_between_page_alto_2():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) gt = page_text(
ET.parse(
os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
)
)
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
)
)
)
assert character_error_rate(gt, ocr) == 8/591 # Manually verified assert character_error_rate(gt, ocr) == 8 / 591  # Manually verified

@ -10,31 +10,31 @@ def test_cli_json(tmp_path):
"""Test that the cli/process() yields a loadable JSON report""" """Test that the cli/process() yields a loadable JSON report"""
with working_directory(str(tmp_path)): with working_directory(str(tmp_path)):
with open('gt.txt', 'w') as gtf: with open("gt.txt", "w") as gtf:
gtf.write('AAAAA') gtf.write("AAAAA")
with open('ocr.txt', 'w') as ocrf: with open("ocr.txt", "w") as ocrf:
ocrf.write('AAAAB') ocrf.write("AAAAB")
with open('gt.txt', 'r') as gtf: with open("gt.txt", "r") as gtf:
print(gtf.read()) print(gtf.read())
process('gt.txt', 'ocr.txt', 'report') process("gt.txt", "ocr.txt", "report")
with open('report.json', 'r') as jsonf: with open("report.json", "r") as jsonf:
print(jsonf.read()) print(jsonf.read())
with open('report.json', 'r') as jsonf: with open("report.json", "r") as jsonf:
j = json.load(jsonf) j = json.load(jsonf)
assert j['cer'] == pytest.approx(0.2) assert j["cer"] == pytest.approx(0.2)
def test_cli_json_cer_is_infinity(tmp_path): def test_cli_json_cer_is_infinity(tmp_path):
"""Test that the cli/process() yields a loadable JSON report when CER == inf""" """Test that the cli/process() yields a loadable JSON report when CER == inf"""
with working_directory(str(tmp_path)): with working_directory(str(tmp_path)):
with open('gt.txt', 'w') as gtf: with open("gt.txt", "w") as gtf:
gtf.write('') # Empty to yield CER == inf gtf.write("") # Empty to yield CER == inf
with open('ocr.txt', 'w') as ocrf: with open("ocr.txt", "w") as ocrf:
ocrf.write('Not important') ocrf.write("Not important")
process('gt.txt', 'ocr.txt', 'report') process("gt.txt", "ocr.txt", "report")
with open('report.json', 'r') as jsonf: with open("report.json", "r") as jsonf:
j = json.load(jsonf) j = json.load(jsonf)
assert j['cer'] == pytest.approx(float('inf')) assert j["cer"] == pytest.approx(float("inf"))
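The expected values follow from the CER definition used throughout this diff: edit distance divided by the number of grapheme clusters in the reference, with an infinite rate for an empty reference. A hedged sketch, assuming the package's top-level exports (the tests elsewhere import character_error_rate via "from .. import ..."):

from qurator.dinglehopper import character_error_rate

assert character_error_rate("AAAAA", "AAAAB") == 1 / 5             # one substitution over five graphemes, i.e. 0.2
assert character_error_rate("", "Not important") == float("inf")   # empty reference yields inf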

@ -7,7 +7,7 @@ from lxml import etree as ET
from .. import distance, page_text, alto_text from .. import distance, page_text, alto_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration @pytest.mark.integration
@ -15,15 +15,23 @@ def test_distance_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# Due to normalization, we don't count the ligature. # Due to normalization, we don't count the ligature.
# → 2 differences # → 2 differences
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
assert distance(gt, ocr) == 2 assert distance(gt, ocr) == 2
@pytest.mark.integration @pytest.mark.integration
def test_distance_between_page_alto(): def test_distance_between_page_alto():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) gt = page_text(
ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
)
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
)
)
)
assert gt == ocr assert gt == ocr
assert distance(gt, ocr) == 0 assert distance(gt, ocr) == 0
@ -31,7 +39,17 @@ def test_distance_between_page_alto():
@pytest.mark.integration @pytest.mark.integration
def test_distance_between_page_alto_2(): def test_distance_between_page_alto_2():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) gt = page_text(
ET.parse(
os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
)
)
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
)
)
)
assert distance(gt, ocr) == 8 # Manually verified assert distance(gt, ocr) == 8 # Manually verified

@ -10,27 +10,32 @@ from .util import working_directory
from ..ocrd_cli import ocrd_dinglehopper from ..ocrd_cli import ocrd_dinglehopper
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def test_ocrd_cli(tmp_path): def test_ocrd_cli(tmp_path):
"""Test OCR-D interface""" """Test OCR-D interface"""
# Copy test workspace # Copy test workspace
test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162' test_workspace_dir_source = Path(data_dir) / "actevedef_718448162"
test_workspace_dir = tmp_path / 'test_ocrd_cli' test_workspace_dir = tmp_path / "test_ocrd_cli"
shutil.copytree(str(test_workspace_dir_source), str(test_workspace_dir)) shutil.copytree(str(test_workspace_dir_source), str(test_workspace_dir))
# Run through the OCR-D interface # Run through the OCR-D interface
with working_directory(str(test_workspace_dir)): with working_directory(str(test_workspace_dir)):
runner = CliRunner() runner = CliRunner()
args = [ args = [
'-m', 'mets.xml', "-m",
"mets.xml",
'-I', 'OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI', "-I",
"OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI",
'-O', 'OCR-D-OCR-CALAMARI-EVAL' "-O",
"OCR-D-OCR-CALAMARI-EVAL",
] ]
sys.argv[1:] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments sys.argv[
1:
] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
result = runner.invoke(ocrd_dinglehopper, args) result = runner.invoke(ocrd_dinglehopper, args)
assert result.exit_code == 0 assert result.exit_code == 0
result_json = list((test_workspace_dir / 'OCR-D-OCR-CALAMARI-EVAL').glob('*.json')) result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json"))
assert json.load(open(str(result_json[0])))['cer'] < 0.03 assert json.load(open(str(result_json[0])))["cer"] < 0.03

@ -7,26 +7,36 @@ from lxml import etree as ET
from .. import word_error_rate, words, page_text, alto_text from .. import word_error_rate, words, page_text, alto_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration @pytest.mark.integration
def test_word_error_rate_between_page_files(): def test_word_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words, # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
# the ligature does not count → 2 errors # the ligature does not count → 2 errors
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line gt_word_count = (
7 + 6 + 5 + 8 + 7 + 6 + 7 + 8 + 6 + 7 + 7 + 5 + 6 + 8 + 8 + 7 + 7 + 6 + 5 + 4
) # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count assert len(list(words(gt))) == gt_word_count
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
assert word_error_rate(gt, ocr) == 2/gt_word_count assert word_error_rate(gt, ocr) == 2 / gt_word_count
@pytest.mark.integration @pytest.mark.integration
def test_word_error_rate_between_page_alto(): def test_word_error_rate_between_page_alto():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) gt = page_text(
ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
)
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
)
)
)
assert gt == ocr assert gt == ocr
assert word_error_rate(gt, ocr) == 0 assert word_error_rate(gt, ocr) == 0
@ -34,11 +44,25 @@ def test_word_error_rate_between_page_alto():
@pytest.mark.integration @pytest.mark.integration
def test_word_error_rate_between_page_alto_2(): def test_word_error_rate_between_page_alto_2():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) gt = page_text(
ET.parse(
os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
)
)
gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line gt_word_count = (
14 + 18 + 17 + 14 + 17 + 17 + 3
) # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count assert len(list(words(gt))) == gt_word_count
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
)
)
)
assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors) assert (
word_error_rate(gt, ocr) == 7 / gt_word_count
) # Manually verified, 6 words are wrong, 1 got split (=2 errors)

@ -9,46 +9,54 @@ import pytest
from .util import working_directory from .util import working_directory
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def test_alto_namespace(): def test_alto_namespace():
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml')) tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
assert alto_namespace(tree) == 'http://www.loc.gov/standards/alto/ns-v3#' assert alto_namespace(tree) == "http://www.loc.gov/standards/alto/ns-v3#"
def test_alto_text(): def test_alto_text():
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml')) tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
result = alto_text(tree) result = alto_text(tree)
expected = textwrap.dedent("""\ expected = textwrap.dedent(
"""\
über die vielen Sorgen wegen deſſelben vergaß über die vielen Sorgen wegen deſſelben vergaß
Hartkopf, der Frau Amtmännin das ver- Hartkopf, der Frau Amtmännin das ver-
ſprochene zu überliefern.""") ſprochene zu überliefern."""
)
assert result == expected assert result == expected
def test_alto_text_ALTO1(): def test_alto_text_ALTO1():
tree = ET.parse(os.path.join(data_dir, 'test.alto1.xml')) tree = ET.parse(os.path.join(data_dir, "test.alto1.xml"))
assert "being erected at the Broadway stock" in alto_text(tree) assert "being erected at the Broadway stock" in alto_text(tree)
def test_alto_text_ALTO2(): def test_alto_text_ALTO2():
tree = ET.parse(os.path.join(data_dir, 'test.alto2.xml')) tree = ET.parse(os.path.join(data_dir, "test.alto2.xml"))
assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in alto_text(tree) assert (
"Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden"
in alto_text(tree)
)
def test_alto_text_ALTO3(): def test_alto_text_ALTO3():
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml')) tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree) assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree)
def test_page_namespace(): def test_page_namespace():
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
assert page_namespace(tree) == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15' assert (
page_namespace(tree)
== "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
)
def test_page_test(): def test_page_test():
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
result = page_text(tree) result = page_text(tree)
# We are currently normalizing on extraction, so the text is normalized. # We are currently normalizing on extraction, so the text is normalized.
@ -74,7 +82,8 @@ def test_page_test():
# Jndeß mangelten do einige Generalia, die # Jndeß mangelten do einige Generalia, die
# alſo wegfielen. — Hartkopf gieng ſelb # alſo wegfielen. — Hartkopf gieng ſelb
# mit und berbrate es. —""") # mit und berbrate es. —""")
expected = textwrap.dedent("""\ expected = textwrap.dedent(
"""\
über die vielen Sorgen wegen deſſelben vergaß über die vielen Sorgen wegen deſſelben vergaß
Hartkopf, der Frau Amtmännin das ver- Hartkopf, der Frau Amtmännin das ver-
ſprochene zu überliefern. Ein Erpreſſer ſprochene zu überliefern. Ein Erpreſſer
@ -94,7 +103,8 @@ def test_page_test():
ſie das, was da wäre, herbeyſchaffen möchte. ſie das, was da wäre, herbeyſchaffen möchte.
Jndeß mangelten doch einige Generalia, die Jndeß mangelten doch einige Generalia, die
alſo wegfielen. Hartkopf gieng ſelbſt alſo wegfielen. Hartkopf gieng ſelbſt
mit und überbrachte es. """) mit und überbrachte es. """
)
assert result == expected assert result == expected
@ -107,56 +117,69 @@ def test_page_with_empty_region():
# <Unicode></Unicode> # <Unicode></Unicode>
# </TextEquiv> # </TextEquiv>
# </TextRegion> # </TextRegion>
tree = ET.parse(os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml')) tree = ET.parse(
os.path.join(data_dir, "brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml")
)
result = page_text(tree) result = page_text(tree)
assert result assert result
def test_page_order(): def test_page_order():
# This file contains TextRegions where file order is not the same as reading order. # This file contains TextRegions where file order is not the same as reading order.
tree = ET.parse(os.path.join(data_dir, 'order.page.xml')) tree = ET.parse(os.path.join(data_dir, "order.page.xml"))
result = page_text(tree) result = page_text(tree)
print(result) print(result)
assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL) assert re.search(
r"Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die",
result,
re.DOTALL,
)
def test_page_mixed_regions(): def test_page_mixed_regions():
# This file contains ImageRegions and TextRegions in the ReadingOrder # This file contains ImageRegions and TextRegions in the ReadingOrder
tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml')) tree = ET.parse(os.path.join(data_dir, "mixed-regions.page.xml"))
result = page_text(tree) result = page_text(tree)
assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result assert "non exaudiam uos. Chriſtiani uero quia orant iuxta" in result
def test_page_level(): def test_page_level():
# This file contains inconsistent TextRegion and TextLine texts # This file contains inconsistent TextRegion and TextLine texts
# TextRegion # TextRegion
tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml')) tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
result = page_text(tree) result = page_text(tree)
assert result == 'Inconsistent dummy region text' assert result == "Inconsistent dummy region text"
tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml')) tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
result = page_text(tree, textequiv_level='region') result = page_text(tree, textequiv_level="region")
assert result == 'Inconsistent dummy region text' assert result == "Inconsistent dummy region text"
# TextLine # TextLine
tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml')) tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
result = page_text(tree, textequiv_level='line') result = page_text(tree, textequiv_level="line")
assert result == 'Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-' assert (
result
== "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
)
def test_text(): def test_text():
assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) assert "being erected at the Broadway stock" in text(
assert "wieder ein. Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) os.path.join(data_dir, "test.alto1.xml")
assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt')) )
assert "wieder ein. Er langte den Zettel aus dem" in text(
os.path.join(data_dir, "test.page2018.xml")
)
assert "Lorem ipsum" in text(os.path.join(data_dir, "test.txt"))
def test_plain(tmp_path): def test_plain(tmp_path):
with working_directory(str(tmp_path)): with working_directory(str(tmp_path)):
with open('ocr.txt', 'w') as ocrf: with open("ocr.txt", "w") as ocrf:
ocrf.write('AAAAB') ocrf.write("AAAAB")
result = plain_text('ocr.txt') result = plain_text("ocr.txt")
expected = 'AAAAB' expected = "AAAAB"
assert result == expected assert result == expected

@ -6,32 +6,81 @@ from .. import word_error_rate, words
def test_words(): def test_words():
result = list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?')) result = list(
words("Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?")
)
expected = ['Der', 'schnelle', 'braune', 'Fuchs', 'kann', 'keine', '3,14', 'Meter', 'springen', 'oder'] expected = [
"Der",
"schnelle",
"braune",
"Fuchs",
"kann",
"keine",
"3,14",
"Meter",
"springen",
"oder",
]
assert result == expected assert result == expected
def test_words_private_use_area(): def test_words_private_use_area():
result = list(words(
'ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n'
'ſproene zu berliefern.'))
result = list(
words(
"ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
"ſproene zu berliefern."
)
)
expected = [
'ber', 'die', 'vielen', 'Sorgen', 'wegen', 'deelben', 'vergaß', 'Hartkopf',
'der', 'Frau', 'Amtmnnin', 'das', 'ver',
'ſproene', 'zu', 'berliefern']
expected = [
"ber",
"die",
"vielen",
"Sorgen",
"wegen",
"deelben",
"vergaß",
"Hartkopf",
"der",
"Frau",
"Amtmnnin",
"das",
"ver",
"ſproene",
"zu",
"berliefern",
]
assert result == expected assert result == expected
def test_word_error_rate(): def test_word_error_rate():
assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0 assert (
word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsatz!") == 0
)
assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0 assert (
word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz!")
== 0
)
assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz.') == 0 assert (
word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz.")
== 0
)
assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsarz:') == 1/4 assert (
word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsarz:")
== 1 / 4
)
assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ein ist Beispielsatz!') == 2/4 assert (
word_error_rate("Dies ist ein Beispielsatz!", "Dies ein ist Beispielsatz!")
== 2 / 4
)
assert word_error_rate('Dies ist ein Beispielsatz!', '') == 4/4 assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
assert math.isinf(word_error_rate('', 'Dies ist ein Beispielsatz!')) assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
assert word_error_rate('', '') == 0 assert word_error_rate("", "") == 0
assert word_error_rate('Schlyñ lorem ipsum dolor sit amet,', 'Schlym̃ lorem ipsum dolor sit amet.') == 1/6 assert (
word_error_rate(
"Schlyñ lorem ipsum dolor sit amet,", "Schlym̃ lorem ipsum dolor sit amet."
)
== 1 / 6
)
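Aside (not part of the diff): the punctuation-only cases above come out as 0 because words() drops punctuation before comparison, as the test_words expectations show. A small hedged sketch, assuming the exported words() helper:

from qurator.dinglehopper import words

assert list(words("Dies. ist ein Beispielsatz!")) == ["Dies", "ist", "ein", "Beispielsatz"]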

@ -27,6 +27,7 @@ def unzip(an_iterable_of_tuples):
class working_directory: class working_directory:
"""Context manager to temporarily change the working directory""" """Context manager to temporarily change the working directory"""
def __init__(self, wd): def __init__(self, wd):
self.wd = wd self.wd = wd

@ -20,9 +20,10 @@ def words(s: str):
def new_word_break(c, index=0): def new_word_break(c, index=0):
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
return 'ALetter' return "ALetter"
else: else:
return old_word_break(c, index) return old_word_break(c, index)
uniseg.wordbreak.word_break = new_word_break uniseg.wordbreak.word_break = new_word_break
# Check if c is an unwanted character, i.e. whitespace, punctuation, or similar # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
@ -30,8 +31,8 @@ def words(s: str):
# See https://www.fileformat.info/info/unicode/category/index.htm # See https://www.fileformat.info/info/unicode/category/index.htm
# and https://unicodebook.readthedocs.io/unicode.html#categories # and https://unicodebook.readthedocs.io/unicode.html#categories
unwanted_categories = 'O', 'M', 'P', 'Z', 'S' unwanted_categories = "O", "M", "P", "Z", "S"
unwanted_subcategories = 'Cc', 'Cf' unwanted_subcategories = "Cc", "Cf"
subcat = unicodedata.category(c) subcat = unicodedata.category(c)
cat = subcat[0] cat = subcat[0]
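For context on this hunk (its behaviour is unchanged by the reformat): word candidates are filtered by Unicode category, while Private Use Area characters are forced to count as letters. A standalone illustration of those categories, standard library only:

import unicodedata

assert unicodedata.category("A") == "Lu"       # letter: kept as part of a word
assert unicodedata.category(",") == "Po"       # punctuation: unwanted (major category P)
assert unicodedata.category(" ") == "Zs"       # whitespace: unwanted (major category Z)
assert unicodedata.category("\n") == "Cc"      # control: unwanted subcategory Cc
assert unicodedata.category("\ue000") == "Co"  # Private Use Area: not filtered; the patched word_break returns "ALetter"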
@ -53,7 +54,7 @@ def words(s: ExtractedText):
@multimethod @multimethod
def words_normalized(s: str): def words_normalized(s: str):
return words(unicodedata.normalize('NFC', s)) return words(unicodedata.normalize("NFC", s))
@multimethod @multimethod
@ -69,7 +70,9 @@ def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
@multimethod @multimethod
def word_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]: def word_error_rate_n(
reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
return word_error_rate_n(reference.text, compared.text) return word_error_rate_n(reference.text, compared.text)
@ -84,7 +87,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, i
if d == 0: if d == 0:
return 0, n return 0, n
if n == 0: if n == 0:
return float('inf'), n return float("inf"), n
return d / n, n return d / n, n

@ -1,29 +1,29 @@
from io import open from io import open
from setuptools import find_packages, setup from setuptools import find_packages, setup
with open('requirements.txt') as fp: with open("requirements.txt") as fp:
install_requires = fp.read() install_requires = fp.read()
setup( setup(
name='dinglehopper', name="dinglehopper",
author='Mike Gerber, The QURATOR SPK Team', author="Mike Gerber, The QURATOR SPK Team",
author_email='mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de', author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de",
description='The OCR evaluation tool', description="The OCR evaluation tool",
long_description=open('README.md', 'r', encoding='utf-8').read(), long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type='text/markdown', long_description_content_type="text/markdown",
keywords='qurator ocr', keywords="qurator ocr",
license='Apache', license="Apache",
namespace_packages=['qurator'], namespace_packages=["qurator"],
packages=find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests']), packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
install_requires=install_requires, install_requires=install_requires,
package_data={ package_data={
'': ['*.json', 'templates/*'], "": ["*.json", "templates/*"],
}, },
entry_points={ entry_points={
'console_scripts': [ "console_scripts": [
'dinglehopper=qurator.dinglehopper.cli:main', "dinglehopper=qurator.dinglehopper.cli:main",
'dinglehopper-extract=qurator.dinglehopper.cli_extract:main', "dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper', "ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
] ]
} },
) )
