🎨 dinglehopper: Reformat using black

Branch: pull/46/head
Author: Gerber, Mike (4 years ago)
Parent commit: 31c63f9e4c
Commit: 14421c8e53

@ -1,2 +1 @@
__import__('pkg_resources').declare_namespace(__name__)
__import__("pkg_resources").declare_namespace(__name__)

@ -3,8 +3,8 @@ from .edit_distance import *
def align(t1, t2):
"""Align text."""
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1)))
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2)))
s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
s2 = list(grapheme_clusters(unicodedata.normalize("NFC", t2)))
return seq_align(s1, s2)
@ -27,13 +27,13 @@ def seq_align(s1, s2):
pass
if o:
if o[0] == 'insert':
if o[0] == "insert":
yield None, s2[j]
j += 1
elif o[0] == 'delete':
elif o[0] == "delete":
yield s1[i], None
i += 1
elif o[0] == 'replace':
elif o[0] == "replace":
yield s1[i], s2[j]
i += 1
j += 1

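The align()/seq_align() pair above is the alignment core of the module; as a quick orientation, here is a minimal usage sketch (import path assumed, expected tuples mirror the alignment tests further down in this commit):

from dinglehopper import align  # assumed import path

list(align("Foo", "Food"))
# → [("F", "F"), ("o", "o"), ("o", "o"), (None, "d")]
list(align("", "foo"))
# → [(None, "f"), (None, "o"), (None, "o")]
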
@ -19,19 +19,21 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
"""
d = distance(reference, compared)
n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
if d == 0:
return 0, n
if n == 0:
return float('inf'), n
return d/n, n
return float("inf"), n
return d / n, n
# XXX Should we really count newlines here?
@multimethod
def character_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
def character_error_rate_n(
reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
return character_error_rate_n(reference.text, compared.text)

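For orientation, a minimal usage sketch of character_error_rate_n() as defined above (import path assumed; the values follow from the distance/grapheme-cluster logic shown here and match the CER tests later in this commit):

from dinglehopper import character_error_rate_n  # assumed import path

character_error_rate_n("Foo", "Food")  # distance 1, 3 grapheme clusters → (0.333…, 3)
character_error_rate_n("a", "a")       # identical strings → (0, 1)
character_error_rate_n("", "Foo")      # empty reference → (inf, 0)
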
@ -12,16 +12,17 @@ from .extracted_text import ExtractedText
from .ocr_files import extract
from .config import Config
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
gtx = ''
ocrx = ''
gtx = ""
ocrx = ""
def format_thing(t, css_classes=None, id_=None):
if t is None:
html_t = none
css_classes += ' ellipsis'
elif t == '\n':
html_t = '<br>'
css_classes += " ellipsis"
elif t == "\n":
html_t = "<br>"
else:
html_t = escape(t)
@ -32,9 +33,13 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
if css_classes:
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
css_classes=css_classes,
html_t=html_t,
html_custom_attrs=html_custom_attrs,
)
else:
return '{html_t}'.format(html_t=html_t)
return "{html_t}".format(html_t=html_t)
if isinstance(gt_in, ExtractedText):
if not isinstance(ocr_in, ExtractedText):
@ -46,8 +51,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
gt_things = gt_in
ocr_things = ocr_in
g_pos = 0
o_pos = 0
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
@ -55,7 +58,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
gt_id = None
ocr_id = None
if g != o:
css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
css_classes = "{css_prefix}diff{k} diff".format(css_prefix=css_prefix, k=k)
if isinstance(gt_in, ExtractedText):
gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
@ -70,17 +73,17 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
if o is not None:
o_pos += len(o)
return \
'''
return """
<div class="row">
<div class="col-md-6 gt">{}</div>
<div class="col-md-6 ocr">{}</div>
</div>
'''.format(gtx, ocrx)
""".format(
gtx, ocrx
)
def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
"""Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
@ -93,36 +96,47 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text)
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
char_diff_report = gen_diff_report(
gt_text, ocr_text, css_prefix="c", joiner="", none="·"
)
gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text)
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='')
word_diff_report = gen_diff_report(
gt_words, ocr_words, css_prefix="w", joiner=" ", none=""
)
def json_float(value):
"""Convert a float value to an JSON float.
This is here so that float('inf') yields "Infinity", not "inf".
"""
if value == float('inf'):
return 'Infinity'
elif value == float('-inf'):
return '-Infinity'
if value == float("inf"):
return "Infinity"
elif value == float("-inf"):
return "-Infinity"
else:
return str(value)
env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
env.filters['json_float'] = json_float
env = Environment(
loader=FileSystemLoader(
os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
)
)
env.filters["json_float"] = json_float
for report_suffix in ('.html', '.json'):
template_fn = 'report' + report_suffix + '.j2'
for report_suffix in (".html", ".json"):
template_fn = "report" + report_suffix + ".j2"
out_fn = report_prefix + report_suffix
template = env.get_template(template_fn)
template.stream(
gt=gt, ocr=ocr,
cer=cer, n_characters=n_characters,
wer=wer, n_words=n_words,
gt=gt,
ocr=ocr,
cer=cer,
n_characters=n_characters,
wer=wer,
n_words=n_words,
char_diff_report=char_diff_report,
word_diff_report=word_diff_report,
metrics=metrics,
@ -130,12 +144,19 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
@click.command()
@click.argument('gt', type=click.Path(exists=True))
@click.argument('ocr', type=click.Path(exists=True))
@click.argument('report_prefix', type=click.Path(), default='report')
@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
@click.option('--progress', default=False, is_flag=True, help='Show progress bar')
@click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
@click.option(
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
)
@click.option(
"--textequiv-level",
default="region",
help="PAGE TextEquiv level to extract text from",
metavar="LEVEL",
)
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
"""
Compare the PAGE/ALTO/text document GT against the document OCR.
@ -159,5 +180,5 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
if __name__ == '__main__':
if __name__ == "__main__":
main()

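The nested json_float() helper above exists so that an infinite CER still serializes to valid JSON; a small sketch of its behaviour, shown here as if it were a standalone function (values follow directly from the code above):

json_float(float("inf"))   # → "Infinity"
json_float(float("-inf"))  # → "-Infinity"
json_float(0.2)            # → "0.2"
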
@ -7,8 +7,13 @@ from .ocr_files import extract
@click.command()
@click.argument('input_file', type=click.Path(exists=True))
@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
@click.argument("input_file", type=click.Path(exists=True))
@click.option(
"--textequiv-level",
default="region",
help="PAGE TextEquiv level to extract text from",
metavar="LEVEL",
)
def main(input_file, textequiv_level):
"""
Extract the text of the given INPUT_FILE.
@ -23,5 +28,5 @@ def main(input_file, textequiv_level):
print(input_text)
if __name__ == '__main__':
if __name__ == "__main__":
main()

@ -48,9 +48,10 @@ def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
for i in tqdm(from_to(1, m), disable=not Config.progress):
for j in from_to(1, n):
D[i, j] = min(
D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution
D[i - 1, j - 1]
+ 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution
D[i, j - 1] + 1, # Insertion
D[i - 1, j] + 1 # Deletion
D[i - 1, j] + 1, # Deletion
)
return D
@ -81,8 +82,8 @@ def distance(s1: str, s2: str):
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
clusters. This should be the correct way to compare two Unicode strings.
"""
seq1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
seq2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
return levenshtein(seq1, seq2)
@ -106,11 +107,17 @@ def seq_editops(seq1, seq2):
def _tail_backtrace(i, j, accumulator):
if i > 0 and D[i - 1, j] + 1 == D[i, j]:
return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)
return partial(
_tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
)
if j > 0 and D[i, j - 1] + 1 == D[i, j]:
return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)
return partial(
_tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
)
if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)
return partial(
_tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
)
if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP
return accumulator
@ -132,6 +139,6 @@ def editops(word1, word2):
Note that this returns indices to the _grapheme clusters_, not characters!
"""
word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
return seq_editops(word1, word2)

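To make the Unicode handling above concrete: distance() and editops() compare grapheme clusters after NFC normalization, so canonically equivalent strings compare equal. A sketch (import path assumed; expected values match the edit-distance tests further down in this commit):

import unicodedata
from dinglehopper import distance, editops  # assumed import path

nfc = unicodedata.normalize("NFC", "Schlyñ")
nfd = unicodedata.normalize("NFD", "Schlyñ")   # decomposed form of the same word
distance(nfc, nfd)            # → 0 (canonically equivalent)
distance("Schlyñ", "Schlym̃")  # → 1 (one grapheme cluster differs)
editops("Schlyñ", "Schlym̃")   # → [("replace", 5, 5)]
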
@ -10,6 +10,7 @@ import numpy as np
from lxml import etree as ET
from ocrd_utils import getLogger
class Normalization(enum.Enum):
NFC = 1
NFC_MUFI = 2 # TODO
@ -18,7 +19,7 @@ class Normalization(enum.Enum):
def normalize(text, normalization):
if normalization == Normalization.NFC:
return unicodedata.normalize('NFC', text)
return unicodedata.normalize("NFC", text)
if normalization == Normalization.NFC_MUFI:
raise NotImplementedError()
if normalization == Normalization.NFC_SBB:
@ -36,31 +37,31 @@ def unjoin_ligatures(s):
"""Unjoin ligatures, i.e. ff becomes ff."""
equivalences = {
'': 'ſſ',
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
'': 'ch',
'': 'ck',
'': 'll',
'': 'ſi',
'': 'ſt',
'': 'fi',
'': 'ff',
'': 'fl',
'': 'ffi',
'': 'ct',
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
'\uf532': 'as', # eMOP: Latin small ligature as
'\uf533': 'is', # eMOP: Latin small ligature is
'\uf534': 'us', # eMOP: Latin small ligature us
'\uf535': 'Qu', # eMOP: Latin ligature capital Q small u
'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
'\uE8BF': 'q&',
"": "ſſ",
"\ueba7": "ſſi", # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
"": "ch",
"": "ck",
"": "ll",
"": "ſi",
"": "ſt",
"": "fi",
"": "ff",
"": "fl",
"": "ffi",
"": "ct",
"": "tz", # MUFI: LATIN SMALL LIGATURE TZ
"\uf532": "as", # eMOP: Latin small ligature as
"\uf533": "is", # eMOP: Latin small ligature is
"\uf534": "us", # eMOP: Latin small ligature us
"\uf535": "Qu", # eMOP: Latin ligature capital Q small u
"ij": "ij", # U+0133 LATIN SMALL LIGATURE IJ
"\uE8BF": "q&",
# MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
# XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
'': 'st', # U+FB06 LATIN SMALL LIGATURE ST
"\uEBA5": "ſp", # MUFI: LATIN SMALL LIGATURE LONG S P
"": "st", # U+FB06 LATIN SMALL LIGATURE ST
}
s = unicodedata.normalize('NFC', s)
s = unicodedata.normalize("NFC", s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s
@ -70,20 +71,20 @@ def substitute_equivalences(s):
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR
equivalences = {
'': 'ü',
'': 'ä',
'==': '', # → en-dash
'': '', # em-dash → en-dash
'': 'ö',
'': '\'',
'': '-',
'': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
'': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
'': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uF50E': '' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
"": "ü",
"": "ä",
"==": "", # → en-dash
"": "", # em-dash → en-dash
"": "ö",
"": "'",
"": "-",
"": "ä", # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
"": "ö", # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
"": "ü", # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
"\uF50E": "", # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
}
s = unicodedata.normalize('NFC', s)
s = unicodedata.normalize("NFC", s)
s = unjoin_ligatures(s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
@ -115,13 +116,14 @@ class ExtractedText:
Objects of this class are guaranteed to be a. always in their normalization
and b. in NFC.
"""
segment_id = attr.ib(type=Optional[str])
@segment_id.validator
def check(self, _, value):
if value is None:
return
if not re.match(r'[\w\d_-]+', value):
if not re.match(r"[\w\d_-]+", value):
raise ValueError('Malformed segment id "{}"'.format(value))
# An object contains either
@ -141,7 +143,7 @@ class ExtractedText:
def check(self, _, value):
if value is not None and self.segments is not None:
raise ValueError("Can't have both segments and text")
if value is not None and unicodedata.normalize('NFC', value) != value:
if value is not None and unicodedata.normalize("NFC", value) != value:
raise ValueError('String "{}" is not in NFC.'.format(value))
if value is not None and normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value))
@ -169,31 +171,24 @@ class ExtractedText:
seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
segment_id_for_pos.extend(seg_ids)
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
segment_id_for_pos = segment_id_for_pos[: -len(self.joiner)]
# This is frozen, so we have to jump through the hoop:
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
object.__setattr__(self, "_segment_id_for_pos", segment_id_for_pos)
assert self._segment_id_for_pos
return self._segment_id_for_pos[pos]
@classmethod
def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"):
"""Build an ExtractedText from a PAGE content text element"""
localname_for_textequiv_level = {
'region': 'TextRegion',
'line': 'TextLine'
}
localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}
textequiv_level_for_localname = invert_dict(localname_for_textequiv_level)
children_for_localname = {
'TextRegion': 'TextLine'
}
joiner_for_textequiv_level = {
'line': '\n'
}
segment_id = text_segment.attrib['id']
children_for_localname = {"TextRegion": "TextLine"}
joiner_for_textequiv_level = {"line": "\n"}
segment_id = text_segment.attrib["id"]
localname = ET.QName(text_segment).localname
if localname == localname_for_textequiv_level[textequiv_level]:
segment_text = None
@ -201,19 +196,20 @@ class ExtractedText:
segment_text = get_textequiv_unicode(text_segment, nsmap)
# FIXME hardcoded SBB normalization
segment_text = normalize_sbb(segment_text)
segment_text = segment_text or ''
segment_text = segment_text or ""
return cls(segment_id, None, None, segment_text)
else:
# Recurse
sub_localname = children_for_localname[localname]
sub_textequiv_level = textequiv_level_for_localname[sub_localname]
segments = []
for sub_segment in text_segment.iterfind('./page:%s' % sub_localname,
namespaces=nsmap):
for sub_segment in text_segment.iterfind(
"./page:%s" % sub_localname, namespaces=nsmap
):
segments.append(
ExtractedText.from_text_segment(
sub_segment, nsmap,
textequiv_level=sub_textequiv_level)
sub_segment, nsmap, textequiv_level=sub_textequiv_level
)
)
joiner = joiner_for_textequiv_level[sub_textequiv_level]
return cls(segment_id, segments, joiner, None)
@ -231,24 +227,24 @@ def invert_dict(d):
def get_textequiv_unicode(text_segment, nsmap) -> str:
"""Get the TextEquiv/Unicode text of the given PAGE text element."""
segment_id = text_segment.attrib['id']
textequivs = text_segment.findall('./page:TextEquiv', namespaces=nsmap)
segment_id = text_segment.attrib["id"]
textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)
if not textequivs:
return ''
return ""
textequiv = get_first_textequiv(textequivs, segment_id)
return textequiv.find('./page:Unicode', namespaces=nsmap).text or ''
return textequiv.find("./page:Unicode", namespaces=nsmap).text or ""
def get_first_textequiv(textequivs, segment_id):
"""Get the first TextEquiv based on index or conf order if index is not present."""
log = getLogger('processor.OcrdDinglehopperEvaluate')
log = getLogger("processor.OcrdDinglehopperEvaluate")
if len(textequivs) == 1:
return textequivs[0]
# try ordering by index
indices = np.array([get_attr(te, 'index') for te in textequivs], dtype=float)
indices = np.array([get_attr(te, "index") for te in textequivs], dtype=float)
nan_mask = np.isnan(indices)
if np.any(~nan_mask):
if np.any(nan_mask):
@ -256,10 +252,12 @@ def get_first_textequiv(textequivs, segment_id):
index = np.nanargmin(indices)
else:
# try ordering by conf
confidences = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float)
confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
if np.any(~np.isnan(confidences)):
log.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
segment_id)
log.info(
"No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
segment_id,
)
index = np.nanargmax(confidences)
else:
# fallback to first entry in case of neither index or conf present

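A compact usage sketch of the ExtractedText tree handled above, mirroring the test_text case further down in this commit (segment ids and texts are illustrative):

t = ExtractedText(
    None,
    [
        ExtractedText("s0", None, None, "foo"),
        ExtractedText("s1", None, None, "bar"),
        ExtractedText("s2", None, None, "bazinga"),
    ],
    " ",
    None,
)
t.text                   # → "foo bar bazinga"
t.segment_id_for_pos(0)  # → "s0"
t.segment_id_for_pos(3)  # → None (position falls on the joiner)
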
@ -17,24 +17,27 @@ def alto_namespace(tree: ET.ElementTree) -> str:
check if the files uses any valid ALTO namespace.
"""
root_name = ET.QName(tree.getroot().tag)
if root_name.localname == 'alto':
if root_name.localname == "alto":
return root_name.namespace
else:
raise ValueError('Not an ALTO tree')
raise ValueError("Not an ALTO tree")
def alto_extract_lines(tree: ET.ElementTree) -> Generator[ExtractedText, None, None]:
nsmap = {'alto': alto_namespace(tree)}
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap):
line_id = line.attrib.get('ID')
line_text = ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
nsmap = {"alto": alto_namespace(tree)}
for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
line_id = line.attrib.get("ID")
line_text = " ".join(
string.attrib.get("CONTENT")
for string in line.iterfind("alto:String", namespaces=nsmap)
)
yield ExtractedText(line_id, None, None, normalize_sbb(line_text))
# FIXME hardcoded SBB normalization
def alto_extract(tree: ET.ElementTree()) -> ExtractedText:
"""Extract text from the given ALTO ElementTree."""
return ExtractedText(None, list(alto_extract_lines(tree)), '\n', None)
return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None)
def alto_text(tree):
@ -48,56 +51,73 @@ def page_namespace(tree):
do not check if the files uses any valid PAGE namespace.
"""
root_name = ET.QName(tree.getroot().tag)
if root_name.localname == 'PcGts':
if root_name.localname == "PcGts":
return root_name.namespace
else:
raise ValueError('Not a PAGE tree')
raise ValueError("Not a PAGE tree")
def page_extract(tree, *, textequiv_level='region'):
def page_extract(tree, *, textequiv_level="region"):
"""Extract text from the given PAGE content ElementTree."""
# Internally, this is just parsing the Reading Order (if it exists) and
# and leaves reading the TextRegions to ExtractedText.from_text_segment().
nsmap = {'page': page_namespace(tree)}
nsmap = {"page": page_namespace(tree)}
regions = []
reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
reading_order = tree.find(".//page:ReadingOrder", namespaces=nsmap)
if reading_order is not None:
for group in reading_order.iterfind('./*', namespaces=nsmap):
if ET.QName(group.tag).localname == 'OrderedGroup':
region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap)
for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: int(r.attrib['index'])):
region_id = region_ref_indexed.attrib['regionRef']
region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
for group in reading_order.iterfind("./*", namespaces=nsmap):
if ET.QName(group.tag).localname == "OrderedGroup":
region_ref_indexeds = group.findall(
"./page:RegionRefIndexed", namespaces=nsmap
)
for region_ref_indexed in sorted(
region_ref_indexeds, key=lambda r: int(r.attrib["index"])
):
region_id = region_ref_indexed.attrib["regionRef"]
region = tree.find(
'.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap
)
if region is not None:
regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
regions.append(
ExtractedText.from_text_segment(
region, nsmap, textequiv_level=textequiv_level
)
)
else:
pass # Not a TextRegion
else:
raise NotImplementedError
else:
for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
for region in tree.iterfind(".//page:TextRegion", namespaces=nsmap):
regions.append(
ExtractedText.from_text_segment(
region, nsmap, textequiv_level=textequiv_level
)
)
# Filter empty region texts
regions = [r for r in regions if r.text != '']
regions = [r for r in regions if r.text != ""]
return ExtractedText(None, regions, '\n', None)
return ExtractedText(None, regions, "\n", None)
def page_text(tree, *, textequiv_level='region'):
def page_text(tree, *, textequiv_level="region"):
return page_extract(tree, textequiv_level=textequiv_level).text
def plain_extract(filename):
with open(filename, 'r') as f:
with open(filename, "r") as f:
return ExtractedText(
None,
[ExtractedText('line %d' % no, None, None, line) for no, line in enumerate(f.readlines())],
'\n',
None
None,
[
ExtractedText("line %d" % no, None, None, line)
for no, line in enumerate(f.readlines())
],
"\n",
None,
)
@ -105,7 +125,7 @@ def plain_text(filename):
return plain_extract(filename).text
def extract(filename, *, textequiv_level='region'):
def extract(filename, *, textequiv_level="region"):
"""Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text.
@ -124,5 +144,5 @@ def text(filename):
return extract(filename).text
if __name__ == '__main__':
if __name__ == "__main__":
print(text(sys.argv[1]))

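A minimal sketch of how the extraction entry points above are typically used (file names are placeholders, import path assumed):

from dinglehopper import extract, text  # assumed import path

gt = extract("gt.page.xml", textequiv_level="region")  # returns an ExtractedText tree
print(gt.text)                                          # joined plain text of the document
print(text("ocr.alto.xml"))                             # same, via the convenience wrapper
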
@ -10,7 +10,7 @@ from pkg_resources import resource_string
from .cli import process as cli_process
from .edit_distance import levenshtein_matrix_cache_clear
OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@click.command()
@ -20,20 +20,19 @@ def ocrd_dinglehopper(*args, **kwargs):
class OcrdDinglehopperEvaluate(Processor):
def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper']
kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
def process(self):
assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR')
assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
assert_file_grp_cardinality(self.output_file_grp, 1)
log = getLogger('processor.OcrdDinglehopperEvaluate')
log = getLogger("processor.OcrdDinglehopperEvaluate")
metrics = self.parameter['metrics']
textequiv_level = self.parameter['textequiv_level']
gt_grp, ocr_grp = self.input_file_grp.split(',')
metrics = self.parameter["metrics"]
textequiv_level = self.parameter["textequiv_level"]
gt_grp, ocr_grp = self.input_file_grp.split(",")
input_file_tuples = self._zip_input_files([gt_grp, ocr_grp])
for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
@ -55,40 +54,47 @@ class OcrdDinglehopperEvaluate(Processor):
except FileExistsError:
pass
cli_process(
gt_file.local_filename,
ocr_file.local_filename,
report_prefix,
metrics=metrics,
textequiv_level=textequiv_level
gt_file.local_filename,
ocr_file.local_filename,
report_prefix,
metrics=metrics,
textequiv_level=textequiv_level,
)
# Add reports to the workspace
for report_suffix, mimetype in \
[
['.html', 'text/html'],
['.json', 'application/json']
]:
for report_suffix, mimetype in [
[".html", "text/html"],
[".json", "application/json"],
]:
self.workspace.add_file(
ID=file_id + report_suffix,
file_grp=self.output_file_grp,
pageId=page_id,
mimetype=mimetype,
local_filename=report_prefix + report_suffix)
ID=file_id + report_suffix,
file_grp=self.output_file_grp,
pageId=page_id,
mimetype=mimetype,
local_filename=report_prefix + report_suffix,
)
# Clear cache between files
levenshtein_matrix_cache_clear()
def _zip_input_files(self, input_file_grps):
log = getLogger('processor.OcrdDinglehopperEvaluate')
log = getLogger("processor.OcrdDinglehopperEvaluate")
input_file_tuples = list()
for page_id in ([self.page_id] if self.page_id else
self.workspace.mets.physical_pages):
for page_id in (
[self.page_id] if self.page_id else self.workspace.mets.physical_pages
):
ifiles = list()
for input_file_grp in input_file_grps:
log.debug("Adding input file group %s to page %s", input_file_grp, page_id)
files = self.workspace.mets.find_all_files(pageId=page_id, fileGrp=input_file_grp)
log.debug(
"Adding input file group %s to page %s", input_file_grp, page_id
)
files = self.workspace.mets.find_all_files(
pageId=page_id, fileGrp=input_file_grp
)
if not files:
log.error('Found no page "%s" in file group %s', page_id, input_file_grp)
log.error(
'Found no page "%s" in file group %s', page_id, input_file_grp
)
ifiles.append(None)
else:
ifiles.append(files[0])
@ -97,5 +103,5 @@ class OcrdDinglehopperEvaluate(Processor):
return input_file_tuples
if __name__ == '__main__':
if __name__ == "__main__":
ocrd_dinglehopper()

@ -10,25 +10,30 @@ from .. import seq_align, ExtractedText
def test_text():
test1 = ExtractedText(None, [
ExtractedText('s0', None, None, 'foo'),
ExtractedText('s1', None, None, 'bar'),
ExtractedText('s2', None, None, 'bazinga')
], ' ', None)
assert test1.text == 'foo bar bazinga'
assert test1.segment_id_for_pos(0) == 's0'
test1 = ExtractedText(
None,
[
ExtractedText("s0", None, None, "foo"),
ExtractedText("s1", None, None, "bar"),
ExtractedText("s2", None, None, "bazinga"),
],
" ",
None,
)
assert test1.text == "foo bar bazinga"
assert test1.segment_id_for_pos(0) == "s0"
assert test1.segment_id_for_pos(3) is None
assert test1.segment_id_for_pos(10) == 's2'
assert test1.segment_id_for_pos(10) == "s2"
def test_normalization_check():
with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))
with pytest.raises(ValueError, match=r".*is not in NFC.*"):
ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))
AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
def test_align():
@ -39,25 +44,36 @@ def test_align():
not Python characters.
"""
test1 = ExtractedText(None, [
ExtractedText('s0', None, None, 'foo'),
ExtractedText('s1', None, None, 'bar'),
ExtractedText('s2', None, None, 'batzinga')
], ' ', None)
test2 = ExtractedText(None, [
ExtractedText('x0', None, None, 'foo'),
ExtractedText('x1', None, None, 'bar'),
# extra .
ExtractedText('x2', None, None, '.'),
# deletion + different grapheme cluster, m̃ also is two Python characters
ExtractedText('x3', None, None, 'bazim̃ga'),
], ' ', None)
test1 = ExtractedText(
None,
[
ExtractedText("s0", None, None, "foo"),
ExtractedText("s1", None, None, "bar"),
ExtractedText("s2", None, None, "batzinga"),
],
" ",
None,
)
test2 = ExtractedText(
None,
[
ExtractedText("x0", None, None, "foo"),
ExtractedText("x1", None, None, "bar"),
# extra .
ExtractedText("x2", None, None, "."),
# deletion + different grapheme cluster, m̃ also is two Python characters
ExtractedText("x3", None, None, "bazim̃ga"),
],
" ",
None,
)
left_pos = 0
right_pos = 0
alignment = []
for left, right in seq_align(grapheme_clusters(test1.text),
grapheme_clusters(test2.text)):
for left, right in seq_align(
grapheme_clusters(test1.text), grapheme_clusters(test2.text)
):
left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
el = AlignmentElement(left, right, left_id, right_id)
@ -67,46 +83,57 @@ def test_align():
if right is not None:
right_pos += len(right)
print('test1: {}'.format(test1.text))
print('test2: {}'.format(test2.text))
assert alignment[0] == ('f', 'f', 's0', 'x0')
assert alignment[8] == (None, '.', None, 'x2')
assert alignment[12] == ('t', None, 's2', None)
assert alignment[15] == ('n', '', 's2', 'x3')
@pytest.mark.parametrize("attributes,expected_index,expected_log", [
([], None, None),
(['index="0"'], 0, None),
([''], 0, None),
(['conf="0.5"'], 0, None),
(['index="1"', 'index="0"'], 1, None),
(['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
(['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
"No index attributes, use 'conf' attribute to sort TextEquiv"),
(['index="0"', ''], 0, "TextEquiv without index"),
(['', 'conf="0.4"'], 1,
"No index attributes, use 'conf' attribute to sort TextEquiv"),
(['', ''], 0, "No index attributes, use first TextEquiv"),
])
print("test1: {}".format(test1.text))
print("test2: {}".format(test2.text))
assert alignment[0] == ("f", "f", "s0", "x0")
assert alignment[8] == (None, ".", None, "x2")
assert alignment[12] == ("t", None, "s2", None)
assert alignment[15] == ("n", "", "s2", "x3")
@pytest.mark.parametrize(
"attributes,expected_index,expected_log",
[
([], None, None),
(['index="0"'], 0, None),
([""], 0, None),
(['conf="0.5"'], 0, None),
(['index="1"', 'index="0"'], 1, None),
(['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
(
['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
2,
"No index attributes, use 'conf' attribute to sort TextEquiv",
),
(['index="0"', ""], 0, "TextEquiv without index"),
(
["", 'conf="0.4"'],
1,
"No index attributes, use 'conf' attribute to sort TextEquiv",
),
(["", ""], 0, "No index attributes, use first TextEquiv"),
],
)
def test_textequiv(attributes, expected_index, expected_log, caplog):
"""Test that extracting text from a PAGE TextEquiv is working without index attr."""
caplog.set_level(logging.INFO)
xml = "<?xml version=\"1.0\"?>"
xml = '<?xml version="1.0"?>'
ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]
equiv = ["<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
for i, attr in enumerate(attributes)]
equiv = [
"<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
for i, attr in enumerate(attributes)
]
textline = "{0}<TextLine id=\"l3\" xmlns=\"{1}\">{2}</TextLine>"
textline = textline.format(xml, ns, ''.join(equiv))
textline = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'
textline = textline.format(xml, ns, "".join(equiv))
root = ET.fromstring(textline)
result = ExtractedText.from_text_segment(root,
{'page': ns},
textequiv_level='line').text
result = ExtractedText.from_text_segment(
root, {"page": ns}, textequiv_level="line"
).text
if expected_index is None:
assert not result
else:

@ -3,64 +3,85 @@ from .. import align, seq_align, distance
def test_left_empty():
result = list(align('', 'foo'))
expected = [(None, 'f'), (None, 'o'), (None, 'o')]
result = list(align("", "foo"))
expected = [(None, "f"), (None, "o"), (None, "o")]
assert result == expected
def test_right_empty():
result = list(align('foo', ''))
expected = [('f', None), ('o', None), ('o', None)]
result = list(align("foo", ""))
expected = [("f", None), ("o", None), ("o", None)]
assert result == expected
def test_left_longer():
result = list(align('food', 'foo'))
expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)]
result = list(align("food", "foo"))
expected = [("f", "f"), ("o", "o"), ("o", "o"), ("d", None)]
assert result == expected
def test_right_longer():
result = list(align('foo', 'food'))
expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')]
result = list(align("foo", "food"))
expected = [("f", "f"), ("o", "o"), ("o", "o"), (None, "d")]
assert result == expected
def test_some_diff():
result = list(align('abcde', 'aaadef'))
result = list(align("abcde", "aaadef"))
left, right = unzip(result)
assert list(left) == ['a', 'b', 'c', 'd', 'e', None]
assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f']
assert list(left) == ["a", "b", "c", "d", "e", None]
assert list(right) == ["a", "a", "a", "d", "e", "f"]
def test_longer():
s1 = 'Dies ist eine Tst!'
s2 = 'Dies ist ein Test.'
s1 = "Dies ist eine Tst!"
s2 = "Dies ist ein Test."
result = list(align(s1, s2)) # ; diffprint(*unzip(result))
expected = [('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '),
('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '),
('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '),
('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.')]
expected = [
("D", "D"),
("i", "i"),
("e", "e"),
("s", "s"),
(" ", " "),
("i", "i"),
("s", "s"),
("t", "t"),
(" ", " "),
("e", "e"),
("i", "i"),
("n", "n"),
("e", None),
(" ", " "),
("T", "T"),
(None, "e"),
("s", "s"),
("t", "t"),
("!", "."),
]
assert result == expected
def test_completely_different():
assert len(list(align('abcde', 'fghij'))) == 5
assert len(list(align("abcde", "fghij"))) == 5
def test_with_some_fake_ocr_errors():
result = list(align('Über die vielen Sorgen wegen desselben vergaß',
'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab'))
result = list(
align(
"Über die vielen Sorgen wegen desselben vergaß",
"SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
)
)
left, right = unzip(result)
# Beginning
assert list(left[:18]) == [None]*18
assert list(right[:18]) == list('SomeJunk MoreJunk ')
assert list(left[:18]) == [None] * 18
assert list(right[:18]) == list("SomeJunk MoreJunk ")
# End
assert list(left[-1:]) == ['ß']
assert list(right[-1:]) == ['b']
assert list(left[-1:]) == ["ß"]
assert list(right[-1:]) == ["b"]
def test_lines():
@ -68,13 +89,30 @@ def test_lines():
This mainly serves as documentation for comparing lists of lines.
"""
result = list(seq_align(
['This is a line.', 'This is another', 'And the last line'],
['This is a line.', 'This is another', 'J u n k', 'And the last line']
))
result = list(
seq_align(
["This is a line.", "This is another", "And the last line"],
[
"This is a line.",
"This is another",
"J u n k",
"And the last line",
],
)
)
left, right = unzip(result)
assert list(left) == ['This is a line.', 'This is another', None, 'And the last line']
assert list(right) == ['This is a line.', 'This is another', 'J u n k', 'And the last line']
assert list(left) == [
"This is a line.",
"This is another",
None,
"And the last line",
]
assert list(right) == [
"This is a line.",
"This is another",
"J u n k",
"And the last line",
]
def test_lines_similar():
@ -92,7 +130,7 @@ def test_lines_similar():
# Just an example!
min_len = min(len(self._string), len(other._string))
if min_len > 0:
normalized_distance = distance(self._string, other._string)/min_len
normalized_distance = distance(self._string, other._string) / min_len
similar = normalized_distance < 0.1
else:
similar = False
@ -102,18 +140,39 @@ def test_lines_similar():
return not self.__eq__(other)
def __repr__(self):
return 'SimilarString(\'%s\')' % self._string
return "SimilarString('%s')" % self._string
def __hash__(self):
return hash(self._string)
result = list(seq_align(
[SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')],
[SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
))
result = list(
seq_align(
[
SimilarString("This is a line."),
SimilarString("This is another"),
SimilarString("And the last line"),
],
[
SimilarString("This is a ljne."),
SimilarString("This is another"),
SimilarString("J u n k"),
SimilarString("And the last line"),
],
)
)
left, right = unzip(result)
assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')]
assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
assert list(left) == [
SimilarString("This is a line."),
SimilarString("This is another"),
None,
SimilarString("And the last line"),
]
assert list(right) == [
SimilarString("This is a ljne."),
SimilarString("This is another"),
SimilarString("J u n k"),
SimilarString("And the last line"),
]
# Test __eq__ (i.e. is it a substitution or a similar string?)
assert list(left)[0] == list(right)[0]

@ -7,31 +7,35 @@ from .. import character_error_rate
def test_character_error_rate():
assert character_error_rate('a', 'a') == 0
assert character_error_rate('a', 'b') == 1/1
assert character_error_rate('Foo', 'Bar') == 3/3
assert character_error_rate("a", "a") == 0
assert character_error_rate("a", "b") == 1 / 1
assert character_error_rate("Foo", "Bar") == 3 / 3
assert character_error_rate('Foo', '') == 3/3
assert character_error_rate("Foo", "") == 3 / 3
assert character_error_rate('', '') == 0
assert math.isinf(character_error_rate('', 'Foo'))
assert character_error_rate("", "") == 0
assert math.isinf(character_error_rate("", "Foo"))
assert character_error_rate('Foo', 'Food') == 1/3
assert character_error_rate('Fnord', 'Food') == 2/5
assert character_error_rate('Müll', 'Mull') == 1/4
assert character_error_rate('Abstand', 'Sand') == 4/7
assert character_error_rate("Foo", "Food") == 1 / 3
assert character_error_rate("Fnord", "Food") == 2 / 5
assert character_error_rate("Müll", "Mull") == 1 / 4
assert character_error_rate("Abstand", "Sand") == 4 / 7
def test_character_error_rate_hard():
s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.')
s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!') # Different, decomposed!
assert character_error_rate(s1, s2) == 1/19
s1 = 'Schlyñ'
assert len(s1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
s2 = 'Schlym̃'
assert len(s2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
s1 = unicodedata.normalize("NFC", "Schlyñ lorem ipsum.")
s2 = unicodedata.normalize("NFD", "Schlyñ lorem ipsum!") # Different, decomposed!
assert character_error_rate(s1, s2) == 1 / 19
s1 = "Schlyñ"
assert (
len(s1) == 6
) # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
s2 = "Schlym̃"
assert (
len(s2) == 7
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
# Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
assert character_error_rate(s2, s1) == 1/6
assert character_error_rate(s1, s2) == 1/6
assert character_error_rate(s2, s1) == 1 / 6
assert character_error_rate(s1, s2) == 1 / 6

@ -6,35 +6,39 @@ from .. import levenshtein, distance
def test_levenshtein():
assert levenshtein('a', 'a') == 0
assert levenshtein('a', 'b') == 1
assert levenshtein('Foo', 'Bar') == 3
assert levenshtein("a", "a") == 0
assert levenshtein("a", "b") == 1
assert levenshtein("Foo", "Bar") == 3
assert levenshtein('', '') == 0
assert levenshtein('Foo', '') == 3
assert levenshtein('', 'Foo') == 3
assert levenshtein("", "") == 0
assert levenshtein("Foo", "") == 3
assert levenshtein("", "Foo") == 3
assert levenshtein('Foo', 'Food') == 1
assert levenshtein('Fnord', 'Food') == 2
assert levenshtein('Müll', 'Mull') == 1
assert levenshtein('Abstand', 'Sand') == 4
assert levenshtein("Foo", "Food") == 1
assert levenshtein("Fnord", "Food") == 2
assert levenshtein("Müll", "Mull") == 1
assert levenshtein("Abstand", "Sand") == 4
def test_levenshtein_other_sequences():
assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1
assert levenshtein(['a', 'ab'], ['a', 'c']) == 1
assert levenshtein(["a", "ab"], ["a", "ab", "c"]) == 1
assert levenshtein(["a", "ab"], ["a", "c"]) == 1
def test_distance():
assert distance('Fnord', 'Food') == 2
assert distance('Müll', 'Mull') == 1
assert distance("Fnord", "Food") == 2
assert distance("Müll", "Mull") == 1
word1 = unicodedata.normalize('NFC', 'Schlyñ')
word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!
word1 = unicodedata.normalize("NFC", "Schlyñ")
word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
assert distance(word1, word2) == 0
word1 = 'Schlyñ'
assert len(word1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
word2 = 'Schlym̃'
assert len(word2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
word1 = "Schlyñ"
assert (
len(word1) == 6
) # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
word2 = "Schlym̃"
assert (
len(word2) == 7
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
assert distance(word1, word2) == 1

@ -4,45 +4,60 @@ from .. import seq_editops, editops
def test_trivial():
assert seq_editops('abc', 'abc') == []
assert seq_editops('', '') == []
assert seq_editops("abc", "abc") == []
assert seq_editops("", "") == []
def test_insert():
assert seq_editops('bc', 'abc') == [('insert', 0, 0)]
assert seq_editops('ac', 'abc') == [('insert', 1, 1)]
assert seq_editops('ab', 'abc') == [('insert', 2, 2)]
assert seq_editops('', 'a') == [('insert', 0, 0)]
assert seq_editops("bc", "abc") == [("insert", 0, 0)]
assert seq_editops("ac", "abc") == [("insert", 1, 1)]
assert seq_editops("ab", "abc") == [("insert", 2, 2)]
assert seq_editops("", "a") == [("insert", 0, 0)]
def test_multiple():
assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)]
assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
def test_delete():
assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)]
assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)]
assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)]
assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)]
assert seq_editops('Foo', '') == [('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0)]
assert seq_editops('Foolish', 'Foo') == [('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3)]
assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
assert seq_editops("Foo", "") == [
("delete", 0, 0),
("delete", 1, 0),
("delete", 2, 0),
]
assert seq_editops("Foolish", "Foo") == [
("delete", 3, 3),
("delete", 4, 3),
("delete", 5, 3),
("delete", 6, 3),
]
def test_ambiguous():
assert seq_editops('bcd', 'abcef') == [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)]
assert seq_editops("bcd", "abcef") == [
("insert", 0, 0),
("replace", 2, 3),
("insert", 3, 4),
]
def test_editops():
"""Test editops() in cases where dealing with grapheme clusters matters"""
# In these cases, one of the words has a composed form, the other one does not.
assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
assert editops('oͤde', 'öde') == [('replace', 0, 0)]
assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
assert editops("oͤde", "öde") == [("replace", 0, 0)]
def test_editops_canonically_equivalent():
left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE')
right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')
left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
"COMBINING TILDE"
)
right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
assert left != right
assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right)
assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
assert editops(left, right) == []

@ -7,7 +7,7 @@ from lxml import etree as ET
from .. import align, page_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
@ -17,8 +17,8 @@ def test_align_page_files():
# (currently) not counted due to normalization.
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
result = list(align(gt, ocr))
for left, right in result:

@ -8,26 +8,34 @@ from uniseg.graphemecluster import grapheme_clusters
from .. import character_error_rate, page_text, alto_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
def test_character_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
# The ﬁ ligature does not count.
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
gt_len = len(list(grapheme_clusters(gt)))
expected_cer = 2/gt_len
expected_cer = 2 / gt_len
assert character_error_rate(gt, ocr) == expected_cer
@pytest.mark.integration
def test_character_error_rate_between_page_alto():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
gt = page_text(
ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
)
ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
)
)
)
assert gt == ocr
assert character_error_rate(gt, ocr) == 0
@ -35,7 +43,17 @@ def test_character_error_rate_between_page_alto():
@pytest.mark.integration
def test_character_error_rate_between_page_alto_2():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
assert character_error_rate(gt, ocr) == 8/591 # Manually verified
gt = page_text(
ET.parse(
os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
)
)
ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
)
)
)
assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified

@ -10,31 +10,31 @@ def test_cli_json(tmp_path):
"""Test that the cli/process() yields a loadable JSON report"""
with working_directory(str(tmp_path)):
with open('gt.txt', 'w') as gtf:
gtf.write('AAAAA')
with open('ocr.txt', 'w') as ocrf:
ocrf.write('AAAAB')
with open("gt.txt", "w") as gtf:
gtf.write("AAAAA")
with open("ocr.txt", "w") as ocrf:
ocrf.write("AAAAB")
with open('gt.txt', 'r') as gtf:
with open("gt.txt", "r") as gtf:
print(gtf.read())
process('gt.txt', 'ocr.txt', 'report')
with open('report.json', 'r') as jsonf:
process("gt.txt", "ocr.txt", "report")
with open("report.json", "r") as jsonf:
print(jsonf.read())
with open('report.json', 'r') as jsonf:
with open("report.json", "r") as jsonf:
j = json.load(jsonf)
assert j['cer'] == pytest.approx(0.2)
assert j["cer"] == pytest.approx(0.2)
def test_cli_json_cer_is_infinity(tmp_path):
"""Test that the cli/process() yields a loadable JSON report when CER == inf"""
with working_directory(str(tmp_path)):
with open('gt.txt', 'w') as gtf:
gtf.write('') # Empty to yield CER == inf
with open('ocr.txt', 'w') as ocrf:
ocrf.write('Not important')
with open("gt.txt", "w") as gtf:
gtf.write("") # Empty to yield CER == inf
with open("ocr.txt", "w") as ocrf:
ocrf.write("Not important")
process('gt.txt', 'ocr.txt', 'report')
with open('report.json', 'r') as jsonf:
process("gt.txt", "ocr.txt", "report")
with open("report.json", "r") as jsonf:
j = json.load(jsonf)
assert j['cer'] == pytest.approx(float('inf'))
assert j["cer"] == pytest.approx(float("inf"))

@ -7,7 +7,7 @@ from lxml import etree as ET
from .. import distance, page_text, alto_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
@ -15,15 +15,23 @@ def test_distance_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
# Due to normalization, we don't count the ligature.
# → 2 differences
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
assert distance(gt, ocr) == 2
@pytest.mark.integration
def test_distance_between_page_alto():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
gt = page_text(
ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
)
ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
)
)
)
assert gt == ocr
assert distance(gt, ocr) == 0
@ -31,7 +39,17 @@ def test_distance_between_page_alto():
@pytest.mark.integration
def test_distance_between_page_alto_2():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
gt = page_text(
ET.parse(
os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
)
)
ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
)
)
)
assert distance(gt, ocr) == 8 # Manually verified

@ -10,27 +10,32 @@ from .util import working_directory
from ..ocrd_cli import ocrd_dinglehopper
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def test_ocrd_cli(tmp_path):
"""Test OCR-D interface"""
# Copy test workspace
test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162'
test_workspace_dir = tmp_path / 'test_ocrd_cli'
test_workspace_dir_source = Path(data_dir) / "actevedef_718448162"
test_workspace_dir = tmp_path / "test_ocrd_cli"
shutil.copytree(str(test_workspace_dir_source), str(test_workspace_dir))
# Run through the OCR-D interface
with working_directory(str(test_workspace_dir)):
runner = CliRunner()
args = [
'-m', 'mets.xml',
'-I', 'OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI',
'-O', 'OCR-D-OCR-CALAMARI-EVAL'
"-m",
"mets.xml",
"-I",
"OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI",
"-O",
"OCR-D-OCR-CALAMARI-EVAL",
]
sys.argv[1:] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
sys.argv[
1:
] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
result = runner.invoke(ocrd_dinglehopper, args)
assert result.exit_code == 0
result_json = list((test_workspace_dir / 'OCR-D-OCR-CALAMARI-EVAL').glob('*.json'))
assert json.load(open(str(result_json[0])))['cer'] < 0.03
result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json"))
assert json.load(open(str(result_json[0])))["cer"] < 0.03

@ -7,26 +7,36 @@ from lxml import etree as ET
from .. import word_error_rate, words, page_text, alto_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
def test_word_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. So we have 3 changed words,
# the ligature does not count → 2 errors
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line
gt_word_count = (
7 + 6 + 5 + 8 + 7 + 6 + 7 + 8 + 6 + 7 + 7 + 5 + 6 + 8 + 8 + 7 + 7 + 6 + 5 + 4
) # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
assert word_error_rate(gt, ocr) == 2/gt_word_count
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
assert word_error_rate(gt, ocr) == 2 / gt_word_count
@pytest.mark.integration
def test_word_error_rate_between_page_alto():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
gt = page_text(
ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
)
ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
)
)
)
assert gt == ocr
assert word_error_rate(gt, ocr) == 0
@ -34,11 +44,25 @@ def test_word_error_rate_between_page_alto():
@pytest.mark.integration
def test_word_error_rate_between_page_alto_2():
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line
gt = page_text(
ET.parse(
os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
)
)
gt_word_count = (
14 + 18 + 17 + 14 + 17 + 17 + 3
) # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors)
ocr = alto_text(
ET.parse(
os.path.join(
data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
)
)
)
assert (
word_error_rate(gt, ocr) == 7 / gt_word_count
) # Manually verified, 6 words are wrong, 1 got split (=2 errors)

@ -9,46 +9,54 @@ import pytest
from .util import working_directory
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def test_alto_namespace():
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
assert alto_namespace(tree) == 'http://www.loc.gov/standards/alto/ns-v3#'
tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
assert alto_namespace(tree) == "http://www.loc.gov/standards/alto/ns-v3#"
def test_alto_text():
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
result = alto_text(tree)
expected = textwrap.dedent("""\
expected = textwrap.dedent(
"""\
über die vielen Sorgen wegen deſſelben vergaß
Hartkopf, der Frau Amtmännin das ver-
ſprochene zu überliefern.""")
ſprochene zu überliefern."""
)
assert result == expected
def test_alto_text_ALTO1():
tree = ET.parse(os.path.join(data_dir, 'test.alto1.xml'))
tree = ET.parse(os.path.join(data_dir, "test.alto1.xml"))
assert "being erected at the Broadway stock" in alto_text(tree)
def test_alto_text_ALTO2():
tree = ET.parse(os.path.join(data_dir, 'test.alto2.xml'))
assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in alto_text(tree)
tree = ET.parse(os.path.join(data_dir, "test.alto2.xml"))
assert (
"Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden"
in alto_text(tree)
)
def test_alto_text_ALTO3():
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree)
def test_page_namespace():
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
assert page_namespace(tree) == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15'
tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
assert (
page_namespace(tree)
== "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
)
def test_page_test():
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
result = page_text(tree)
# We are currently normalizing on extraction, so the text is normalized.
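The normalization mentioned here is Unicode NFC composition: combining sequences in the extracted text collapse to precomposed characters before any comparison. A tiny standalone illustration (not part of the diff):

import unicodedata

decomposed = "u\u0308ber"  # 'u' followed by a combining diaeresis
assert unicodedata.normalize("NFC", decomposed) == "über"
assert decomposed != "über"  # without normalization the two spellings compare unequal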
@ -74,7 +82,8 @@ def test_page_test():
# Jndeß mangelten do einige Generalia, die
# alſo wegfielen. — Hartkopf gieng ſelb
# mit und berbrate es. —""")
expected = textwrap.dedent("""\
expected = textwrap.dedent(
"""\
über die vielen Sorgen wegen deſſelben vergaß
Hartkopf, der Frau Amtmännin das ver-
ſprochene zu überliefern. Ein Erpreſſer
@ -94,7 +103,8 @@ def test_page_test():
ſie das, was da wäre, herbeyſchaffen möchte.
Jndeß mangelten doch einige Generalia, die
alſo wegfielen. Hartkopf gieng ſelbſt
mit und überbrachte es. """)
mit und überbrachte es. """
)
assert result == expected
@ -107,56 +117,69 @@ def test_page_with_empty_region():
# <Unicode></Unicode>
# </TextEquiv>
# </TextRegion>
tree = ET.parse(os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml'))
tree = ET.parse(
os.path.join(data_dir, "brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml")
)
result = page_text(tree)
assert result
def test_page_order():
# This file contains TextRegions where file order is not the same as reading order.
tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
tree = ET.parse(os.path.join(data_dir, "order.page.xml"))
result = page_text(tree)
print(result)
assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)
assert re.search(
r"Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die",
result,
re.DOTALL,
)
def test_page_mixed_regions():
# This file contains ImageRegions and TextRegions in the ReadingOrder
tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml'))
tree = ET.parse(os.path.join(data_dir, "mixed-regions.page.xml"))
result = page_text(tree)
assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result
assert "non exaudiam uos. Chriſtiani uero quia orant iuxta" in result
def test_page_level():
# This file contains inconsistent TextRegion and TextLine texts
# TextRegion
tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
result = page_text(tree)
assert result == 'Inconsistent dummy region text'
tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
result = page_text(tree, textequiv_level='region')
assert result == 'Inconsistent dummy region text'
assert result == "Inconsistent dummy region text"
tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
result = page_text(tree, textequiv_level="region")
assert result == "Inconsistent dummy region text"
# TextLine
tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
result = page_text(tree, textequiv_level='line')
assert result == 'Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-'
tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
result = page_text(tree, textequiv_level="line")
assert (
result
== "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
)
def test_text():
assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
assert "wieder ein. Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
assert "being erected at the Broadway stock" in text(
os.path.join(data_dir, "test.alto1.xml")
)
assert "wieder ein. Er langte den Zettel aus dem" in text(
os.path.join(data_dir, "test.page2018.xml")
)
assert "Lorem ipsum" in text(os.path.join(data_dir, "test.txt"))
def test_plain(tmp_path):
with working_directory(str(tmp_path)):
with open('ocr.txt', 'w') as ocrf:
ocrf.write('AAAAB')
with open("ocr.txt", "w") as ocrf:
ocrf.write("AAAAB")
result = plain_text('ocr.txt')
expected = 'AAAAB'
result = plain_text("ocr.txt")
expected = "AAAAB"
assert result == expected

@ -6,32 +6,81 @@ from .. import word_error_rate, words
def test_words():
result = list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?'))
expected = ['Der', 'schnelle', 'braune', 'Fuchs', 'kann', 'keine', '3,14', 'Meter', 'springen', 'oder']
result = list(
words("Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?")
)
expected = [
"Der",
"schnelle",
"braune",
"Fuchs",
"kann",
"keine",
"3,14",
"Meter",
"springen",
"oder",
]
assert result == expected
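To make the tokenisation above concrete: words() strips the bracketing punctuation that a plain str.split() would leave attached. A small standalone comparison, assuming the same words function the test imports (here via the installed package path):

from qurator.dinglehopper import words

sentence = "Der schnelle [„braune“] Fuchs"
assert sentence.split()[2] == "[„braune“]"   # naive split keeps the brackets and quotes
assert list(words(sentence))[2] == "braune"  # words() yields only the word itself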
def test_words_private_use_area():
result = list(words(
'ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n'
'ſproene zu berliefern.'))
result = list(
words(
"ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
"ſproene zu berliefern."
)
)
expected = [
'ber', 'die', 'vielen', 'Sorgen', 'wegen', 'deelben', 'vergaß', 'Hartkopf',
'der', 'Frau', 'Amtmnnin', 'das', 'ver',
'ſproene', 'zu', 'berliefern']
"ber",
"die",
"vielen",
"Sorgen",
"wegen",
"deelben",
"vergaß",
"Hartkopf",
"der",
"Frau",
"Amtmnnin",
"das",
"ver",
"ſproene",
"zu",
"berliefern",
]
assert result == expected
def test_word_error_rate():
assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0
assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0
assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz.') == 0
assert (
word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsatz!") == 0
)
assert (
word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz!")
== 0
)
assert (
word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz.")
== 0
)
assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsarz:') == 1/4
assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ein ist Beispielsatz!') == 2/4
assert (
word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsarz:")
== 1 / 4
)
assert (
word_error_rate("Dies ist ein Beispielsatz!", "Dies ein ist Beispielsatz!")
== 2 / 4
)
assert word_error_rate('Dies ist ein Beispielsatz!', '') == 4/4
assert math.isinf(word_error_rate('', 'Dies ist ein Beispielsatz!'))
assert word_error_rate('', '') == 0
assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
assert word_error_rate("", "") == 0
assert word_error_rate('Schlyñ lorem ipsum dolor sit amet,', 'Schlym̃ lorem ipsum dolor sit amet.') == 1/6
assert (
word_error_rate(
"Schlyñ lorem ipsum dolor sit amet,", "Schlym̃ lorem ipsum dolor sit amet."
)
== 1 / 6
)

@ -27,6 +27,7 @@ def unzip(an_iterable_of_tuples):
class working_directory:
"""Context manager to temporarily change the working directory"""
def __init__(self, wd):
self.wd = wd

@ -20,9 +20,10 @@ def words(s: str):
def new_word_break(c, index=0):
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
return 'ALetter'
return "ALetter"
else:
return old_word_break(c, index)
uniseg.wordbreak.word_break = new_word_break
# Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
@ -30,8 +31,8 @@ def words(s: str):
# See https://www.fileformat.info/info/unicode/category/index.htm
# and https://unicodebook.readthedocs.io/unicode.html#categories
unwanted_categories = 'O', 'M', 'P', 'Z', 'S'
unwanted_subcategories = 'Cc', 'Cf'
unwanted_categories = "O", "M", "P", "Z", "S"
unwanted_subcategories = "Cc", "Cf"
subcat = unicodedata.category(c)
cat = subcat[0]
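The filter in this hunk treats a character as non-word material when its Unicode general category starts with O, M, P, Z or S, or equals Cc or Cf. A standalone sketch of that predicate (is_unwanted is a hypothetical helper name; the real code inlines the check inside words()):

import unicodedata

def is_unwanted(c):
    # Mirrors the categories listed in the comment above.
    unwanted_categories = ("O", "M", "P", "Z", "S")
    unwanted_subcategories = ("Cc", "Cf")
    subcat = unicodedata.category(c)
    return subcat[0] in unwanted_categories or subcat in unwanted_subcategories

assert is_unwanted(" ")      # Zs: space separator
assert is_unwanted("!")      # Po: punctuation
assert not is_unwanted("a")  # Ll: lowercase letter
assert not is_unwanted("3")  # Nd: decimal digit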
@ -53,7 +54,7 @@ def words(s: ExtractedText):
@multimethod
def words_normalized(s: str):
return words(unicodedata.normalize('NFC', s))
return words(unicodedata.normalize("NFC", s))
@multimethod
@ -69,7 +70,9 @@ def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
@multimethod
def word_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
def word_error_rate_n(
reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
return word_error_rate_n(reference.text, compared.text)
@ -84,7 +87,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
if d == 0:
return 0, n
if n == 0:
return float('inf'), n
return float("inf"), n
return d / n, n

@ -1,29 +1,29 @@
from io import open
from setuptools import find_packages, setup
with open('requirements.txt') as fp:
with open("requirements.txt") as fp:
install_requires = fp.read()
setup(
name='dinglehopper',
author='Mike Gerber, The QURATOR SPK Team',
author_email='mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de',
description='The OCR evaluation tool',
long_description=open('README.md', 'r', encoding='utf-8').read(),
long_description_content_type='text/markdown',
keywords='qurator ocr',
license='Apache',
namespace_packages=['qurator'],
packages=find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests']),
name="dinglehopper",
author="Mike Gerber, The QURATOR SPK Team",
author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de",
description="The OCR evaluation tool",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
keywords="qurator ocr",
license="Apache",
namespace_packages=["qurator"],
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
install_requires=install_requires,
package_data={
'': ['*.json', 'templates/*'],
"": ["*.json", "templates/*"],
},
entry_points={
'console_scripts': [
'dinglehopper=qurator.dinglehopper.cli:main',
'dinglehopper-extract=qurator.dinglehopper.cli_extract:main',
'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper',
]
}
"console_scripts": [
"dinglehopper=qurator.dinglehopper.cli:main",
"dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
"ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
]
},
)
