Mirror of https://github.com/qurator-spk/dinglehopper.git, synced 2025-06-09 11:50:00 +02:00

🎨 dinglehopper: Reformat using black

parent 31c63f9e4c
commit 14421c8e53

25 changed files with 774 additions and 466 deletions
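Every hunk below is a formatting-only change produced by black: string literals are normalized to double quotes, long statements are re-wrapped, and trailing commas are added, with no change in behavior. As a minimal illustration (this snippet is not part of the commit, only an assumption-free demonstration using literals that appear in the diff), the reformatted literals compare equal to the originals:

    import unicodedata

    # Single- and double-quoted literals denote the same string value,
    # so black's quote normalization cannot change program behavior.
    assert 'NFC' == "NFC"
    assert unicodedata.normalize('NFC', 'Schlyñ') == unicodedata.normalize("NFC", "Schlyñ")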
@@ -1,2 +1 @@
-__import__('pkg_resources').declare_namespace(__name__)
-
+__import__("pkg_resources").declare_namespace(__name__)
@@ -3,8 +3,8 @@ from .edit_distance import *

 def align(t1, t2):
     """Align text."""
-    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1)))
-    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2)))
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", t2)))
     return seq_align(s1, s2)
@@ -27,13 +27,13 @@ def seq_align(s1, s2):
             pass

         if o:
-            if o[0] == 'insert':
+            if o[0] == "insert":
                 yield None, s2[j]
                 j += 1
-            elif o[0] == 'delete':
+            elif o[0] == "delete":
                 yield s1[i], None
                 i += 1
-            elif o[0] == 'replace':
+            elif o[0] == "replace":
                 yield s1[i], s2[j]
                 i += 1
                 j += 1
@@ -19,19 +19,21 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     """

     d = distance(reference, compared)
-    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
+    n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))

     if d == 0:
         return 0, n
     if n == 0:
-        return float('inf'), n
-    return d/n, n
+        return float("inf"), n
+    return d / n, n

     # XXX Should we really count newlines here?


 @multimethod
-def character_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
+def character_error_rate_n(
+    reference: ExtractedText, compared: ExtractedText
+) -> Tuple[float, int]:
     return character_error_rate_n(reference.text, compared.text)
@@ -12,16 +12,17 @@ from .extracted_text import ExtractedText
 from .ocr_files import extract
 from .config import Config

+
 def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
-    gtx = ''
-    ocrx = ''
+    gtx = ""
+    ocrx = ""

     def format_thing(t, css_classes=None, id_=None):
         if t is None:
             html_t = none
-            css_classes += ' ellipsis'
-        elif t == '\n':
-            html_t = '<br>'
+            css_classes += " ellipsis"
+        elif t == "\n":
+            html_t = "<br>"
         else:
             html_t = escape(t)
@@ -32,9 +33,13 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
             html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)

         if css_classes:
-            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
+            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
+                css_classes=css_classes,
+                html_t=html_t,
+                html_custom_attrs=html_custom_attrs,
+            )
         else:
-            return '{html_t}'.format(html_t=html_t)
+            return "{html_t}".format(html_t=html_t)

     if isinstance(gt_in, ExtractedText):
         if not isinstance(ocr_in, ExtractedText):
@@ -46,8 +51,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
         gt_things = gt_in
         ocr_things = ocr_in

-
-
     g_pos = 0
     o_pos = 0
     for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
@@ -55,7 +58,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
         gt_id = None
         ocr_id = None
         if g != o:
-            css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+            css_classes = "{css_prefix}diff{k} diff".format(css_prefix=css_prefix, k=k)
             if isinstance(gt_in, ExtractedText):
                 gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
                 ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
@@ -70,17 +73,17 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
         if o is not None:
             o_pos += len(o)

-    return \
-        '''
+    return """
         <div class="row">
            <div class="col-md-6 gt">{}</div>
            <div class="col-md-6 ocr">{}</div>
         </div>
-        '''.format(gtx, ocrx)
+        """.format(
+        gtx, ocrx
+    )


-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
+def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
     """Check OCR result against GT.

     The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
@@ -93,36 +96,47 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
     wer, n_words = word_error_rate_n(gt_text, ocr_text)

-    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
+    char_diff_report = gen_diff_report(
+        gt_text, ocr_text, css_prefix="c", joiner="", none="·"
+    )

     gt_words = words_normalized(gt_text)
     ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
+    word_diff_report = gen_diff_report(
+        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
+    )

     def json_float(value):
         """Convert a float value to an JSON float.

         This is here so that float('inf') yields "Infinity", not "inf".
         """
-        if value == float('inf'):
-            return 'Infinity'
-        elif value == float('-inf'):
-            return '-Infinity'
+        if value == float("inf"):
+            return "Infinity"
+        elif value == float("-inf"):
+            return "-Infinity"
         else:
             return str(value)

-    env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
-    env.filters['json_float'] = json_float
+    env = Environment(
+        loader=FileSystemLoader(
+            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
+        )
+    )
+    env.filters["json_float"] = json_float

-    for report_suffix in ('.html', '.json'):
-        template_fn = 'report' + report_suffix + '.j2'
+    for report_suffix in (".html", ".json"):
+        template_fn = "report" + report_suffix + ".j2"
         out_fn = report_prefix + report_suffix

         template = env.get_template(template_fn)
         template.stream(
-            gt=gt, ocr=ocr,
-            cer=cer, n_characters=n_characters,
-            wer=wer, n_words=n_words,
+            gt=gt,
+            ocr=ocr,
+            cer=cer,
+            n_characters=n_characters,
+            wer=wer,
+            n_words=n_words,
             char_diff_report=char_diff_report,
             word_diff_report=word_diff_report,
             metrics=metrics,
@@ -130,12 +144,19 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):


 @click.command()
-@click.argument('gt', type=click.Path(exists=True))
-@click.argument('ocr', type=click.Path(exists=True))
-@click.argument('report_prefix', type=click.Path(), default='report')
-@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
-@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
-@click.option('--progress', default=False, is_flag=True, help='Show progress bar')
+@click.argument("gt", type=click.Path(exists=True))
+@click.argument("ocr", type=click.Path(exists=True))
+@click.argument("report_prefix", type=click.Path(), default="report")
+@click.option(
+    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
+)
+@click.option(
+    "--textequiv-level",
+    default="region",
+    help="PAGE TextEquiv level to extract text from",
+    metavar="LEVEL",
+)
+@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
 def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     """
     Compare the PAGE/ALTO/text document GT against the document OCR.
@@ -159,5 +180,5 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
@@ -7,8 +7,13 @@ from .ocr_files import extract


 @click.command()
-@click.argument('input_file', type=click.Path(exists=True))
-@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
+@click.argument("input_file", type=click.Path(exists=True))
+@click.option(
+    "--textequiv-level",
+    default="region",
+    help="PAGE TextEquiv level to extract text from",
+    metavar="LEVEL",
+)
 def main(input_file, textequiv_level):
     """
     Extract the text of the given INPUT_FILE.
@@ -23,5 +28,5 @@ def main(input_file, textequiv_level):
     print(input_text)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
@@ -48,9 +48,10 @@ def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
     for i in tqdm(from_to(1, m), disable=not Config.progress):
         for j in from_to(1, n):
             D[i, j] = min(
-                D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
+                D[i - 1, j - 1]
+                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
                 D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1  # Deletion
+                D[i - 1, j] + 1,  # Deletion
             )

     return D
@@ -81,8 +82,8 @@ def distance(s1: str, s2: str):
     Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
     clusters. This should be the correct way to compare two Unicode strings.
     """
-    seq1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
-    seq2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
+    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
     return levenshtein(seq1, seq2)
@@ -106,11 +107,17 @@ def seq_editops(seq1, seq2):

     def _tail_backtrace(i, j, accumulator):
         if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)
+            return partial(
+                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
+            )
         if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)
+            return partial(
+                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
+            )
         if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)
+            return partial(
+                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
+            )
         if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
             return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
         return accumulator
@@ -132,6 +139,6 @@ def editops(word1, word2):

     Note that this returns indices to the _grapheme clusters_, not characters!
     """
-    word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
-    word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
+    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
+    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
     return seq_editops(word1, word2)
@@ -10,6 +10,7 @@ import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger

+
 class Normalization(enum.Enum):
     NFC = 1
     NFC_MUFI = 2  # TODO
@@ -18,7 +19,7 @@ class Normalization(enum.Enum):

 def normalize(text, normalization):
     if normalization == Normalization.NFC:
-        return unicodedata.normalize('NFC', text)
+        return unicodedata.normalize("NFC", text)
     if normalization == Normalization.NFC_MUFI:
         raise NotImplementedError()
     if normalization == Normalization.NFC_SBB:
@@ -36,31 +37,31 @@ def unjoin_ligatures(s):
     """Unjoin ligatures, i.e. ﬀ becomes ff."""

     equivalences = {
-        '': 'ſſ',
-        "\ueba7": 'ſſi',  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
-        '': 'ch',
-        '': 'ck',
-        '': 'll',
-        '': 'ſi',
-        '': 'ſt',
-        'ﬁ': 'fi',
-        'ﬀ': 'ff',
-        'ﬂ': 'fl',
-        'ﬃ': 'ffi',
-        '': 'ct',
-        '': 'tz',  # MUFI: LATIN SMALL LIGATURE TZ
-        '\uf532': 'as',  # eMOP: Latin small ligature as
-        '\uf533': 'is',  # eMOP: Latin small ligature is
-        '\uf534': 'us',  # eMOP: Latin small ligature us
-        '\uf535': 'Qu',  # eMOP: Latin ligature capital Q small u
-        'ĳ': 'ij',  # U+0133 LATIN SMALL LIGATURE IJ
-        '\uE8BF': 'q&',
+        "": "ſſ",
+        "\ueba7": "ſſi",  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
+        "": "ch",
+        "": "ck",
+        "": "ll",
+        "": "ſi",
+        "": "ſt",
+        "ﬁ": "fi",
+        "ﬀ": "ff",
+        "ﬂ": "fl",
+        "ﬃ": "ffi",
+        "": "ct",
+        "": "tz",  # MUFI: LATIN SMALL LIGATURE TZ
+        "\uf532": "as",  # eMOP: Latin small ligature as
+        "\uf533": "is",  # eMOP: Latin small ligature is
+        "\uf534": "us",  # eMOP: Latin small ligature us
+        "\uf535": "Qu",  # eMOP: Latin ligature capital Q small u
+        "ĳ": "ij",  # U+0133 LATIN SMALL LIGATURE IJ
+        "\uE8BF": "q&",
         # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
         # XXX How to replace this correctly?
-        '\uEBA5': 'ſp',  # MUFI: LATIN SMALL LIGATURE LONG S P
-        'ﬆ': 'st',  # U+FB06 LATIN SMALL LIGATURE ST
+        "\uEBA5": "ſp",  # MUFI: LATIN SMALL LIGATURE LONG S P
+        "ﬆ": "st",  # U+FB06 LATIN SMALL LIGATURE ST
     }
-    s = unicodedata.normalize('NFC', s)
+    s = unicodedata.normalize("NFC", s)
     for fr, to in equivalences.items():
         s = s.replace(fr, to)
     return s
@@ -70,20 +71,20 @@ def substitute_equivalences(s):
     # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
     # It might make sense to use different rules for GT and for the different OCR
     equivalences = {
-        '': 'ü',
-        '': 'ä',
-        '==': '–',  # → en-dash
-        '—': '–',  # em-dash → en-dash
-        '': 'ö',
-        '’': '\'',
-        '⸗': '-',
-        'aͤ': 'ä',  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
-        'oͤ': 'ö',  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
-        'uͤ': 'ü',  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
-        '\uF50E': 'q́'  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
+        "": "ü",
+        "": "ä",
+        "==": "–",  # → en-dash
+        "—": "–",  # em-dash → en-dash
+        "": "ö",
+        "’": "'",
+        "⸗": "-",
+        "aͤ": "ä",  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
+        "oͤ": "ö",  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
+        "uͤ": "ü",  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
+        "\uF50E": "q́",  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
     }

-    s = unicodedata.normalize('NFC', s)
+    s = unicodedata.normalize("NFC", s)
     s = unjoin_ligatures(s)
     for fr, to in equivalences.items():
         s = s.replace(fr, to)
@@ -115,13 +116,14 @@ class ExtractedText:
     Objects of this class are guaranteed to be a. always in their normalization
     and b. in NFC.
     """
+
     segment_id = attr.ib(type=Optional[str])

     @segment_id.validator
     def check(self, _, value):
         if value is None:
             return
-        if not re.match(r'[\w\d_-]+', value):
+        if not re.match(r"[\w\d_-]+", value):
             raise ValueError('Malformed segment id "{}"'.format(value))

     # An object contains either
@@ -141,7 +143,7 @@ class ExtractedText:
     def check(self, _, value):
         if value is not None and self.segments is not None:
             raise ValueError("Can't have both segments and text")
-        if value is not None and unicodedata.normalize('NFC', value) != value:
+        if value is not None and unicodedata.normalize("NFC", value) != value:
             raise ValueError('String "{}" is not in NFC.'.format(value))
         if value is not None and normalize(value, self.normalization) != value:
             raise ValueError('String "{}" is not normalized.'.format(value))
@@ -169,31 +171,24 @@ class ExtractedText:
                 seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
                 segment_id_for_pos.extend(seg_ids)
                 segment_id_for_pos.extend(repeat(None, len(self.joiner)))
-            segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
+            segment_id_for_pos = segment_id_for_pos[: -len(self.joiner)]

             # This is frozen, so we have to jump through the hoop:
-            object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
+            object.__setattr__(self, "_segment_id_for_pos", segment_id_for_pos)
             assert self._segment_id_for_pos

         return self._segment_id_for_pos[pos]

     @classmethod
-    def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
+    def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"):
         """Build an ExtractedText from a PAGE content text element"""

-        localname_for_textequiv_level = {
-            'region': 'TextRegion',
-            'line': 'TextLine'
-        }
+        localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}
         textequiv_level_for_localname = invert_dict(localname_for_textequiv_level)
-        children_for_localname = {
-            'TextRegion': 'TextLine'
-        }
-        joiner_for_textequiv_level = {
-            'line': '\n'
-        }
+        children_for_localname = {"TextRegion": "TextLine"}
+        joiner_for_textequiv_level = {"line": "\n"}

-        segment_id = text_segment.attrib['id']
+        segment_id = text_segment.attrib["id"]
         localname = ET.QName(text_segment).localname
         if localname == localname_for_textequiv_level[textequiv_level]:
             segment_text = None
@@ -201,19 +196,20 @@ class ExtractedText:
             segment_text = get_textequiv_unicode(text_segment, nsmap)
             # FIXME hardcoded SBB normalization
             segment_text = normalize_sbb(segment_text)
-            segment_text = segment_text or ''
+            segment_text = segment_text or ""
             return cls(segment_id, None, None, segment_text)
         else:
             # Recurse
             sub_localname = children_for_localname[localname]
             sub_textequiv_level = textequiv_level_for_localname[sub_localname]
             segments = []
-            for sub_segment in text_segment.iterfind('./page:%s' % sub_localname,
-                                                     namespaces=nsmap):
+            for sub_segment in text_segment.iterfind(
+                "./page:%s" % sub_localname, namespaces=nsmap
+            ):
                 segments.append(
                     ExtractedText.from_text_segment(
-                        sub_segment, nsmap,
-                        textequiv_level=sub_textequiv_level)
+                        sub_segment, nsmap, textequiv_level=sub_textequiv_level
+                    )
                 )
             joiner = joiner_for_textequiv_level[sub_textequiv_level]
             return cls(segment_id, segments, joiner, None)
@@ -231,24 +227,24 @@ def invert_dict(d):

 def get_textequiv_unicode(text_segment, nsmap) -> str:
     """Get the TextEquiv/Unicode text of the given PAGE text element."""
-    segment_id = text_segment.attrib['id']
-    textequivs = text_segment.findall('./page:TextEquiv', namespaces=nsmap)
+    segment_id = text_segment.attrib["id"]
+    textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)

     if not textequivs:
-        return ''
+        return ""

     textequiv = get_first_textequiv(textequivs, segment_id)
-    return textequiv.find('./page:Unicode', namespaces=nsmap).text or ''
+    return textequiv.find("./page:Unicode", namespaces=nsmap).text or ""


 def get_first_textequiv(textequivs, segment_id):
     """Get the first TextEquiv based on index or conf order if index is not present."""
-    log = getLogger('processor.OcrdDinglehopperEvaluate')
+    log = getLogger("processor.OcrdDinglehopperEvaluate")
     if len(textequivs) == 1:
         return textequivs[0]

     # try ordering by index
-    indices = np.array([get_attr(te, 'index') for te in textequivs], dtype=float)
+    indices = np.array([get_attr(te, "index") for te in textequivs], dtype=float)
     nan_mask = np.isnan(indices)
     if np.any(~nan_mask):
         if np.any(nan_mask):
@@ -256,10 +252,12 @@ def get_first_textequiv(textequivs, segment_id):
         index = np.nanargmin(indices)
     else:
         # try ordering by conf
-        confidences = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float)
+        confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
         if np.any(~np.isnan(confidences)):
-            log.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
-                     segment_id)
+            log.info(
+                "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
+                segment_id,
+            )
             index = np.nanargmax(confidences)
         else:
             # fallback to first entry in case of neither index or conf present
@@ -17,24 +17,27 @@ def alto_namespace(tree: ET.ElementTree) -> str:
     check if the files uses any valid ALTO namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
-    if root_name.localname == 'alto':
+    if root_name.localname == "alto":
         return root_name.namespace
     else:
-        raise ValueError('Not an ALTO tree')
+        raise ValueError("Not an ALTO tree")


 def alto_extract_lines(tree: ET.ElementTree) -> Generator[ExtractedText, None, None]:
-    nsmap = {'alto': alto_namespace(tree)}
-    for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap):
-        line_id = line.attrib.get('ID')
-        line_text = ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
+    nsmap = {"alto": alto_namespace(tree)}
+    for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
+        line_id = line.attrib.get("ID")
+        line_text = " ".join(
+            string.attrib.get("CONTENT")
+            for string in line.iterfind("alto:String", namespaces=nsmap)
+        )
         yield ExtractedText(line_id, None, None, normalize_sbb(line_text))
         # FIXME hardcoded SBB normalization


 def alto_extract(tree: ET.ElementTree()) -> ExtractedText:
     """Extract text from the given ALTO ElementTree."""
-    return ExtractedText(None, list(alto_extract_lines(tree)), '\n', None)
+    return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None)


 def alto_text(tree):
@@ -48,56 +51,73 @@ def page_namespace(tree):
     do not check if the files uses any valid PAGE namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
-    if root_name.localname == 'PcGts':
+    if root_name.localname == "PcGts":
         return root_name.namespace
     else:
-        raise ValueError('Not a PAGE tree')
+        raise ValueError("Not a PAGE tree")


-def page_extract(tree, *, textequiv_level='region'):
+def page_extract(tree, *, textequiv_level="region"):
     """Extract text from the given PAGE content ElementTree."""

     # Internally, this is just parsing the Reading Order (if it exists) and
     # and leaves reading the TextRegions to ExtractedText.from_text_segment().

-    nsmap = {'page': page_namespace(tree)}
+    nsmap = {"page": page_namespace(tree)}

     regions = []
-    reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
+    reading_order = tree.find(".//page:ReadingOrder", namespaces=nsmap)
     if reading_order is not None:
-        for group in reading_order.iterfind('./*', namespaces=nsmap):
-            if ET.QName(group.tag).localname == 'OrderedGroup':
-                region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap)
-                for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: int(r.attrib['index'])):
-                    region_id = region_ref_indexed.attrib['regionRef']
-                    region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
+        for group in reading_order.iterfind("./*", namespaces=nsmap):
+            if ET.QName(group.tag).localname == "OrderedGroup":
+                region_ref_indexeds = group.findall(
+                    "./page:RegionRefIndexed", namespaces=nsmap
+                )
+                for region_ref_indexed in sorted(
+                    region_ref_indexeds, key=lambda r: int(r.attrib["index"])
+                ):
+                    region_id = region_ref_indexed.attrib["regionRef"]
+                    region = tree.find(
+                        './/page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap
+                    )
                     if region is not None:
-                        regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
+                        regions.append(
+                            ExtractedText.from_text_segment(
+                                region, nsmap, textequiv_level=textequiv_level
+                            )
+                        )
                     else:
                         pass  # Not a TextRegion
             else:
                 raise NotImplementedError
     else:
-        for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
-            regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
+        for region in tree.iterfind(".//page:TextRegion", namespaces=nsmap):
+            regions.append(
+                ExtractedText.from_text_segment(
+                    region, nsmap, textequiv_level=textequiv_level
+                )
+            )

     # Filter empty region texts
-    regions = [r for r in regions if r.text != '']
+    regions = [r for r in regions if r.text != ""]

-    return ExtractedText(None, regions, '\n', None)
+    return ExtractedText(None, regions, "\n", None)


-def page_text(tree, *, textequiv_level='region'):
+def page_text(tree, *, textequiv_level="region"):
     return page_extract(tree, textequiv_level=textequiv_level).text


 def plain_extract(filename):
-    with open(filename, 'r') as f:
+    with open(filename, "r") as f:
         return ExtractedText(
-            None,
-            [ExtractedText('line %d' % no, None, None, line) for no, line in enumerate(f.readlines())],
-            '\n',
-            None
+            None,
+            [
+                ExtractedText("line %d" % no, None, None, line)
+                for no, line in enumerate(f.readlines())
+            ],
+            "\n",
+            None,
         )
@@ -105,7 +125,7 @@ def plain_text(filename):
     return plain_extract(filename).text


-def extract(filename, *, textequiv_level='region'):
+def extract(filename, *, textequiv_level="region"):
     """Extract the text from the given file.

     Supports PAGE, ALTO and falls back to plain text.
@@ -124,5 +144,5 @@ def text(filename):
     return extract(filename).text


-if __name__ == '__main__':
+if __name__ == "__main__":
     print(text(sys.argv[1]))
@@ -10,7 +10,7 @@ from pkg_resources import resource_string
 from .cli import process as cli_process
 from .edit_distance import levenshtein_matrix_cache_clear

-OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
+OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))


 @click.command()
@@ -20,20 +20,19 @@ def ocrd_dinglehopper(*args, **kwargs):


 class OcrdDinglehopperEvaluate(Processor):
-
     def __init__(self, *args, **kwargs):
-        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper']
+        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
         super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)

     def process(self):
-        assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR')
+        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
         assert_file_grp_cardinality(self.output_file_grp, 1)

-        log = getLogger('processor.OcrdDinglehopperEvaluate')
+        log = getLogger("processor.OcrdDinglehopperEvaluate")

-        metrics = self.parameter['metrics']
-        textequiv_level = self.parameter['textequiv_level']
-        gt_grp, ocr_grp = self.input_file_grp.split(',')
+        metrics = self.parameter["metrics"]
+        textequiv_level = self.parameter["textequiv_level"]
+        gt_grp, ocr_grp = self.input_file_grp.split(",")

         input_file_tuples = self._zip_input_files([gt_grp, ocr_grp])
         for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
@@ -55,40 +54,47 @@ class OcrdDinglehopperEvaluate(Processor):
             except FileExistsError:
                 pass
             cli_process(
-                gt_file.local_filename,
-                ocr_file.local_filename,
-                report_prefix,
-                metrics=metrics,
-                textequiv_level=textequiv_level
+                gt_file.local_filename,
+                ocr_file.local_filename,
+                report_prefix,
+                metrics=metrics,
+                textequiv_level=textequiv_level,
             )

             # Add reports to the workspace
-            for report_suffix, mimetype in \
-                    [
-                        ['.html', 'text/html'],
-                        ['.json', 'application/json']
-                    ]:
+            for report_suffix, mimetype in [
+                [".html", "text/html"],
+                [".json", "application/json"],
+            ]:
                 self.workspace.add_file(
-                    ID=file_id + report_suffix,
-                    file_grp=self.output_file_grp,
-                    pageId=page_id,
-                    mimetype=mimetype,
-                    local_filename=report_prefix + report_suffix)
+                    ID=file_id + report_suffix,
+                    file_grp=self.output_file_grp,
+                    pageId=page_id,
+                    mimetype=mimetype,
+                    local_filename=report_prefix + report_suffix,
+                )

             # Clear cache between files
             levenshtein_matrix_cache_clear()

     def _zip_input_files(self, input_file_grps):
-        log = getLogger('processor.OcrdDinglehopperEvaluate')
+        log = getLogger("processor.OcrdDinglehopperEvaluate")
         input_file_tuples = list()
-        for page_id in ([self.page_id] if self.page_id else
-                        self.workspace.mets.physical_pages):
+        for page_id in (
+            [self.page_id] if self.page_id else self.workspace.mets.physical_pages
+        ):
             ifiles = list()
             for input_file_grp in input_file_grps:
-                log.debug("Adding input file group %s to page %s", input_file_grp, page_id)
-                files = self.workspace.mets.find_all_files(pageId=page_id, fileGrp=input_file_grp)
+                log.debug(
+                    "Adding input file group %s to page %s", input_file_grp, page_id
+                )
+                files = self.workspace.mets.find_all_files(
+                    pageId=page_id, fileGrp=input_file_grp
+                )
                 if not files:
-                    log.error('Found no page "%s" in file group %s', page_id, input_file_grp)
+                    log.error(
+                        'Found no page "%s" in file group %s', page_id, input_file_grp
+                    )
                     ifiles.append(None)
                 else:
                     ifiles.append(files[0])
@@ -97,5 +103,5 @@ class OcrdDinglehopperEvaluate(Processor):
         return input_file_tuples


-if __name__ == '__main__':
+if __name__ == "__main__":
     ocrd_dinglehopper()
@@ -10,25 +10,30 @@ from .. import seq_align, ExtractedText


 def test_text():
-    test1 = ExtractedText(None, [
-        ExtractedText('s0', None, None, 'foo'),
-        ExtractedText('s1', None, None, 'bar'),
-        ExtractedText('s2', None, None, 'bazinga')
-    ], ' ', None)
+    test1 = ExtractedText(
+        None,
+        [
+            ExtractedText("s0", None, None, "foo"),
+            ExtractedText("s1", None, None, "bar"),
+            ExtractedText("s2", None, None, "bazinga"),
+        ],
+        " ",
+        None,
+    )

-    assert test1.text == 'foo bar bazinga'
-    assert test1.segment_id_for_pos(0) == 's0'
+    assert test1.text == "foo bar bazinga"
+    assert test1.segment_id_for_pos(0) == "s0"
     assert test1.segment_id_for_pos(3) is None
-    assert test1.segment_id_for_pos(10) == 's2'
+    assert test1.segment_id_for_pos(10) == "s2"


 def test_normalization_check():
-    with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
-        ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
-    assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))
+    with pytest.raises(ValueError, match=r".*is not in NFC.*"):
+        ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
+    assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))


-AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
+AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")


 def test_align():
@@ -39,25 +44,36 @@ def test_align():
     not Python characters.
     """

-    test1 = ExtractedText(None, [
-        ExtractedText('s0', None, None, 'foo'),
-        ExtractedText('s1', None, None, 'bar'),
-        ExtractedText('s2', None, None, 'batzinga')
-    ], ' ', None)
-    test2 = ExtractedText(None, [
-        ExtractedText('x0', None, None, 'foo'),
-        ExtractedText('x1', None, None, 'bar'),
-        # extra .
-        ExtractedText('x2', None, None, '.'),
-        # deletion + different grapheme cluster, m̃ also is two Python characters
-        ExtractedText('x3', None, None, 'bazim̃ga'),
-    ], ' ', None)
+    test1 = ExtractedText(
+        None,
+        [
+            ExtractedText("s0", None, None, "foo"),
+            ExtractedText("s1", None, None, "bar"),
+            ExtractedText("s2", None, None, "batzinga"),
+        ],
+        " ",
+        None,
+    )
+    test2 = ExtractedText(
+        None,
+        [
+            ExtractedText("x0", None, None, "foo"),
+            ExtractedText("x1", None, None, "bar"),
+            # extra .
+            ExtractedText("x2", None, None, "."),
+            # deletion + different grapheme cluster, m̃ also is two Python characters
+            ExtractedText("x3", None, None, "bazim̃ga"),
+        ],
+        " ",
+        None,
+    )

     left_pos = 0
     right_pos = 0
     alignment = []
-    for left, right in seq_align(grapheme_clusters(test1.text),
-                                 grapheme_clusters(test2.text)):
+    for left, right in seq_align(
+        grapheme_clusters(test1.text), grapheme_clusters(test2.text)
+    ):
         left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
         right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
         el = AlignmentElement(left, right, left_id, right_id)
@@ -67,46 +83,57 @@ def test_align():
         if right is not None:
             right_pos += len(right)

-    print('test1: {}'.format(test1.text))
-    print('test2: {}'.format(test2.text))
+    print("test1: {}".format(test1.text))
+    print("test2: {}".format(test2.text))

-    assert alignment[0] == ('f', 'f', 's0', 'x0')
-    assert alignment[8] == (None, '.', None, 'x2')
-    assert alignment[12] == ('t', None, 's2', None)
-    assert alignment[15] == ('n', 'm̃', 's2', 'x3')
+    assert alignment[0] == ("f", "f", "s0", "x0")
+    assert alignment[8] == (None, ".", None, "x2")
+    assert alignment[12] == ("t", None, "s2", None)
+    assert alignment[15] == ("n", "m̃", "s2", "x3")


-@pytest.mark.parametrize("attributes,expected_index,expected_log", [
-    ([], None, None),
-    (['index="0"'], 0, None),
-    ([''], 0, None),
-    (['conf="0.5"'], 0, None),
-    (['index="1"', 'index="0"'], 1, None),
-    (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
-    (['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
-     "No index attributes, use 'conf' attribute to sort TextEquiv"),
-    (['index="0"', ''], 0, "TextEquiv without index"),
-    (['', 'conf="0.4"'], 1,
-     "No index attributes, use 'conf' attribute to sort TextEquiv"),
-    (['', ''], 0, "No index attributes, use first TextEquiv"),
-])
+@pytest.mark.parametrize(
+    "attributes,expected_index,expected_log",
+    [
+        ([], None, None),
+        (['index="0"'], 0, None),
+        ([""], 0, None),
+        (['conf="0.5"'], 0, None),
+        (['index="1"', 'index="0"'], 1, None),
+        (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
+        (
+            ['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
+            2,
+            "No index attributes, use 'conf' attribute to sort TextEquiv",
+        ),
+        (['index="0"', ""], 0, "TextEquiv without index"),
+        (
+            ["", 'conf="0.4"'],
+            1,
+            "No index attributes, use 'conf' attribute to sort TextEquiv",
+        ),
+        (["", ""], 0, "No index attributes, use first TextEquiv"),
+    ],
+)
 def test_textequiv(attributes, expected_index, expected_log, caplog):
     """Test that extracting text from a PAGE TextEquiv is working without index attr."""
     caplog.set_level(logging.INFO)
-    xml = "<?xml version=\"1.0\"?>"
+    xml = '<?xml version="1.0"?>'
     ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
     text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]

-    equiv = ["<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
-             for i, attr in enumerate(attributes)]
+    equiv = [
+        "<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
+        for i, attr in enumerate(attributes)
+    ]

-    textline = "{0}<TextLine id=\"l3\" xmlns=\"{1}\">{2}</TextLine>"
-    textline = textline.format(xml, ns, ''.join(equiv))
+    textline = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'
+    textline = textline.format(xml, ns, "".join(equiv))

     root = ET.fromstring(textline)
-    result = ExtractedText.from_text_segment(root,
-                                             {'page': ns},
-                                             textequiv_level='line').text
+    result = ExtractedText.from_text_segment(
+        root, {"page": ns}, textequiv_level="line"
+    ).text
     if expected_index is None:
         assert not result
     else:
@@ -3,64 +3,85 @@ from .. import align, seq_align, distance


 def test_left_empty():
-    result = list(align('', 'foo'))
-    expected = [(None, 'f'), (None, 'o'), (None, 'o')]
+    result = list(align("", "foo"))
+    expected = [(None, "f"), (None, "o"), (None, "o")]
     assert result == expected


 def test_right_empty():
-    result = list(align('foo', ''))
-    expected = [('f', None), ('o', None), ('o', None)]
+    result = list(align("foo", ""))
+    expected = [("f", None), ("o", None), ("o", None)]
     assert result == expected


 def test_left_longer():
-    result = list(align('food', 'foo'))
-    expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)]
+    result = list(align("food", "foo"))
+    expected = [("f", "f"), ("o", "o"), ("o", "o"), ("d", None)]
    assert result == expected


 def test_right_longer():
-    result = list(align('foo', 'food'))
-    expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')]
+    result = list(align("foo", "food"))
+    expected = [("f", "f"), ("o", "o"), ("o", "o"), (None, "d")]
     assert result == expected


 def test_some_diff():
-    result = list(align('abcde', 'aaadef'))
+    result = list(align("abcde", "aaadef"))
     left, right = unzip(result)
-    assert list(left) == ['a', 'b', 'c', 'd', 'e', None]
-    assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f']
+    assert list(left) == ["a", "b", "c", "d", "e", None]
+    assert list(right) == ["a", "a", "a", "d", "e", "f"]


 def test_longer():
-    s1 = 'Dies ist eine Tst!'
-    s2 = 'Dies ist ein Test.'
+    s1 = "Dies ist eine Tst!"
+    s2 = "Dies ist ein Test."

     result = list(align(s1, s2))  # ; diffprint(*unzip(result))
-    expected = [('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '),
-                ('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '),
-                ('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '),
-                ('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.')]
+    expected = [
+        ("D", "D"),
+        ("i", "i"),
+        ("e", "e"),
+        ("s", "s"),
+        (" ", " "),
+        ("i", "i"),
+        ("s", "s"),
+        ("t", "t"),
+        (" ", " "),
+        ("e", "e"),
+        ("i", "i"),
+        ("n", "n"),
+        ("e", None),
+        (" ", " "),
+        ("T", "T"),
+        (None, "e"),
+        ("s", "s"),
+        ("t", "t"),
+        ("!", "."),
+    ]
     assert result == expected


 def test_completely_different():
-    assert len(list(align('abcde', 'fghij'))) == 5
+    assert len(list(align("abcde", "fghij"))) == 5


 def test_with_some_fake_ocr_errors():
-    result = list(align('Über die vielen Sorgen wegen desselben vergaß',
-                        'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab'))
+    result = list(
+        align(
+            "Über die vielen Sorgen wegen desselben vergaß",
+            "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
+        )
+    )
     left, right = unzip(result)

     # Beginning
-    assert list(left[:18]) == [None]*18
-    assert list(right[:18]) == list('SomeJunk MoreJunk ')
+    assert list(left[:18]) == [None] * 18
+    assert list(right[:18]) == list("SomeJunk MoreJunk ")

     # End
-    assert list(left[-1:]) == ['ß']
-    assert list(right[-1:]) == ['b']
+    assert list(left[-1:]) == ["ß"]
+    assert list(right[-1:]) == ["b"]


 def test_lines():
@@ -68,13 +89,30 @@ def test_lines():

     This mainly serves as documentation for comparing lists of lines.
     """
-    result = list(seq_align(
-        ['This is a line.', 'This is another', 'And the last line'],
-        ['This is a line.', 'This is another', 'J u n k', 'And the last line']
-    ))
+    result = list(
+        seq_align(
+            ["This is a line.", "This is another", "And the last line"],
+            [
+                "This is a line.",
+                "This is another",
+                "J u n k",
+                "And the last line",
+            ],
+        )
+    )
     left, right = unzip(result)
-    assert list(left) == ['This is a line.', 'This is another', None, 'And the last line']
-    assert list(right) == ['This is a line.', 'This is another', 'J u n k', 'And the last line']
+    assert list(left) == [
+        "This is a line.",
+        "This is another",
+        None,
+        "And the last line",
+    ]
+    assert list(right) == [
+        "This is a line.",
+        "This is another",
+        "J u n k",
+        "And the last line",
+    ]


 def test_lines_similar():
@@ -92,7 +130,7 @@ def test_lines_similar():
             # Just an example!
             min_len = min(len(self._string), len(other._string))
             if min_len > 0:
-                normalized_distance = distance(self._string, other._string)/min_len
+                normalized_distance = distance(self._string, other._string) / min_len
                 similar = normalized_distance < 0.1
             else:
                 similar = False
@@ -102,18 +140,39 @@ def test_lines_similar():
             return not self.__eq__(other)

         def __repr__(self):
-            return 'SimilarString(\'%s\')' % self._string
+            return "SimilarString('%s')" % self._string

         def __hash__(self):
             return hash(self._string)

-    result = list(seq_align(
-        [SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')],
-        [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
-    ))
+    result = list(
+        seq_align(
+            [
+                SimilarString("This is a line."),
+                SimilarString("This is another"),
+                SimilarString("And the last line"),
+            ],
+            [
+                SimilarString("This is a ljne."),
+                SimilarString("This is another"),
+                SimilarString("J u n k"),
+                SimilarString("And the last line"),
+            ],
+        )
+    )
     left, right = unzip(result)
-    assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')]
-    assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
+    assert list(left) == [
+        SimilarString("This is a line."),
+        SimilarString("This is another"),
+        None,
+        SimilarString("And the last line"),
+    ]
+    assert list(right) == [
+        SimilarString("This is a ljne."),
+        SimilarString("This is another"),
+        SimilarString("J u n k"),
+        SimilarString("And the last line"),
+    ]

     # Test __eq__ (i.e. is it a substitution or a similar string?)
     assert list(left)[0] == list(right)[0]
@@ -7,31 +7,35 @@ from .. import character_error_rate


 def test_character_error_rate():
-    assert character_error_rate('a', 'a') == 0
-    assert character_error_rate('a', 'b') == 1/1
-    assert character_error_rate('Foo', 'Bar') == 3/3
+    assert character_error_rate("a", "a") == 0
+    assert character_error_rate("a", "b") == 1 / 1
+    assert character_error_rate("Foo", "Bar") == 3 / 3

-    assert character_error_rate('Foo', '') == 3/3
+    assert character_error_rate("Foo", "") == 3 / 3

-    assert character_error_rate('', '') == 0
-    assert math.isinf(character_error_rate('', 'Foo'))
+    assert character_error_rate("", "") == 0
+    assert math.isinf(character_error_rate("", "Foo"))

-    assert character_error_rate('Foo', 'Food') == 1/3
-    assert character_error_rate('Fnord', 'Food') == 2/5
-    assert character_error_rate('Müll', 'Mull') == 1/4
-    assert character_error_rate('Abstand', 'Sand') == 4/7
+    assert character_error_rate("Foo", "Food") == 1 / 3
+    assert character_error_rate("Fnord", "Food") == 2 / 5
+    assert character_error_rate("Müll", "Mull") == 1 / 4
+    assert character_error_rate("Abstand", "Sand") == 4 / 7


 def test_character_error_rate_hard():
-    s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.')
-    s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!')  # Different, decomposed!
-    assert character_error_rate(s1, s2) == 1/19
+    s1 = unicodedata.normalize("NFC", "Schlyñ lorem ipsum.")
+    s2 = unicodedata.normalize("NFD", "Schlyñ lorem ipsum!")  # Different, decomposed!
+    assert character_error_rate(s1, s2) == 1 / 19

-    s1 = 'Schlyñ'
-    assert len(s1) == 6  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    s2 = 'Schlym̃'
-    assert len(s2) == 7  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+    s1 = "Schlyñ"
+    assert (
+        len(s1) == 6
+    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+    s2 = "Schlym̃"
+    assert (
+        len(s2) == 7
+    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points

     # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
-    assert character_error_rate(s2, s1) == 1/6
-    assert character_error_rate(s1, s2) == 1/6
+    assert character_error_rate(s2, s1) == 1 / 6
+    assert character_error_rate(s1, s2) == 1 / 6
@@ -6,35 +6,39 @@ from .. import levenshtein, distance


 def test_levenshtein():
-    assert levenshtein('a', 'a') == 0
-    assert levenshtein('a', 'b') == 1
-    assert levenshtein('Foo', 'Bar') == 3
+    assert levenshtein("a", "a") == 0
+    assert levenshtein("a", "b") == 1
+    assert levenshtein("Foo", "Bar") == 3

-    assert levenshtein('', '') == 0
-    assert levenshtein('Foo', '') == 3
-    assert levenshtein('', 'Foo') == 3
+    assert levenshtein("", "") == 0
+    assert levenshtein("Foo", "") == 3
+    assert levenshtein("", "Foo") == 3

-    assert levenshtein('Foo', 'Food') == 1
-    assert levenshtein('Fnord', 'Food') == 2
-    assert levenshtein('Müll', 'Mull') == 1
-    assert levenshtein('Abstand', 'Sand') == 4
+    assert levenshtein("Foo", "Food") == 1
+    assert levenshtein("Fnord", "Food") == 2
+    assert levenshtein("Müll", "Mull") == 1
+    assert levenshtein("Abstand", "Sand") == 4


 def test_levenshtein_other_sequences():
-    assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1
-    assert levenshtein(['a', 'ab'], ['a', 'c']) == 1
+    assert levenshtein(["a", "ab"], ["a", "ab", "c"]) == 1
+    assert levenshtein(["a", "ab"], ["a", "c"]) == 1


 def test_distance():
-    assert distance('Fnord', 'Food') == 2
-    assert distance('Müll', 'Mull') == 1
+    assert distance("Fnord", "Food") == 2
+    assert distance("Müll", "Mull") == 1

-    word1 = unicodedata.normalize('NFC', 'Schlyñ')
-    word2 = unicodedata.normalize('NFD', 'Schlyñ')  # Different, decomposed!
+    word1 = unicodedata.normalize("NFC", "Schlyñ")
+    word2 = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
     assert distance(word1, word2) == 0

-    word1 = 'Schlyñ'
-    assert len(word1) == 6  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    word2 = 'Schlym̃'
-    assert len(word2) == 7  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+    word1 = "Schlyñ"
+    assert (
+        len(word1) == 6
+    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+    word2 = "Schlym̃"
+    assert (
+        len(word2) == 7
+    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
     assert distance(word1, word2) == 1
@@ -4,45 +4,60 @@ from .. import seq_editops, editops


 def test_trivial():
-    assert seq_editops('abc', 'abc') == []
-    assert seq_editops('', '') == []
+    assert seq_editops("abc", "abc") == []
+    assert seq_editops("", "") == []


 def test_insert():
-    assert seq_editops('bc', 'abc') == [('insert', 0, 0)]
-    assert seq_editops('ac', 'abc') == [('insert', 1, 1)]
-    assert seq_editops('ab', 'abc') == [('insert', 2, 2)]
-    assert seq_editops('', 'a') == [('insert', 0, 0)]
+    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
+    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
+    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
+    assert seq_editops("", "a") == [("insert", 0, 0)]


 def test_multiple():
-    assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)]
+    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]


 def test_delete():
-    assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)]
-    assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)]
-    assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)]
-    assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)]
-    assert seq_editops('Foo', '') == [('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0)]
-    assert seq_editops('Foolish', 'Foo') == [('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3)]
+    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
+    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
+    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
+    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
+    assert seq_editops("Foo", "") == [
+        ("delete", 0, 0),
+        ("delete", 1, 0),
+        ("delete", 2, 0),
+    ]
+    assert seq_editops("Foolish", "Foo") == [
+        ("delete", 3, 3),
+        ("delete", 4, 3),
+        ("delete", 5, 3),
+        ("delete", 6, 3),
+    ]


 def test_ambiguous():
-    assert seq_editops('bcd', 'abcef') == [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)]
+    assert seq_editops("bcd", "abcef") == [
+        ("insert", 0, 0),
+        ("replace", 2, 3),
+        ("insert", 3, 4),
+    ]


 def test_editops():
     """Test editops() in cases where dealing with grapheme clusters matters"""

     # In these cases, one of the words has a composed form, the other one does not.
-    assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
-    assert editops('oͤde', 'öde') == [('replace', 0, 0)]
+    assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
+    assert editops("oͤde", "öde") == [("replace", 0, 0)]


 def test_editops_canonically_equivalent():
-    left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE')
-    right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')
+    left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
+        "COMBINING TILDE"
+    )
+    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
     assert left != right
-    assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right)
+    assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
     assert editops(left, right) == []
@@ -7,7 +7,7 @@ from lxml import etree as ET

 from .. import align, page_text

-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


 @pytest.mark.integration
@@ -17,8 +17,8 @@ def test_align_page_files():
     # (currently) not counted due to normalization.
     # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.

-    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
-    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
+    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
+    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))

     result = list(align(gt, ocr))
     for left, right in result:
@@ -8,26 +8,34 @@ from uniseg.graphemecluster import grapheme_clusters

 from .. import character_error_rate, page_text, alto_text

-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


 @pytest.mark.integration
 def test_character_error_rate_between_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
     # The fi ligature does not count.
-    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
-    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
+    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
+    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))

     gt_len = len(list(grapheme_clusters(gt)))
-    expected_cer = 2/gt_len
+    expected_cer = 2 / gt_len

     assert character_error_rate(gt, ocr) == expected_cer


 @pytest.mark.integration
 def test_character_error_rate_between_page_alto():
-    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
-    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
+    gt = page_text(
+        ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
+    )
+    ocr = alto_text(
+        ET.parse(
+            os.path.join(
+                data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
+            )
+        )
+    )

     assert gt == ocr
     assert character_error_rate(gt, ocr) == 0
@@ -35,7 +43,17 @@ def test_character_error_rate_between_page_alto():

@pytest.mark.integration
def test_character_error_rate_between_page_alto_2():
    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
    gt = page_text(
        ET.parse(
            os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
        )
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
            )
        )
    )

    assert character_error_rate(gt, ocr) == 8/591 # Manually verified
    assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified
@@ -10,31 +10,31 @@ def test_cli_json(tmp_path):
    """Test that the cli/process() yields a loadable JSON report"""

    with working_directory(str(tmp_path)):
        with open('gt.txt', 'w') as gtf:
            gtf.write('AAAAA')
        with open('ocr.txt', 'w') as ocrf:
            ocrf.write('AAAAB')
        with open("gt.txt", "w") as gtf:
            gtf.write("AAAAA")
        with open("ocr.txt", "w") as ocrf:
            ocrf.write("AAAAB")

        with open('gt.txt', 'r') as gtf:
        with open("gt.txt", "r") as gtf:
            print(gtf.read())
        process('gt.txt', 'ocr.txt', 'report')
        with open('report.json', 'r') as jsonf:
        process("gt.txt", "ocr.txt", "report")
        with open("report.json", "r") as jsonf:
            print(jsonf.read())
        with open('report.json', 'r') as jsonf:
        with open("report.json", "r") as jsonf:
            j = json.load(jsonf)
            assert j['cer'] == pytest.approx(0.2)
            assert j["cer"] == pytest.approx(0.2)


def test_cli_json_cer_is_infinity(tmp_path):
    """Test that the cli/process() yields a loadable JSON report when CER == inf"""

    with working_directory(str(tmp_path)):
        with open('gt.txt', 'w') as gtf:
            gtf.write('') # Empty to yield CER == inf
        with open('ocr.txt', 'w') as ocrf:
            ocrf.write('Not important')
        with open("gt.txt", "w") as gtf:
            gtf.write("") # Empty to yield CER == inf
        with open("ocr.txt", "w") as ocrf:
            ocrf.write("Not important")

        process('gt.txt', 'ocr.txt', 'report')
        with open('report.json', 'r') as jsonf:
        process("gt.txt", "ocr.txt", "report")
        with open("report.json", "r") as jsonf:
            j = json.load(jsonf)
            assert j['cer'] == pytest.approx(float('inf'))
            assert j["cer"] == pytest.approx(float("inf"))
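The 0.2 checked above follows from the definition these tests exercise: the character error rate is the edit distance divided by the number of grapheme clusters in the ground truth. A rough, self-contained sketch (naive Levenshtein, not dinglehopper's implementation):

import unicodedata
from uniseg.graphemecluster import grapheme_clusters

def levenshtein(a, b):
    # Plain dynamic-programming edit distance between two sequences.
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (x != y)))
        prev = cur
    return prev[-1]

def cer_sketch(reference, compared):
    # CER = edits / grapheme clusters in the reference; inf for a non-matching empty reference.
    ref = list(grapheme_clusters(unicodedata.normalize("NFC", reference)))
    cmp_ = list(grapheme_clusters(unicodedata.normalize("NFC", compared)))
    d = levenshtein(ref, cmp_)
    if d == 0:
        return 0.0
    return float("inf") if not ref else d / len(ref)

assert cer_sketch("AAAAA", "AAAAB") == 0.2              # the report value checked above
assert cer_sketch("", "Not important") == float("inf")  # the CER == inf case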
@@ -7,7 +7,7 @@ from lxml import etree as ET

from .. import distance, page_text, alto_text

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
@@ -15,15 +15,23 @@ def test_distance_between_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    # Due to normalization, we don't count the ligature.
    # → 2 differences
    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
    assert distance(gt, ocr) == 2


@pytest.mark.integration
def test_distance_between_page_alto():
    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
    gt = page_text(
        ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
            )
        )
    )

    assert gt == ocr
    assert distance(gt, ocr) == 0
@@ -31,7 +39,17 @@ def test_distance_between_page_alto():

@pytest.mark.integration
def test_distance_between_page_alto_2():
    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
    gt = page_text(
        ET.parse(
            os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
        )
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
            )
        )
    )

    assert distance(gt, ocr) == 8 # Manually verified
@@ -10,27 +10,32 @@ from .util import working_directory

from ..ocrd_cli import ocrd_dinglehopper

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


def test_ocrd_cli(tmp_path):
    """Test OCR-D interface"""

    # Copy test workspace
    test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162'
    test_workspace_dir = tmp_path / 'test_ocrd_cli'
    test_workspace_dir_source = Path(data_dir) / "actevedef_718448162"
    test_workspace_dir = tmp_path / "test_ocrd_cli"
    shutil.copytree(str(test_workspace_dir_source), str(test_workspace_dir))

    # Run through the OCR-D interface
    with working_directory(str(test_workspace_dir)):
        runner = CliRunner()
        args = [
            '-m', 'mets.xml',
            '-I', 'OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI',
            '-O', 'OCR-D-OCR-CALAMARI-EVAL'
            "-m",
            "mets.xml",
            "-I",
            "OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI",
            "-O",
            "OCR-D-OCR-CALAMARI-EVAL",
        ]
        sys.argv[1:] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
        sys.argv[
            1:
        ] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
        result = runner.invoke(ocrd_dinglehopper, args)
        assert result.exit_code == 0
        result_json = list((test_workspace_dir / 'OCR-D-OCR-CALAMARI-EVAL').glob('*.json'))
        assert json.load(open(str(result_json[0])))['cer'] < 0.03
        result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json"))
        assert json.load(open(str(result_json[0])))["cer"] < 0.03
@@ -7,26 +7,36 @@ from lxml import etree as ET

from .. import word_error_rate, words, page_text, alto_text

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
def test_word_error_rate_between_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
    # the ligature does not count → 2 errors
    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))

    gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line
    gt_word_count = (
        7 + 6 + 5 + 8 + 7 + 6 + 7 + 8 + 6 + 7 + 7 + 5 + 6 + 8 + 8 + 7 + 7 + 6 + 5 + 4
    ) # Manually verified word count per line
    assert len(list(words(gt))) == gt_word_count

    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
    assert word_error_rate(gt, ocr) == 2/gt_word_count
    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
    assert word_error_rate(gt, ocr) == 2 / gt_word_count


@pytest.mark.integration
def test_word_error_rate_between_page_alto():
    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
    gt = page_text(
        ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
            )
        )
    )

    assert gt == ocr
    assert word_error_rate(gt, ocr) == 0
@@ -34,11 +44,25 @@ def test_word_error_rate_between_page_alto():

@pytest.mark.integration
def test_word_error_rate_between_page_alto_2():
    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
    gt = page_text(
        ET.parse(
            os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
        )
    )

    gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line
    gt_word_count = (
        14 + 18 + 17 + 14 + 17 + 17 + 3
    ) # Manually verified word count per line
    assert len(list(words(gt))) == gt_word_count

    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
            )
        )
    )

    assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors)
    assert (
        word_error_rate(gt, ocr) == 7 / gt_word_count
    ) # Manually verified, 6 words are wrong, 1 got split (=2 errors)
@@ -9,46 +9,54 @@ import pytest
from .util import working_directory
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


def test_alto_namespace():
    tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
    assert alto_namespace(tree) == 'http://www.loc.gov/standards/alto/ns-v3#'
    tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
    assert alto_namespace(tree) == "http://www.loc.gov/standards/alto/ns-v3#"


def test_alto_text():
    tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
    tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
    result = alto_text(tree)
    expected = textwrap.dedent("""\
    expected = textwrap.dedent(
        """\
        über die vielen Sorgen wegen deſſelben vergaß
        Hartkopf, der Frau Amtmännin das ver-
        ſprochene zu überliefern.""")
        ſprochene zu überliefern."""
    )
    assert result == expected


def test_alto_text_ALTO1():
    tree = ET.parse(os.path.join(data_dir, 'test.alto1.xml'))
    tree = ET.parse(os.path.join(data_dir, "test.alto1.xml"))
    assert "being erected at the Broadway stock" in alto_text(tree)


def test_alto_text_ALTO2():
    tree = ET.parse(os.path.join(data_dir, 'test.alto2.xml'))
    assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in alto_text(tree)
    tree = ET.parse(os.path.join(data_dir, "test.alto2.xml"))
    assert (
        "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden"
        in alto_text(tree)
    )


def test_alto_text_ALTO3():
    tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
    tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
    assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree)


def test_page_namespace():
    tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
    assert page_namespace(tree) == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15'
    tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
    assert (
        page_namespace(tree)
        == "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
    )


def test_page_test():
    tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
    tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
    result = page_text(tree)

    # We are currently normalizing on extraction, so the text is normalized.
@@ -74,7 +82,8 @@ def test_page_test():
    # Jndeß mangelten do einige Generalia, die
    # alſo wegfielen. — Hartkopf gieng ſelb
    # mit und berbrate es. —""")
    expected = textwrap.dedent("""\
    expected = textwrap.dedent(
        """\
        über die vielen Sorgen wegen deſſelben vergaß
        Hartkopf, der Frau Amtmännin das ver-
        ſprochene zu überliefern. – Ein Erpreſſer
@@ -94,7 +103,8 @@ def test_page_test():
        ſie das, was da wäre, herbeyſchaffen möchte.
        Jndeß mangelten doch einige Generalia, die
        alſo wegfielen. – Hartkopf gieng ſelbſt
        mit und überbrachte es. –""")
        mit und überbrachte es. –"""
    )
    assert result == expected
@@ -107,56 +117,69 @@ def test_page_with_empty_region():
    # <Unicode></Unicode>
    # </TextEquiv>
    # </TextRegion>
    tree = ET.parse(os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml'))
    tree = ET.parse(
        os.path.join(data_dir, "brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml")
    )
    result = page_text(tree)
    assert result


def test_page_order():
    # This file contains TextRegions where file order is not the same as reading order.
    tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
    tree = ET.parse(os.path.join(data_dir, "order.page.xml"))
    result = page_text(tree)

    print(result)
    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)
    assert re.search(
        r"Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die",
        result,
        re.DOTALL,
    )


def test_page_mixed_regions():
    # This file contains ImageRegions and TextRegions in the ReadingOrder
    tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml'))
    tree = ET.parse(os.path.join(data_dir, "mixed-regions.page.xml"))
    result = page_text(tree)

    assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result
    assert "non exaudiam uos. Chriſtiani uero quia orant iuxta" in result


def test_page_level():
    # This file contains inconsistent TextRegion and TextLine texts

    # TextRegion
    tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
    tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
    result = page_text(tree)
    assert result == 'Inconsistent dummy region text'
    tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
    result = page_text(tree, textequiv_level='region')
    assert result == 'Inconsistent dummy region text'
    assert result == "Inconsistent dummy region text"
    tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
    result = page_text(tree, textequiv_level="region")
    assert result == "Inconsistent dummy region text"

    # TextLine
    tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
    result = page_text(tree, textequiv_level='line')
    assert result == 'Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-'
    tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
    result = page_text(tree, textequiv_level="line")
    assert (
        result
        == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
    )


def test_text():
    assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
    assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
    assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
    assert "being erected at the Broadway stock" in text(
        os.path.join(data_dir, "test.alto1.xml")
    )
    assert "wieder ein. – Er langte den Zettel aus dem" in text(
        os.path.join(data_dir, "test.page2018.xml")
    )
    assert "Lorem ipsum" in text(os.path.join(data_dir, "test.txt"))


def test_plain(tmp_path):
    with working_directory(str(tmp_path)):
        with open('ocr.txt', 'w') as ocrf:
            ocrf.write('AAAAB')
        with open("ocr.txt", "w") as ocrf:
            ocrf.write("AAAAB")

        result = plain_text('ocr.txt')
        expected = 'AAAAB'
        result = plain_text("ocr.txt")
        expected = "AAAAB"
        assert result == expected
@@ -6,32 +6,81 @@ from .. import word_error_rate, words


def test_words():
    result = list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?'))
    expected = ['Der', 'schnelle', 'braune', 'Fuchs', 'kann', 'keine', '3,14', 'Meter', 'springen', 'oder']
    result = list(
        words("Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?")
    )
    expected = [
        "Der",
        "schnelle",
        "braune",
        "Fuchs",
        "kann",
        "keine",
        "3,14",
        "Meter",
        "springen",
        "oder",
    ]
    assert result == expected


def test_words_private_use_area():
    result = list(words(
        'ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n'
        'ſproene zu berliefern.'))
    result = list(
        words(
            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
            "ſproene zu berliefern."
        )
    )
    expected = [
        'ber', 'die', 'vielen', 'Sorgen', 'wegen', 'deelben', 'vergaß', 'Hartkopf',
        'der', 'Frau', 'Amtmnnin', 'das', 'ver',
        'ſproene', 'zu', 'berliefern']
        "ber",
        "die",
        "vielen",
        "Sorgen",
        "wegen",
        "deelben",
        "vergaß",
        "Hartkopf",
        "der",
        "Frau",
        "Amtmnnin",
        "das",
        "ver",
        "ſproene",
        "zu",
        "berliefern",
    ]
    assert result == expected


def test_word_error_rate():
    assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0
    assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0
    assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz.') == 0
    assert (
        word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsatz!") == 0
    )
    assert (
        word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz!")
        == 0
    )
    assert (
        word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz.")
        == 0
    )

    assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsarz:') == 1/4
    assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ein ist Beispielsatz!') == 2/4
    assert (
        word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsarz:")
        == 1 / 4
    )
    assert (
        word_error_rate("Dies ist ein Beispielsatz!", "Dies ein ist Beispielsatz!")
        == 2 / 4
    )

    assert word_error_rate('Dies ist ein Beispielsatz!', '') == 4/4
    assert math.isinf(word_error_rate('', 'Dies ist ein Beispielsatz!'))
    assert word_error_rate('', '') == 0
    assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
    assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
    assert word_error_rate("", "") == 0

    assert word_error_rate('Schlyñ lorem ipsum dolor sit amet,', 'Schlym̃ lorem ipsum dolor sit amet.') == 1/6
    assert (
        word_error_rate(
            "Schlyñ lorem ipsum dolor sit amet,", "Schlym̃ lorem ipsum dolor sit amet."
        )
        == 1 / 6
    )
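A rough sketch of the word error rate exercised above: word-level edit distance divided by the number of words in the reference. The whitespace tokenizer is only a stand-in for the project's words() helper, so punctuation is left out of the sample sentences:

def wer_sketch(reference: str, compared: str) -> float:
    ref, hyp = reference.split(), compared.split()
    # Word-level Levenshtein distance (insertions, deletions, substitutions).
    prev = list(range(len(hyp) + 1))
    for i, rw in enumerate(ref, 1):
        cur = [i]
        for j, hw in enumerate(hyp, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (rw != hw)))
        prev = cur
    d = prev[-1]
    if d == 0:
        return 0.0
    return float("inf") if not ref else d / len(ref)

assert wer_sketch("Dies ist ein Beispielsatz", "Dies ist ein Beispielsarz") == 1 / 4  # one substitution
assert wer_sketch("Dies ist ein Beispielsatz", "Dies ein ist Beispielsatz") == 2 / 4  # swapped words cost two edits
assert wer_sketch("Dies ist ein Beispielsatz", "") == 4 / 4                           # everything deleted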
@@ -27,6 +27,7 @@ def unzip(an_iterable_of_tuples):

class working_directory:
    """Context manager to temporarily change the working directory"""

    def __init__(self, wd):
        self.wd = wd
@@ -20,9 +20,10 @@ def words(s: str):

    def new_word_break(c, index=0):
        if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
            return 'ALetter'
            return "ALetter"
        else:
            return old_word_break(c, index)

    uniseg.wordbreak.word_break = new_word_break

    # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
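A hedged, standalone sketch of the monkey-patch above. It assumes, as the code here relies on, that uniseg's word segmentation consults the module-level word_break function:

import uniseg.wordbreak

old_word_break = uniseg.wordbreak.word_break

def new_word_break(c, index=0):
    if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
        return "ALetter"
    return old_word_break(c, index)

uniseg.wordbreak.word_break = new_word_break

# A PUA glyph such as U+E000 now stays inside the surrounding token instead of
# splitting it; the expected output is the single word "ver\ue000liefern".
print(list(uniseg.wordbreak.words("ver\ue000liefern")))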
@@ -30,8 +31,8 @@ def words(s: str):

        # See https://www.fileformat.info/info/unicode/category/index.htm
        # and https://unicodebook.readthedocs.io/unicode.html#categories
        unwanted_categories = 'O', 'M', 'P', 'Z', 'S'
        unwanted_subcategories = 'Cc', 'Cf'
        unwanted_categories = "O", "M", "P", "Z", "S"
        unwanted_subcategories = "Cc", "Cf"

        subcat = unicodedata.category(c)
        cat = subcat[0]
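The category filter above can be tried directly with the standard library; a small illustration mirroring (not copied from) the check these lines belong to:

import unicodedata

unwanted_categories = "O", "M", "P", "Z", "S"
unwanted_subcategories = "Cc", "Cf"

def is_unwanted(c):
    subcat = unicodedata.category(c)  # e.g. "Po", "Zs", "Ll", "Nd"
    return subcat in unwanted_subcategories or subcat[0] in unwanted_categories

assert is_unwanted(",") and is_unwanted(" ")          # punctuation and whitespace are unwanted
assert not is_unwanted("a") and not is_unwanted("3")  # digits are not filtered, consistent with "3,14" surviving in test_words above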
@@ -53,7 +54,7 @@ def words(s: ExtractedText):

@multimethod
def words_normalized(s: str):
    return words(unicodedata.normalize('NFC', s))
    return words(unicodedata.normalize("NFC", s))


@multimethod
@@ -69,7 +70,9 @@ def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:


@multimethod
def word_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
def word_error_rate_n(
    reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
    return word_error_rate_n(reference.text, compared.text)
@@ -84,7 +87,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, i
    if d == 0:
        return 0, n
    if n == 0:
        return float('inf'), n
        return float("inf"), n
    return d / n, n