diff --git a/qurator/__init__.py b/qurator/__init__.py
index 8d17c21..5284146 100644
--- a/qurator/__init__.py
+++ b/qurator/__init__.py
@@ -1,2 +1 @@
-__import__('pkg_resources').declare_namespace(__name__)
-
+__import__("pkg_resources").declare_namespace(__name__)
diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index 87febb7..c7e7733 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -3,8 +3,8 @@ from .edit_distance import *
def align(t1, t2):
"""Align text."""
- s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1)))
- s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2)))
+ s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
+ s2 = list(grapheme_clusters(unicodedata.normalize("NFC", t2)))
return seq_align(s1, s2)
@@ -27,13 +27,13 @@ def seq_align(s1, s2):
pass
if o:
- if o[0] == 'insert':
+ if o[0] == "insert":
yield None, s2[j]
j += 1
- elif o[0] == 'delete':
+ elif o[0] == "delete":
yield s1[i], None
i += 1
- elif o[0] == 'replace':
+ elif o[0] == "replace":
yield s1[i], s2[j]
i += 1
j += 1
diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 2b13f55..0c3ef7d 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -19,19 +19,21 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
"""
d = distance(reference, compared)
- n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
+ n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
if d == 0:
return 0, n
if n == 0:
- return float('inf'), n
- return d/n, n
+ return float("inf"), n
+ return d / n, n
# XXX Should we really count newlines here?
@multimethod
-def character_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
+def character_error_rate_n(
+ reference: ExtractedText, compared: ExtractedText
+) -> Tuple[float, int]:
return character_error_rate_n(reference.text, compared.text)
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 2aef644..09c26f0 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -12,16 +12,17 @@ from .extracted_text import ExtractedText
from .ocr_files import extract
from .config import Config
+
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
- gtx = ''
- ocrx = ''
+ gtx = ""
+ ocrx = ""
def format_thing(t, css_classes=None, id_=None):
if t is None:
html_t = none
- css_classes += ' ellipsis'
- elif t == '\n':
-            html_t = '<br>'
+ css_classes += " ellipsis"
+ elif t == "\n":
+            html_t = "<br>"
else:
html_t = escape(t)
@@ -32,9 +33,13 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
if css_classes:
-            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
+            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
+ css_classes=css_classes,
+ html_t=html_t,
+ html_custom_attrs=html_custom_attrs,
+ )
else:
- return '{html_t}'.format(html_t=html_t)
+ return "{html_t}".format(html_t=html_t)
if isinstance(gt_in, ExtractedText):
if not isinstance(ocr_in, ExtractedText):
@@ -46,8 +51,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
gt_things = gt_in
ocr_things = ocr_in
-
-
g_pos = 0
o_pos = 0
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
@@ -55,7 +58,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
gt_id = None
ocr_id = None
if g != o:
- css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+ css_classes = "{css_prefix}diff{k} diff".format(css_prefix=css_prefix, k=k)
if isinstance(gt_in, ExtractedText):
gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
@@ -70,17 +73,17 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
if o is not None:
o_pos += len(o)
-
- return \
- '''
+    return """
         <div class="row">
            <div class="col-md-6 gt">{}</div>
            <div class="col-md-6 ocr">{}</div>
         </div>
-        '''.format(gtx, ocrx)
+ """.format(
+ gtx, ocrx
+ )
-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
+def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
"""Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
@@ -93,36 +96,47 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text)
- char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
+ char_diff_report = gen_diff_report(
+ gt_text, ocr_text, css_prefix="c", joiner="", none="·"
+ )
gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text)
- word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
+ word_diff_report = gen_diff_report(
+ gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
+ )
def json_float(value):
"""Convert a float value to an JSON float.
This is here so that float('inf') yields "Infinity", not "inf".
"""
- if value == float('inf'):
- return 'Infinity'
- elif value == float('-inf'):
- return '-Infinity'
+ if value == float("inf"):
+ return "Infinity"
+ elif value == float("-inf"):
+ return "-Infinity"
else:
return str(value)
- env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
- env.filters['json_float'] = json_float
+ env = Environment(
+ loader=FileSystemLoader(
+ os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
+ )
+ )
+ env.filters["json_float"] = json_float
- for report_suffix in ('.html', '.json'):
- template_fn = 'report' + report_suffix + '.j2'
+ for report_suffix in (".html", ".json"):
+ template_fn = "report" + report_suffix + ".j2"
out_fn = report_prefix + report_suffix
template = env.get_template(template_fn)
template.stream(
- gt=gt, ocr=ocr,
- cer=cer, n_characters=n_characters,
- wer=wer, n_words=n_words,
+ gt=gt,
+ ocr=ocr,
+ cer=cer,
+ n_characters=n_characters,
+ wer=wer,
+ n_words=n_words,
char_diff_report=char_diff_report,
word_diff_report=word_diff_report,
metrics=metrics,
@@ -130,12 +144,19 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
@click.command()
-@click.argument('gt', type=click.Path(exists=True))
-@click.argument('ocr', type=click.Path(exists=True))
-@click.argument('report_prefix', type=click.Path(), default='report')
-@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
-@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
-@click.option('--progress', default=False, is_flag=True, help='Show progress bar')
+@click.argument("gt", type=click.Path(exists=True))
+@click.argument("ocr", type=click.Path(exists=True))
+@click.argument("report_prefix", type=click.Path(), default="report")
+@click.option(
+ "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
+)
+@click.option(
+ "--textequiv-level",
+ default="region",
+ help="PAGE TextEquiv level to extract text from",
+ metavar="LEVEL",
+)
+@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
"""
Compare the PAGE/ALTO/text document GT against the document OCR.
@@ -159,5 +180,5 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/qurator/dinglehopper/cli_extract.py b/qurator/dinglehopper/cli_extract.py
index a5d36d8..ce49db4 100644
--- a/qurator/dinglehopper/cli_extract.py
+++ b/qurator/dinglehopper/cli_extract.py
@@ -7,8 +7,13 @@ from .ocr_files import extract
@click.command()
-@click.argument('input_file', type=click.Path(exists=True))
-@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
+@click.argument("input_file", type=click.Path(exists=True))
+@click.option(
+ "--textequiv-level",
+ default="region",
+ help="PAGE TextEquiv level to extract text from",
+ metavar="LEVEL",
+)
def main(input_file, textequiv_level):
"""
Extract the text of the given INPUT_FILE.
@@ -23,5 +28,5 @@ def main(input_file, textequiv_level):
print(input_text)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index 721296d..0b9c8f4 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -48,9 +48,10 @@ def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
for i in tqdm(from_to(1, m), disable=not Config.progress):
for j in from_to(1, n):
D[i, j] = min(
- D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution
+ D[i - 1, j - 1]
+ + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution
D[i, j - 1] + 1, # Insertion
- D[i - 1, j] + 1 # Deletion
+ D[i - 1, j] + 1, # Deletion
)
return D
@@ -81,8 +82,8 @@ def distance(s1: str, s2: str):
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
clusters. This should be the correct way to compare two Unicode strings.
"""
- seq1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
- seq2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
+ seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+ seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
return levenshtein(seq1, seq2)
@@ -106,11 +107,17 @@ def seq_editops(seq1, seq2):
def _tail_backtrace(i, j, accumulator):
if i > 0 and D[i - 1, j] + 1 == D[i, j]:
- return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)
+ return partial(
+ _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
+ )
if j > 0 and D[i, j - 1] + 1 == D[i, j]:
- return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)
+ return partial(
+ _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
+ )
if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
- return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)
+ return partial(
+ _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
+ )
if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP
return accumulator
@@ -132,6 +139,6 @@ def editops(word1, word2):
Note that this returns indices to the _grapheme clusters_, not characters!
"""
- word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
- word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
+ word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
+ word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
return seq_editops(word1, word2)
diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index d785754..9703b6b 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -10,6 +10,7 @@ import numpy as np
from lxml import etree as ET
from ocrd_utils import getLogger
+
class Normalization(enum.Enum):
NFC = 1
NFC_MUFI = 2 # TODO
@@ -18,7 +19,7 @@ class Normalization(enum.Enum):
def normalize(text, normalization):
if normalization == Normalization.NFC:
- return unicodedata.normalize('NFC', text)
+ return unicodedata.normalize("NFC", text)
if normalization == Normalization.NFC_MUFI:
raise NotImplementedError()
if normalization == Normalization.NFC_SBB:
@@ -36,31 +37,31 @@ def unjoin_ligatures(s):
"""Unjoin ligatures, i.e. ff becomes ff."""
equivalences = {
- '': 'ſſ',
- "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
- '': 'ch',
- '': 'ck',
- '': 'll',
- '': 'ſi',
- '': 'ſt',
- 'fi': 'fi',
- 'ff': 'ff',
- 'fl': 'fl',
- 'ffi': 'ffi',
- '': 'ct',
- '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
- '\uf532': 'as', # eMOP: Latin small ligature as
- '\uf533': 'is', # eMOP: Latin small ligature is
- '\uf534': 'us', # eMOP: Latin small ligature us
- '\uf535': 'Qu', # eMOP: Latin ligature capital Q small u
- 'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
- '\uE8BF': 'q&',
+ "": "ſſ",
+ "\ueba7": "ſſi", # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
+ "": "ch",
+ "": "ck",
+ "": "ll",
+ "": "ſi",
+ "": "ſt",
+ "fi": "fi",
+ "ff": "ff",
+ "fl": "fl",
+ "ffi": "ffi",
+ "": "ct",
+ "": "tz", # MUFI: LATIN SMALL LIGATURE TZ
+ "\uf532": "as", # eMOP: Latin small ligature as
+ "\uf533": "is", # eMOP: Latin small ligature is
+ "\uf534": "us", # eMOP: Latin small ligature us
+ "\uf535": "Qu", # eMOP: Latin ligature capital Q small u
+ "ij": "ij", # U+0133 LATIN SMALL LIGATURE IJ
+ "\uE8BF": "q&",
# MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
# XXX How to replace this correctly?
- '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
- 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST
+ "\uEBA5": "ſp", # MUFI: LATIN SMALL LIGATURE LONG S P
+ "st": "st", # U+FB06 LATIN SMALL LIGATURE ST
}
- s = unicodedata.normalize('NFC', s)
+ s = unicodedata.normalize("NFC", s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s
@@ -70,20 +71,20 @@ def substitute_equivalences(s):
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR
equivalences = {
- '': 'ü',
- '': 'ä',
- '==': '–', # → en-dash
- '—': '–', # em-dash → en-dash
- '': 'ö',
- '’': '\'',
- '⸗': '-',
- 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
- 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
- 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
- '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
+ "": "ü",
+ "": "ä",
+ "==": "–", # → en-dash
+ "—": "–", # em-dash → en-dash
+ "": "ö",
+ "’": "'",
+ "⸗": "-",
+ "aͤ": "ä", # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
+ "oͤ": "ö", # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
+ "uͤ": "ü", # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
+ "\uF50E": "q́", # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
}
- s = unicodedata.normalize('NFC', s)
+ s = unicodedata.normalize("NFC", s)
s = unjoin_ligatures(s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
@@ -115,13 +116,14 @@ class ExtractedText:
Objects of this class are guaranteed to be a. always in their normalization
and b. in NFC.
"""
+
segment_id = attr.ib(type=Optional[str])
@segment_id.validator
def check(self, _, value):
if value is None:
return
- if not re.match(r'[\w\d_-]+', value):
+ if not re.match(r"[\w\d_-]+", value):
raise ValueError('Malformed segment id "{}"'.format(value))
# An object contains either
@@ -141,7 +143,7 @@ class ExtractedText:
def check(self, _, value):
if value is not None and self.segments is not None:
raise ValueError("Can't have both segments and text")
- if value is not None and unicodedata.normalize('NFC', value) != value:
+ if value is not None and unicodedata.normalize("NFC", value) != value:
raise ValueError('String "{}" is not in NFC.'.format(value))
if value is not None and normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value))
@@ -169,31 +171,24 @@ class ExtractedText:
seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
segment_id_for_pos.extend(seg_ids)
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
- segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
+ segment_id_for_pos = segment_id_for_pos[: -len(self.joiner)]
# This is frozen, so we have to jump through the hoop:
- object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
+ object.__setattr__(self, "_segment_id_for_pos", segment_id_for_pos)
assert self._segment_id_for_pos
return self._segment_id_for_pos[pos]
@classmethod
- def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
+ def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"):
"""Build an ExtractedText from a PAGE content text element"""
- localname_for_textequiv_level = {
- 'region': 'TextRegion',
- 'line': 'TextLine'
- }
+ localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}
textequiv_level_for_localname = invert_dict(localname_for_textequiv_level)
- children_for_localname = {
- 'TextRegion': 'TextLine'
- }
- joiner_for_textequiv_level = {
- 'line': '\n'
- }
-
- segment_id = text_segment.attrib['id']
+ children_for_localname = {"TextRegion": "TextLine"}
+ joiner_for_textequiv_level = {"line": "\n"}
+
+ segment_id = text_segment.attrib["id"]
localname = ET.QName(text_segment).localname
if localname == localname_for_textequiv_level[textequiv_level]:
segment_text = None
@@ -201,19 +196,20 @@ class ExtractedText:
segment_text = get_textequiv_unicode(text_segment, nsmap)
# FIXME hardcoded SBB normalization
segment_text = normalize_sbb(segment_text)
- segment_text = segment_text or ''
+ segment_text = segment_text or ""
return cls(segment_id, None, None, segment_text)
else:
# Recurse
sub_localname = children_for_localname[localname]
sub_textequiv_level = textequiv_level_for_localname[sub_localname]
segments = []
- for sub_segment in text_segment.iterfind('./page:%s' % sub_localname,
- namespaces=nsmap):
+ for sub_segment in text_segment.iterfind(
+ "./page:%s" % sub_localname, namespaces=nsmap
+ ):
segments.append(
ExtractedText.from_text_segment(
- sub_segment, nsmap,
- textequiv_level=sub_textequiv_level)
+ sub_segment, nsmap, textequiv_level=sub_textequiv_level
+ )
)
joiner = joiner_for_textequiv_level[sub_textequiv_level]
return cls(segment_id, segments, joiner, None)
@@ -231,24 +227,24 @@ def invert_dict(d):
def get_textequiv_unicode(text_segment, nsmap) -> str:
"""Get the TextEquiv/Unicode text of the given PAGE text element."""
- segment_id = text_segment.attrib['id']
- textequivs = text_segment.findall('./page:TextEquiv', namespaces=nsmap)
+ segment_id = text_segment.attrib["id"]
+ textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)
if not textequivs:
- return ''
+ return ""
textequiv = get_first_textequiv(textequivs, segment_id)
- return textequiv.find('./page:Unicode', namespaces=nsmap).text or ''
+ return textequiv.find("./page:Unicode", namespaces=nsmap).text or ""
def get_first_textequiv(textequivs, segment_id):
"""Get the first TextEquiv based on index or conf order if index is not present."""
- log = getLogger('processor.OcrdDinglehopperEvaluate')
+ log = getLogger("processor.OcrdDinglehopperEvaluate")
if len(textequivs) == 1:
return textequivs[0]
# try ordering by index
- indices = np.array([get_attr(te, 'index') for te in textequivs], dtype=float)
+ indices = np.array([get_attr(te, "index") for te in textequivs], dtype=float)
nan_mask = np.isnan(indices)
if np.any(~nan_mask):
if np.any(nan_mask):
@@ -256,10 +252,12 @@ def get_first_textequiv(textequivs, segment_id):
index = np.nanargmin(indices)
else:
# try ordering by conf
- confidences = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float)
+ confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
if np.any(~np.isnan(confidences)):
- log.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
- segment_id)
+ log.info(
+ "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
+ segment_id,
+ )
index = np.nanargmax(confidences)
else:
# fallback to first entry in case of neither index or conf present
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 9cb2475..755061c 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -17,24 +17,27 @@ def alto_namespace(tree: ET.ElementTree) -> str:
check if the files uses any valid ALTO namespace.
"""
root_name = ET.QName(tree.getroot().tag)
- if root_name.localname == 'alto':
+ if root_name.localname == "alto":
return root_name.namespace
else:
- raise ValueError('Not an ALTO tree')
+ raise ValueError("Not an ALTO tree")
def alto_extract_lines(tree: ET.ElementTree) -> Generator[ExtractedText, None, None]:
- nsmap = {'alto': alto_namespace(tree)}
- for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap):
- line_id = line.attrib.get('ID')
- line_text = ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
+ nsmap = {"alto": alto_namespace(tree)}
+ for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
+ line_id = line.attrib.get("ID")
+ line_text = " ".join(
+ string.attrib.get("CONTENT")
+ for string in line.iterfind("alto:String", namespaces=nsmap)
+ )
yield ExtractedText(line_id, None, None, normalize_sbb(line_text))
# FIXME hardcoded SBB normalization
def alto_extract(tree: ET.ElementTree()) -> ExtractedText:
"""Extract text from the given ALTO ElementTree."""
- return ExtractedText(None, list(alto_extract_lines(tree)), '\n', None)
+ return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None)
def alto_text(tree):
@@ -48,56 +51,73 @@ def page_namespace(tree):
do not check if the files uses any valid PAGE namespace.
"""
root_name = ET.QName(tree.getroot().tag)
- if root_name.localname == 'PcGts':
+ if root_name.localname == "PcGts":
return root_name.namespace
else:
- raise ValueError('Not a PAGE tree')
+ raise ValueError("Not a PAGE tree")
-def page_extract(tree, *, textequiv_level='region'):
+def page_extract(tree, *, textequiv_level="region"):
"""Extract text from the given PAGE content ElementTree."""
# Internally, this is just parsing the Reading Order (if it exists) and
# and leaves reading the TextRegions to ExtractedText.from_text_segment().
- nsmap = {'page': page_namespace(tree)}
+ nsmap = {"page": page_namespace(tree)}
regions = []
- reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
+ reading_order = tree.find(".//page:ReadingOrder", namespaces=nsmap)
if reading_order is not None:
- for group in reading_order.iterfind('./*', namespaces=nsmap):
- if ET.QName(group.tag).localname == 'OrderedGroup':
- region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap)
- for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: int(r.attrib['index'])):
- region_id = region_ref_indexed.attrib['regionRef']
- region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
+ for group in reading_order.iterfind("./*", namespaces=nsmap):
+ if ET.QName(group.tag).localname == "OrderedGroup":
+ region_ref_indexeds = group.findall(
+ "./page:RegionRefIndexed", namespaces=nsmap
+ )
+ for region_ref_indexed in sorted(
+ region_ref_indexeds, key=lambda r: int(r.attrib["index"])
+ ):
+ region_id = region_ref_indexed.attrib["regionRef"]
+ region = tree.find(
+ './/page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap
+ )
if region is not None:
- regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
+ regions.append(
+ ExtractedText.from_text_segment(
+ region, nsmap, textequiv_level=textequiv_level
+ )
+ )
else:
pass # Not a TextRegion
else:
raise NotImplementedError
else:
- for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
- regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
+ for region in tree.iterfind(".//page:TextRegion", namespaces=nsmap):
+ regions.append(
+ ExtractedText.from_text_segment(
+ region, nsmap, textequiv_level=textequiv_level
+ )
+ )
# Filter empty region texts
- regions = [r for r in regions if r.text != '']
+ regions = [r for r in regions if r.text != ""]
- return ExtractedText(None, regions, '\n', None)
+ return ExtractedText(None, regions, "\n", None)
-def page_text(tree, *, textequiv_level='region'):
+def page_text(tree, *, textequiv_level="region"):
return page_extract(tree, textequiv_level=textequiv_level).text
def plain_extract(filename):
- with open(filename, 'r') as f:
+ with open(filename, "r") as f:
return ExtractedText(
- None,
- [ExtractedText('line %d' % no, None, None, line) for no, line in enumerate(f.readlines())],
- '\n',
- None
+ None,
+ [
+ ExtractedText("line %d" % no, None, None, line)
+ for no, line in enumerate(f.readlines())
+ ],
+ "\n",
+ None,
)
@@ -105,7 +125,7 @@ def plain_text(filename):
return plain_extract(filename).text
-def extract(filename, *, textequiv_level='region'):
+def extract(filename, *, textequiv_level="region"):
"""Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text.
@@ -124,5 +144,5 @@ def text(filename):
return extract(filename).text
-if __name__ == '__main__':
+if __name__ == "__main__":
print(text(sys.argv[1]))
diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py
index 1850eb1..008b70c 100644
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@@ -10,7 +10,7 @@ from pkg_resources import resource_string
from .cli import process as cli_process
from .edit_distance import levenshtein_matrix_cache_clear
-OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
+OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@click.command()
@@ -20,20 +20,19 @@ def ocrd_dinglehopper(*args, **kwargs):
class OcrdDinglehopperEvaluate(Processor):
-
def __init__(self, *args, **kwargs):
- kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper']
+ kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
def process(self):
- assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR')
+ assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
assert_file_grp_cardinality(self.output_file_grp, 1)
- log = getLogger('processor.OcrdDinglehopperEvaluate')
+ log = getLogger("processor.OcrdDinglehopperEvaluate")
- metrics = self.parameter['metrics']
- textequiv_level = self.parameter['textequiv_level']
- gt_grp, ocr_grp = self.input_file_grp.split(',')
+ metrics = self.parameter["metrics"]
+ textequiv_level = self.parameter["textequiv_level"]
+ gt_grp, ocr_grp = self.input_file_grp.split(",")
input_file_tuples = self._zip_input_files([gt_grp, ocr_grp])
for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
@@ -55,40 +54,47 @@ class OcrdDinglehopperEvaluate(Processor):
except FileExistsError:
pass
cli_process(
- gt_file.local_filename,
- ocr_file.local_filename,
- report_prefix,
- metrics=metrics,
- textequiv_level=textequiv_level
+ gt_file.local_filename,
+ ocr_file.local_filename,
+ report_prefix,
+ metrics=metrics,
+ textequiv_level=textequiv_level,
)
# Add reports to the workspace
- for report_suffix, mimetype in \
- [
- ['.html', 'text/html'],
- ['.json', 'application/json']
- ]:
+ for report_suffix, mimetype in [
+ [".html", "text/html"],
+ [".json", "application/json"],
+ ]:
self.workspace.add_file(
- ID=file_id + report_suffix,
- file_grp=self.output_file_grp,
- pageId=page_id,
- mimetype=mimetype,
- local_filename=report_prefix + report_suffix)
+ ID=file_id + report_suffix,
+ file_grp=self.output_file_grp,
+ pageId=page_id,
+ mimetype=mimetype,
+ local_filename=report_prefix + report_suffix,
+ )
# Clear cache between files
levenshtein_matrix_cache_clear()
def _zip_input_files(self, input_file_grps):
- log = getLogger('processor.OcrdDinglehopperEvaluate')
+ log = getLogger("processor.OcrdDinglehopperEvaluate")
input_file_tuples = list()
- for page_id in ([self.page_id] if self.page_id else
- self.workspace.mets.physical_pages):
+ for page_id in (
+ [self.page_id] if self.page_id else self.workspace.mets.physical_pages
+ ):
ifiles = list()
for input_file_grp in input_file_grps:
- log.debug("Adding input file group %s to page %s", input_file_grp, page_id)
- files = self.workspace.mets.find_all_files(pageId=page_id, fileGrp=input_file_grp)
+ log.debug(
+ "Adding input file group %s to page %s", input_file_grp, page_id
+ )
+ files = self.workspace.mets.find_all_files(
+ pageId=page_id, fileGrp=input_file_grp
+ )
if not files:
- log.error('Found no page "%s" in file group %s', page_id, input_file_grp)
+ log.error(
+ 'Found no page "%s" in file group %s', page_id, input_file_grp
+ )
ifiles.append(None)
else:
ifiles.append(files[0])
@@ -97,5 +103,5 @@ class OcrdDinglehopperEvaluate(Processor):
return input_file_tuples
-if __name__ == '__main__':
+if __name__ == "__main__":
ocrd_dinglehopper()
diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py
index 2ce81cd..8a81587 100644
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@@ -10,25 +10,30 @@ from .. import seq_align, ExtractedText
def test_text():
- test1 = ExtractedText(None, [
- ExtractedText('s0', None, None, 'foo'),
- ExtractedText('s1', None, None, 'bar'),
- ExtractedText('s2', None, None, 'bazinga')
- ], ' ', None)
-
- assert test1.text == 'foo bar bazinga'
- assert test1.segment_id_for_pos(0) == 's0'
+ test1 = ExtractedText(
+ None,
+ [
+ ExtractedText("s0", None, None, "foo"),
+ ExtractedText("s1", None, None, "bar"),
+ ExtractedText("s2", None, None, "bazinga"),
+ ],
+ " ",
+ None,
+ )
+
+ assert test1.text == "foo bar bazinga"
+ assert test1.segment_id_for_pos(0) == "s0"
assert test1.segment_id_for_pos(3) is None
- assert test1.segment_id_for_pos(10) == 's2'
+ assert test1.segment_id_for_pos(10) == "s2"
def test_normalization_check():
- with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
- ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
- assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))
+ with pytest.raises(ValueError, match=r".*is not in NFC.*"):
+ ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
+ assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))
-AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
+AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
def test_align():
@@ -39,25 +44,36 @@ def test_align():
not Python characters.
"""
- test1 = ExtractedText(None, [
- ExtractedText('s0', None, None, 'foo'),
- ExtractedText('s1', None, None, 'bar'),
- ExtractedText('s2', None, None, 'batzinga')
- ], ' ', None)
- test2 = ExtractedText(None, [
- ExtractedText('x0', None, None, 'foo'),
- ExtractedText('x1', None, None, 'bar'),
- # extra .
- ExtractedText('x2', None, None, '.'),
- # deletion + different grapheme cluster, m̃ also is two Python characters
- ExtractedText('x3', None, None, 'bazim̃ga'),
- ], ' ', None)
+ test1 = ExtractedText(
+ None,
+ [
+ ExtractedText("s0", None, None, "foo"),
+ ExtractedText("s1", None, None, "bar"),
+ ExtractedText("s2", None, None, "batzinga"),
+ ],
+ " ",
+ None,
+ )
+ test2 = ExtractedText(
+ None,
+ [
+ ExtractedText("x0", None, None, "foo"),
+ ExtractedText("x1", None, None, "bar"),
+ # extra .
+ ExtractedText("x2", None, None, "."),
+ # deletion + different grapheme cluster, m̃ also is two Python characters
+ ExtractedText("x3", None, None, "bazim̃ga"),
+ ],
+ " ",
+ None,
+ )
left_pos = 0
right_pos = 0
alignment = []
- for left, right in seq_align(grapheme_clusters(test1.text),
- grapheme_clusters(test2.text)):
+ for left, right in seq_align(
+ grapheme_clusters(test1.text), grapheme_clusters(test2.text)
+ ):
left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
el = AlignmentElement(left, right, left_id, right_id)
@@ -67,46 +83,57 @@ def test_align():
if right is not None:
right_pos += len(right)
- print('test1: {}'.format(test1.text))
- print('test2: {}'.format(test2.text))
-
- assert alignment[0] == ('f', 'f', 's0', 'x0')
- assert alignment[8] == (None, '.', None, 'x2')
- assert alignment[12] == ('t', None, 's2', None)
- assert alignment[15] == ('n', 'm̃', 's2', 'x3')
-
-
-@pytest.mark.parametrize("attributes,expected_index,expected_log", [
- ([], None, None),
- (['index="0"'], 0, None),
- ([''], 0, None),
- (['conf="0.5"'], 0, None),
- (['index="1"', 'index="0"'], 1, None),
- (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
- (['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
- "No index attributes, use 'conf' attribute to sort TextEquiv"),
- (['index="0"', ''], 0, "TextEquiv without index"),
- (['', 'conf="0.4"'], 1,
- "No index attributes, use 'conf' attribute to sort TextEquiv"),
- (['', ''], 0, "No index attributes, use first TextEquiv"),
-])
+ print("test1: {}".format(test1.text))
+ print("test2: {}".format(test2.text))
+
+ assert alignment[0] == ("f", "f", "s0", "x0")
+ assert alignment[8] == (None, ".", None, "x2")
+ assert alignment[12] == ("t", None, "s2", None)
+ assert alignment[15] == ("n", "m̃", "s2", "x3")
+
+
+@pytest.mark.parametrize(
+ "attributes,expected_index,expected_log",
+ [
+ ([], None, None),
+ (['index="0"'], 0, None),
+ ([""], 0, None),
+ (['conf="0.5"'], 0, None),
+ (['index="1"', 'index="0"'], 1, None),
+ (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
+ (
+ ['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
+ 2,
+ "No index attributes, use 'conf' attribute to sort TextEquiv",
+ ),
+ (['index="0"', ""], 0, "TextEquiv without index"),
+ (
+ ["", 'conf="0.4"'],
+ 1,
+ "No index attributes, use 'conf' attribute to sort TextEquiv",
+ ),
+ (["", ""], 0, "No index attributes, use first TextEquiv"),
+ ],
+)
def test_textequiv(attributes, expected_index, expected_log, caplog):
"""Test that extracting text from a PAGE TextEquiv is working without index attr."""
caplog.set_level(logging.INFO)
- xml = ""
+ xml = ''
ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]
- equiv = ["{1}".format(attr, text[i])
- for i, attr in enumerate(attributes)]
+ equiv = [
+ "{1}".format(attr, text[i])
+ for i, attr in enumerate(attributes)
+ ]
- textline = "{0}{2}"
- textline = textline.format(xml, ns, ''.join(equiv))
+ textline = '{0}{2}'
+ textline = textline.format(xml, ns, "".join(equiv))
root = ET.fromstring(textline)
- result = ExtractedText.from_text_segment(root,
- {'page': ns},
- textequiv_level='line').text
+ result = ExtractedText.from_text_segment(
+ root, {"page": ns}, textequiv_level="line"
+ ).text
if expected_index is None:
assert not result
else:
diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py
index 23483f8..9f9d926 100644
--- a/qurator/dinglehopper/tests/test_align.py
+++ b/qurator/dinglehopper/tests/test_align.py
@@ -3,64 +3,85 @@ from .. import align, seq_align, distance
def test_left_empty():
- result = list(align('', 'foo'))
- expected = [(None, 'f'), (None, 'o'), (None, 'o')]
+ result = list(align("", "foo"))
+ expected = [(None, "f"), (None, "o"), (None, "o")]
assert result == expected
def test_right_empty():
- result = list(align('foo', ''))
- expected = [('f', None), ('o', None), ('o', None)]
+ result = list(align("foo", ""))
+ expected = [("f", None), ("o", None), ("o", None)]
assert result == expected
def test_left_longer():
- result = list(align('food', 'foo'))
- expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)]
+ result = list(align("food", "foo"))
+ expected = [("f", "f"), ("o", "o"), ("o", "o"), ("d", None)]
assert result == expected
def test_right_longer():
- result = list(align('foo', 'food'))
- expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')]
+ result = list(align("foo", "food"))
+ expected = [("f", "f"), ("o", "o"), ("o", "o"), (None, "d")]
assert result == expected
def test_some_diff():
- result = list(align('abcde', 'aaadef'))
+ result = list(align("abcde", "aaadef"))
left, right = unzip(result)
- assert list(left) == ['a', 'b', 'c', 'd', 'e', None]
- assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f']
+ assert list(left) == ["a", "b", "c", "d", "e", None]
+ assert list(right) == ["a", "a", "a", "d", "e", "f"]
def test_longer():
- s1 = 'Dies ist eine Tst!'
- s2 = 'Dies ist ein Test.'
+ s1 = "Dies ist eine Tst!"
+ s2 = "Dies ist ein Test."
result = list(align(s1, s2)) # ; diffprint(*unzip(result))
- expected = [('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '),
- ('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '),
- ('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '),
- ('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.')]
+ expected = [
+ ("D", "D"),
+ ("i", "i"),
+ ("e", "e"),
+ ("s", "s"),
+ (" ", " "),
+ ("i", "i"),
+ ("s", "s"),
+ ("t", "t"),
+ (" ", " "),
+ ("e", "e"),
+ ("i", "i"),
+ ("n", "n"),
+ ("e", None),
+ (" ", " "),
+ ("T", "T"),
+ (None, "e"),
+ ("s", "s"),
+ ("t", "t"),
+ ("!", "."),
+ ]
assert result == expected
def test_completely_different():
- assert len(list(align('abcde', 'fghij'))) == 5
+ assert len(list(align("abcde", "fghij"))) == 5
def test_with_some_fake_ocr_errors():
- result = list(align('Über die vielen Sorgen wegen desselben vergaß',
- 'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab'))
+ result = list(
+ align(
+ "Über die vielen Sorgen wegen desselben vergaß",
+ "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
+ )
+ )
left, right = unzip(result)
# Beginning
- assert list(left[:18]) == [None]*18
- assert list(right[:18]) == list('SomeJunk MoreJunk ')
+ assert list(left[:18]) == [None] * 18
+ assert list(right[:18]) == list("SomeJunk MoreJunk ")
# End
- assert list(left[-1:]) == ['ß']
- assert list(right[-1:]) == ['b']
+ assert list(left[-1:]) == ["ß"]
+ assert list(right[-1:]) == ["b"]
def test_lines():
@@ -68,13 +89,30 @@ def test_lines():
This mainly serves as documentation for comparing lists of lines.
"""
- result = list(seq_align(
- ['This is a line.', 'This is another', 'And the last line'],
- ['This is a line.', 'This is another', 'J u n k', 'And the last line']
- ))
+ result = list(
+ seq_align(
+ ["This is a line.", "This is another", "And the last line"],
+ [
+ "This is a line.",
+ "This is another",
+ "J u n k",
+ "And the last line",
+ ],
+ )
+ )
left, right = unzip(result)
- assert list(left) == ['This is a line.', 'This is another', None, 'And the last line']
- assert list(right) == ['This is a line.', 'This is another', 'J u n k', 'And the last line']
+ assert list(left) == [
+ "This is a line.",
+ "This is another",
+ None,
+ "And the last line",
+ ]
+ assert list(right) == [
+ "This is a line.",
+ "This is another",
+ "J u n k",
+ "And the last line",
+ ]
def test_lines_similar():
@@ -92,7 +130,7 @@ def test_lines_similar():
# Just an example!
min_len = min(len(self._string), len(other._string))
if min_len > 0:
- normalized_distance = distance(self._string, other._string)/min_len
+ normalized_distance = distance(self._string, other._string) / min_len
similar = normalized_distance < 0.1
else:
similar = False
@@ -102,18 +140,39 @@ def test_lines_similar():
return not self.__eq__(other)
def __repr__(self):
- return 'SimilarString(\'%s\')' % self._string
+ return "SimilarString('%s')" % self._string
def __hash__(self):
return hash(self._string)
- result = list(seq_align(
- [SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')],
- [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
- ))
+ result = list(
+ seq_align(
+ [
+ SimilarString("This is a line."),
+ SimilarString("This is another"),
+ SimilarString("And the last line"),
+ ],
+ [
+ SimilarString("This is a ljne."),
+ SimilarString("This is another"),
+ SimilarString("J u n k"),
+ SimilarString("And the last line"),
+ ],
+ )
+ )
left, right = unzip(result)
- assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')]
- assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
+ assert list(left) == [
+ SimilarString("This is a line."),
+ SimilarString("This is another"),
+ None,
+ SimilarString("And the last line"),
+ ]
+ assert list(right) == [
+ SimilarString("This is a ljne."),
+ SimilarString("This is another"),
+ SimilarString("J u n k"),
+ SimilarString("And the last line"),
+ ]
# Test __eq__ (i.e. is it a substitution or a similar string?)
assert list(left)[0] == list(right)[0]
diff --git a/qurator/dinglehopper/tests/test_character_error_rate.py b/qurator/dinglehopper/tests/test_character_error_rate.py
index b16d37c..39301b4 100644
--- a/qurator/dinglehopper/tests/test_character_error_rate.py
+++ b/qurator/dinglehopper/tests/test_character_error_rate.py
@@ -7,31 +7,35 @@ from .. import character_error_rate
def test_character_error_rate():
- assert character_error_rate('a', 'a') == 0
- assert character_error_rate('a', 'b') == 1/1
- assert character_error_rate('Foo', 'Bar') == 3/3
+ assert character_error_rate("a", "a") == 0
+ assert character_error_rate("a", "b") == 1 / 1
+ assert character_error_rate("Foo", "Bar") == 3 / 3
- assert character_error_rate('Foo', '') == 3/3
+ assert character_error_rate("Foo", "") == 3 / 3
- assert character_error_rate('', '') == 0
- assert math.isinf(character_error_rate('', 'Foo'))
+ assert character_error_rate("", "") == 0
+ assert math.isinf(character_error_rate("", "Foo"))
- assert character_error_rate('Foo', 'Food') == 1/3
- assert character_error_rate('Fnord', 'Food') == 2/5
- assert character_error_rate('Müll', 'Mull') == 1/4
- assert character_error_rate('Abstand', 'Sand') == 4/7
+ assert character_error_rate("Foo", "Food") == 1 / 3
+ assert character_error_rate("Fnord", "Food") == 2 / 5
+ assert character_error_rate("Müll", "Mull") == 1 / 4
+ assert character_error_rate("Abstand", "Sand") == 4 / 7
def test_character_error_rate_hard():
- s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.')
- s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!') # Different, decomposed!
- assert character_error_rate(s1, s2) == 1/19
-
- s1 = 'Schlyñ'
- assert len(s1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
- s2 = 'Schlym̃'
- assert len(s2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+ s1 = unicodedata.normalize("NFC", "Schlyñ lorem ipsum.")
+ s2 = unicodedata.normalize("NFD", "Schlyñ lorem ipsum!") # Different, decomposed!
+ assert character_error_rate(s1, s2) == 1 / 19
+
+ s1 = "Schlyñ"
+ assert (
+ len(s1) == 6
+ ) # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+ s2 = "Schlym̃"
+ assert (
+ len(s2) == 7
+ ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
# Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
- assert character_error_rate(s2, s1) == 1/6
- assert character_error_rate(s1, s2) == 1/6
+ assert character_error_rate(s2, s1) == 1 / 6
+ assert character_error_rate(s1, s2) == 1 / 6
diff --git a/qurator/dinglehopper/tests/test_edit_distance.py b/qurator/dinglehopper/tests/test_edit_distance.py
index fa901a8..dc1f202 100644
--- a/qurator/dinglehopper/tests/test_edit_distance.py
+++ b/qurator/dinglehopper/tests/test_edit_distance.py
@@ -6,35 +6,39 @@ from .. import levenshtein, distance
def test_levenshtein():
- assert levenshtein('a', 'a') == 0
- assert levenshtein('a', 'b') == 1
- assert levenshtein('Foo', 'Bar') == 3
+ assert levenshtein("a", "a") == 0
+ assert levenshtein("a", "b") == 1
+ assert levenshtein("Foo", "Bar") == 3
- assert levenshtein('', '') == 0
- assert levenshtein('Foo', '') == 3
- assert levenshtein('', 'Foo') == 3
+ assert levenshtein("", "") == 0
+ assert levenshtein("Foo", "") == 3
+ assert levenshtein("", "Foo") == 3
- assert levenshtein('Foo', 'Food') == 1
- assert levenshtein('Fnord', 'Food') == 2
- assert levenshtein('Müll', 'Mull') == 1
- assert levenshtein('Abstand', 'Sand') == 4
+ assert levenshtein("Foo", "Food") == 1
+ assert levenshtein("Fnord", "Food") == 2
+ assert levenshtein("Müll", "Mull") == 1
+ assert levenshtein("Abstand", "Sand") == 4
def test_levenshtein_other_sequences():
- assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1
- assert levenshtein(['a', 'ab'], ['a', 'c']) == 1
+ assert levenshtein(["a", "ab"], ["a", "ab", "c"]) == 1
+ assert levenshtein(["a", "ab"], ["a", "c"]) == 1
def test_distance():
- assert distance('Fnord', 'Food') == 2
- assert distance('Müll', 'Mull') == 1
+ assert distance("Fnord", "Food") == 2
+ assert distance("Müll", "Mull") == 1
- word1 = unicodedata.normalize('NFC', 'Schlyñ')
- word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!
+ word1 = unicodedata.normalize("NFC", "Schlyñ")
+ word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
assert distance(word1, word2) == 0
- word1 = 'Schlyñ'
- assert len(word1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
- word2 = 'Schlym̃'
- assert len(word2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+ word1 = "Schlyñ"
+ assert (
+ len(word1) == 6
+ ) # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+ word2 = "Schlym̃"
+ assert (
+ len(word2) == 7
+ ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
assert distance(word1, word2) == 1
diff --git a/qurator/dinglehopper/tests/test_editops.py b/qurator/dinglehopper/tests/test_editops.py
index 8fafe5d..06afbfc 100644
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
@@ -4,45 +4,60 @@ from .. import seq_editops, editops
def test_trivial():
- assert seq_editops('abc', 'abc') == []
- assert seq_editops('', '') == []
+ assert seq_editops("abc", "abc") == []
+ assert seq_editops("", "") == []
def test_insert():
- assert seq_editops('bc', 'abc') == [('insert', 0, 0)]
- assert seq_editops('ac', 'abc') == [('insert', 1, 1)]
- assert seq_editops('ab', 'abc') == [('insert', 2, 2)]
- assert seq_editops('', 'a') == [('insert', 0, 0)]
+ assert seq_editops("bc", "abc") == [("insert", 0, 0)]
+ assert seq_editops("ac", "abc") == [("insert", 1, 1)]
+ assert seq_editops("ab", "abc") == [("insert", 2, 2)]
+ assert seq_editops("", "a") == [("insert", 0, 0)]
def test_multiple():
- assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)]
+ assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
def test_delete():
- assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)]
- assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)]
- assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)]
- assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)]
- assert seq_editops('Foo', '') == [('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0)]
- assert seq_editops('Foolish', 'Foo') == [('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3)]
+ assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
+ assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
+ assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
+ assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
+ assert seq_editops("Foo", "") == [
+ ("delete", 0, 0),
+ ("delete", 1, 0),
+ ("delete", 2, 0),
+ ]
+ assert seq_editops("Foolish", "Foo") == [
+ ("delete", 3, 3),
+ ("delete", 4, 3),
+ ("delete", 5, 3),
+ ("delete", 6, 3),
+ ]
def test_ambiguous():
- assert seq_editops('bcd', 'abcef') == [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)]
+ assert seq_editops("bcd", "abcef") == [
+ ("insert", 0, 0),
+ ("replace", 2, 3),
+ ("insert", 3, 4),
+ ]
def test_editops():
"""Test editops() in cases where dealing with grapheme clusters matters"""
# In these cases, one of the words has a composed form, the other one does not.
- assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
- assert editops('oͤde', 'öde') == [('replace', 0, 0)]
+ assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
+ assert editops("oͤde", "öde") == [("replace", 0, 0)]
def test_editops_canonically_equivalent():
- left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE')
- right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')
+ left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
+ "COMBINING TILDE"
+ )
+ right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
assert left != right
- assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right)
+ assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
assert editops(left, right) == []
diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py
index b35974b..74b8c7e 100644
--- a/qurator/dinglehopper/tests/test_integ_align.py
+++ b/qurator/dinglehopper/tests/test_integ_align.py
@@ -7,7 +7,7 @@ from lxml import etree as ET
from .. import align, page_text
-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
@@ -17,8 +17,8 @@ def test_align_page_files():
# (currently) not counted due to normalization.
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
- gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
- ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
+ gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
+ ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
result = list(align(gt, ocr))
for left, right in result:
diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
index 1c3bf52..e307a84 100644
--- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
@@ -8,26 +8,34 @@ from uniseg.graphemecluster import grapheme_clusters
from .. import character_error_rate, page_text, alto_text
-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
def test_character_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# The fi ligature does not count.
- gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
- ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
+ gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
+ ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
gt_len = len(list(grapheme_clusters(gt)))
- expected_cer = 2/gt_len
+ expected_cer = 2 / gt_len
assert character_error_rate(gt, ocr) == expected_cer
@pytest.mark.integration
def test_character_error_rate_between_page_alto():
- gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
- ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
+ gt = page_text(
+ ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
+ )
+ ocr = alto_text(
+ ET.parse(
+ os.path.join(
+ data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
+ )
+ )
+ )
assert gt == ocr
assert character_error_rate(gt, ocr) == 0
@@ -35,7 +43,17 @@ def test_character_error_rate_between_page_alto():
@pytest.mark.integration
def test_character_error_rate_between_page_alto_2():
- gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
- ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
-
- assert character_error_rate(gt, ocr) == 8/591 # Manually verified
+ gt = page_text(
+ ET.parse(
+ os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
+ )
+ )
+ ocr = alto_text(
+ ET.parse(
+ os.path.join(
+ data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
+ )
+ )
+ )
+
+ assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified
diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
index d71bc14..d251f9d 100644
--- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
@@ -10,31 +10,31 @@ def test_cli_json(tmp_path):
"""Test that the cli/process() yields a loadable JSON report"""
with working_directory(str(tmp_path)):
- with open('gt.txt', 'w') as gtf:
- gtf.write('AAAAA')
- with open('ocr.txt', 'w') as ocrf:
- ocrf.write('AAAAB')
+ with open("gt.txt", "w") as gtf:
+ gtf.write("AAAAA")
+ with open("ocr.txt", "w") as ocrf:
+ ocrf.write("AAAAB")
- with open('gt.txt', 'r') as gtf:
+ with open("gt.txt", "r") as gtf:
print(gtf.read())
- process('gt.txt', 'ocr.txt', 'report')
- with open('report.json', 'r') as jsonf:
+ process("gt.txt", "ocr.txt", "report")
+ with open("report.json", "r") as jsonf:
print(jsonf.read())
- with open('report.json', 'r') as jsonf:
+ with open("report.json", "r") as jsonf:
j = json.load(jsonf)
- assert j['cer'] == pytest.approx(0.2)
+ assert j["cer"] == pytest.approx(0.2)
def test_cli_json_cer_is_infinity(tmp_path):
"""Test that the cli/process() yields a loadable JSON report when CER == inf"""
with working_directory(str(tmp_path)):
- with open('gt.txt', 'w') as gtf:
- gtf.write('') # Empty to yield CER == inf
- with open('ocr.txt', 'w') as ocrf:
- ocrf.write('Not important')
+ with open("gt.txt", "w") as gtf:
+ gtf.write("") # Empty to yield CER == inf
+ with open("ocr.txt", "w") as ocrf:
+ ocrf.write("Not important")
- process('gt.txt', 'ocr.txt', 'report')
- with open('report.json', 'r') as jsonf:
+ process("gt.txt", "ocr.txt", "report")
+ with open("report.json", "r") as jsonf:
j = json.load(jsonf)
- assert j['cer'] == pytest.approx(float('inf'))
+ assert j["cer"] == pytest.approx(float("inf"))
diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
index cbe12f8..0e1e7da 100644
--- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
@@ -7,7 +7,7 @@ from lxml import etree as ET
from .. import distance, page_text, alto_text
-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
@@ -15,15 +15,23 @@ def test_distance_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# Due to normalization, we don't count the ligature.
# → 2 differences
- gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
- ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
+ gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
+ ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
assert distance(gt, ocr) == 2
@pytest.mark.integration
def test_distance_between_page_alto():
- gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
- ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
+ gt = page_text(
+ ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
+ )
+ ocr = alto_text(
+ ET.parse(
+ os.path.join(
+ data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
+ )
+ )
+ )
assert gt == ocr
assert distance(gt, ocr) == 0
@@ -31,7 +39,17 @@ def test_distance_between_page_alto():
@pytest.mark.integration
def test_distance_between_page_alto_2():
- gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
- ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
+ gt = page_text(
+ ET.parse(
+ os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
+ )
+ )
+ ocr = alto_text(
+ ET.parse(
+ os.path.join(
+ data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
+ )
+ )
+ )
assert distance(gt, ocr) == 8 # Manually verified
diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
index 5e535b5..5cf6a41 100644
--- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
+++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
@@ -10,27 +10,32 @@ from .util import working_directory
from ..ocrd_cli import ocrd_dinglehopper
-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def test_ocrd_cli(tmp_path):
"""Test OCR-D interface"""
# Copy test workspace
- test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162'
- test_workspace_dir = tmp_path / 'test_ocrd_cli'
+ test_workspace_dir_source = Path(data_dir) / "actevedef_718448162"
+ test_workspace_dir = tmp_path / "test_ocrd_cli"
shutil.copytree(str(test_workspace_dir_source), str(test_workspace_dir))
# Run through the OCR-D interface
with working_directory(str(test_workspace_dir)):
runner = CliRunner()
args = [
- '-m', 'mets.xml',
- '-I', 'OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI',
- '-O', 'OCR-D-OCR-CALAMARI-EVAL'
+ "-m",
+ "mets.xml",
+ "-I",
+ "OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI",
+ "-O",
+ "OCR-D-OCR-CALAMARI-EVAL",
]
- sys.argv[1:] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
+ sys.argv[
+ 1:
+ ] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
result = runner.invoke(ocrd_dinglehopper, args)
assert result.exit_code == 0
- result_json = list((test_workspace_dir / 'OCR-D-OCR-CALAMARI-EVAL').glob('*.json'))
- assert json.load(open(str(result_json[0])))['cer'] < 0.03
+ result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json"))
+ assert json.load(open(str(result_json[0])))["cer"] < 0.03
diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
index f5c922b..ba865b4 100644
--- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
@@ -7,26 +7,36 @@ from lxml import etree as ET
from .. import word_error_rate, words, page_text, alto_text
-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
def test_word_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
# the ligature does not count → 2 errors
- gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
+ gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
- gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line
+ gt_word_count = (
+ 7 + 6 + 5 + 8 + 7 + 6 + 7 + 8 + 6 + 7 + 7 + 5 + 6 + 8 + 8 + 7 + 7 + 6 + 5 + 4
+ ) # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count
- ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
- assert word_error_rate(gt, ocr) == 2/gt_word_count
+ ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
+ assert word_error_rate(gt, ocr) == 2 / gt_word_count
@pytest.mark.integration
def test_word_error_rate_between_page_alto():
- gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
- ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
+ gt = page_text(
+ ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
+ )
+ ocr = alto_text(
+ ET.parse(
+ os.path.join(
+ data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
+ )
+ )
+ )
assert gt == ocr
assert word_error_rate(gt, ocr) == 0
@@ -34,11 +44,25 @@ def test_word_error_rate_between_page_alto():
@pytest.mark.integration
def test_word_error_rate_between_page_alto_2():
- gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
-
- gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line
+ gt = page_text(
+ ET.parse(
+ os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
+ )
+ )
+
+ gt_word_count = (
+ 14 + 18 + 17 + 14 + 17 + 17 + 3
+ ) # Manually verified word count per line
assert len(list(words(gt))) == gt_word_count
- ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
-
- assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors)
+ ocr = alto_text(
+ ET.parse(
+ os.path.join(
+ data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
+ )
+ )
+ )
+
+ assert (
+ word_error_rate(gt, ocr) == 7 / gt_word_count
+ ) # Manually verified, 6 words are wrong, 1 got split (=2 errors)
diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py
index 6848fa1..fb38c4a 100644
--- a/qurator/dinglehopper/tests/test_ocr_files.py
+++ b/qurator/dinglehopper/tests/test_ocr_files.py
@@ -9,46 +9,54 @@ import pytest
from .util import working_directory
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def test_alto_namespace():
- tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
- assert alto_namespace(tree) == 'http://www.loc.gov/standards/alto/ns-v3#'
+ tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
+ assert alto_namespace(tree) == "http://www.loc.gov/standards/alto/ns-v3#"
def test_alto_text():
- tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
+ tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
result = alto_text(tree)
- expected = textwrap.dedent("""\
+ expected = textwrap.dedent(
+ """\
über die vielen Sorgen wegen deſſelben vergaß
Hartkopf, der Frau Amtmännin das ver-
- ſprochene zu überliefern.""")
+ ſprochene zu überliefern."""
+ )
assert result == expected
def test_alto_text_ALTO1():
- tree = ET.parse(os.path.join(data_dir, 'test.alto1.xml'))
+ tree = ET.parse(os.path.join(data_dir, "test.alto1.xml"))
assert "being erected at the Broadway stock" in alto_text(tree)
def test_alto_text_ALTO2():
- tree = ET.parse(os.path.join(data_dir, 'test.alto2.xml'))
- assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in alto_text(tree)
+ tree = ET.parse(os.path.join(data_dir, "test.alto2.xml"))
+ assert (
+ "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden"
+ in alto_text(tree)
+ )
def test_alto_text_ALTO3():
- tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
+ tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree)
def test_page_namespace():
- tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
- assert page_namespace(tree) == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15'
+ tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
+ assert (
+ page_namespace(tree)
+ == "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
+ )
def test_page_test():
- tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
+ tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
result = page_text(tree)
# We are currently normalizing on extraction, so the text is normalized.
@@ -74,7 +82,8 @@ def test_page_test():
# Jndeß mangelten do einige Generalia, die
# alſo wegfielen. — Hartkopf gieng ſelb
# mit und berbrate es. —""")
- expected = textwrap.dedent("""\
+ expected = textwrap.dedent(
+ """\
über die vielen Sorgen wegen deſſelben vergaß
Hartkopf, der Frau Amtmännin das ver-
ſprochene zu überliefern. – Ein Erpreſſer
@@ -94,7 +103,8 @@ def test_page_test():
ſie das, was da wäre, herbeyſchaffen möchte.
Jndeß mangelten doch einige Generalia, die
alſo wegfielen. – Hartkopf gieng ſelbſt
- mit und überbrachte es. –""")
+ mit und überbrachte es. –"""
+ )
assert result == expected
@@ -107,56 +117,69 @@ def test_page_with_empty_region():
#
#
#
- tree = ET.parse(os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml'))
+ tree = ET.parse(
+ os.path.join(data_dir, "brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml")
+ )
result = page_text(tree)
assert result
def test_page_order():
# This file contains TextRegions where file order is not the same as reading order.
- tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
+ tree = ET.parse(os.path.join(data_dir, "order.page.xml"))
result = page_text(tree)
print(result)
- assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)
+ assert re.search(
+ r"Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die",
+ result,
+ re.DOTALL,
+ )
def test_page_mixed_regions():
# This file contains ImageRegions and TextRegions in the ReadingOrder
- tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml'))
+ tree = ET.parse(os.path.join(data_dir, "mixed-regions.page.xml"))
result = page_text(tree)
- assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result
+ assert "non exaudiam uos. Chriſtiani uero quia orant iuxta" in result
def test_page_level():
# This file contains inconsistent TextRegion and TextLine texts
# TextRegion
- tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
+ tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
result = page_text(tree)
- assert result == 'Inconsistent dummy region text'
- tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
- result = page_text(tree, textequiv_level='region')
- assert result == 'Inconsistent dummy region text'
+ assert result == "Inconsistent dummy region text"
+ tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
+ result = page_text(tree, textequiv_level="region")
+ assert result == "Inconsistent dummy region text"
# TextLine
- tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
- result = page_text(tree, textequiv_level='line')
- assert result == 'Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-'
+ tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
+ result = page_text(tree, textequiv_level="line")
+ assert (
+ result
+ == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
+ )
def test_text():
- assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
- assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
- assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
+ assert "being erected at the Broadway stock" in text(
+ os.path.join(data_dir, "test.alto1.xml")
+ )
+ assert "wieder ein. – Er langte den Zettel aus dem" in text(
+ os.path.join(data_dir, "test.page2018.xml")
+ )
+ assert "Lorem ipsum" in text(os.path.join(data_dir, "test.txt"))
def test_plain(tmp_path):
with working_directory(str(tmp_path)):
- with open('ocr.txt', 'w') as ocrf:
- ocrf.write('AAAAB')
+ with open("ocr.txt", "w") as ocrf:
+ ocrf.write("AAAAB")
- result = plain_text('ocr.txt')
- expected = 'AAAAB'
+ result = plain_text("ocr.txt")
+ expected = "AAAAB"
assert result == expected
diff --git a/qurator/dinglehopper/tests/test_word_error_rate.py b/qurator/dinglehopper/tests/test_word_error_rate.py
index ad19172..bc7b91e 100644
--- a/qurator/dinglehopper/tests/test_word_error_rate.py
+++ b/qurator/dinglehopper/tests/test_word_error_rate.py
@@ -6,32 +6,81 @@ from .. import word_error_rate, words
def test_words():
- result = list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?'))
- expected = ['Der', 'schnelle', 'braune', 'Fuchs', 'kann', 'keine', '3,14', 'Meter', 'springen', 'oder']
+ result = list(
+ words("Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?")
+ )
+ expected = [
+ "Der",
+ "schnelle",
+ "braune",
+ "Fuchs",
+ "kann",
+ "keine",
+ "3,14",
+ "Meter",
+ "springen",
+ "oder",
+ ]
assert result == expected
def test_words_private_use_area():
- result = list(words(
- 'ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n'
- 'ſproene zu berliefern.'))
+ result = list(
+ words(
+ "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
+ "ſproene zu berliefern."
+ )
+ )
expected = [
- 'ber', 'die', 'vielen', 'Sorgen', 'wegen', 'deelben', 'vergaß', 'Hartkopf',
- 'der', 'Frau', 'Amtmnnin', 'das', 'ver',
- 'ſproene', 'zu', 'berliefern']
+ "ber",
+ "die",
+ "vielen",
+ "Sorgen",
+ "wegen",
+ "deelben",
+ "vergaß",
+ "Hartkopf",
+ "der",
+ "Frau",
+ "Amtmnnin",
+ "das",
+ "ver",
+ "ſproene",
+ "zu",
+ "berliefern",
+ ]
assert result == expected
def test_word_error_rate():
- assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0
- assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0
- assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz.') == 0
+ assert (
+ word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsatz!") == 0
+ )
+ assert (
+ word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz!")
+ == 0
+ )
+ assert (
+ word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz.")
+ == 0
+ )
- assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsarz:') == 1/4
- assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ein ist Beispielsatz!') == 2/4
+ assert (
+ word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsarz:")
+ == 1 / 4
+ )
+ assert (
+ word_error_rate("Dies ist ein Beispielsatz!", "Dies ein ist Beispielsatz!")
+ == 2 / 4
+ )
- assert word_error_rate('Dies ist ein Beispielsatz!', '') == 4/4
- assert math.isinf(word_error_rate('', 'Dies ist ein Beispielsatz!'))
- assert word_error_rate('', '') == 0
+ assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
+ assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
+ assert word_error_rate("", "") == 0
- assert word_error_rate('Schlyñ lorem ipsum dolor sit amet,', 'Schlym̃ lorem ipsum dolor sit amet.') == 1/6
+ assert (
+ word_error_rate(
+ "Schlyñ lorem ipsum dolor sit amet,", "Schlym̃ lorem ipsum dolor sit amet."
+ )
+ == 1 / 6
+ )
diff --git a/qurator/dinglehopper/tests/util.py b/qurator/dinglehopper/tests/util.py
index 1f224e5..8a735aa 100644
--- a/qurator/dinglehopper/tests/util.py
+++ b/qurator/dinglehopper/tests/util.py
@@ -27,6 +27,7 @@ def unzip(an_iterable_of_tuples):
class working_directory:
"""Context manager to temporarily change the working directory"""
+
def __init__(self, wd):
self.wd = wd
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index 2f5a1f6..dde57b9 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -20,9 +20,10 @@ def words(s: str):
def new_word_break(c, index=0):
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
- return 'ALetter'
+ return "ALetter"
else:
return old_word_break(c, index)
+
uniseg.wordbreak.word_break = new_word_break
# Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
@@ -30,8 +31,8 @@ def words(s: str):
# See https://www.fileformat.info/info/unicode/category/index.htm
# and https://unicodebook.readthedocs.io/unicode.html#categories
- unwanted_categories = 'O', 'M', 'P', 'Z', 'S'
- unwanted_subcategories = 'Cc', 'Cf'
+ unwanted_categories = "O", "M", "P", "Z", "S"
+ unwanted_subcategories = "Cc", "Cf"
subcat = unicodedata.category(c)
cat = subcat[0]
@@ -53,7 +54,7 @@ def words(s: ExtractedText):
@multimethod
def words_normalized(s: str):
- return words(unicodedata.normalize('NFC', s))
+ return words(unicodedata.normalize("NFC", s))
@multimethod
@@ -69,7 +70,9 @@ def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
@multimethod
-def word_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
+def word_error_rate_n(
+ reference: ExtractedText, compared: ExtractedText
+) -> Tuple[float, int]:
return word_error_rate_n(reference.text, compared.text)
@@ -84,7 +87,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, i
if d == 0:
return 0, n
if n == 0:
- return float('inf'), n
+ return float("inf"), n
return d / n, n
diff --git a/setup.py b/setup.py
index 7b8107a..56ae184 100644
--- a/setup.py
+++ b/setup.py
@@ -1,29 +1,29 @@
from io import open
from setuptools import find_packages, setup
-with open('requirements.txt') as fp:
+with open("requirements.txt") as fp:
install_requires = fp.read()
setup(
- name='dinglehopper',
- author='Mike Gerber, The QURATOR SPK Team',
- author_email='mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de',
- description='The OCR evaluation tool',
- long_description=open('README.md', 'r', encoding='utf-8').read(),
- long_description_content_type='text/markdown',
- keywords='qurator ocr',
- license='Apache',
- namespace_packages=['qurator'],
- packages=find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests']),
+ name="dinglehopper",
+ author="Mike Gerber, The QURATOR SPK Team",
+ author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de",
+ description="The OCR evaluation tool",
+ long_description=open("README.md", "r", encoding="utf-8").read(),
+ long_description_content_type="text/markdown",
+ keywords="qurator ocr",
+ license="Apache",
+ namespace_packages=["qurator"],
+ packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
install_requires=install_requires,
package_data={
- '': ['*.json', 'templates/*'],
+ "": ["*.json", "templates/*"],
},
entry_points={
- 'console_scripts': [
- 'dinglehopper=qurator.dinglehopper.cli:main',
- 'dinglehopper-extract=qurator.dinglehopper.cli_extract:main',
- 'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper',
- ]
- }
+ "console_scripts": [
+ "dinglehopper=qurator.dinglehopper.cli:main",
+ "dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
+ "ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
+ ]
+ },
)