Mirror of https://github.com/qurator-spk/dinglehopper.git, synced 2025-06-07 19:05:13 +02:00
Merge branch 'feat/display-segment-id'
This commit is contained in: commit f50591abac
23 changed files with 493 additions and 139 deletions

.idea/dinglehopper.iml (generated, 3 lines changed)

@@ -2,11 +2,10 @@
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="inheritedJdk" />
+    <orderEntry type="jdk" jdkName="Python 3.7 (dinglehopper-github)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="TestRunnerService">
-    <option name="projectConfiguration" value="pytest" />
     <option name="PROJECT_TEST_RUNNER" value="pytest" />
   </component>
 </module>

.idea/misc.xml (generated, 2 lines changed)

@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (dinglehopper)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (dinglehopper-github)" project-jdk-type="Python SDK" />
 </project>

@@ -28,16 +28,16 @@ def seq_align(s1, s2):
 
         if o:
             if o[0] == 'insert':
-                yield (None, s2[j])
+                yield None, s2[j]
                 j += 1
             elif o[0] == 'delete':
-                yield (s1[i], None)
+                yield s1[i], None
                 i += 1
             elif o[0] == 'replace':
-                yield (s1[i], s2[j])
+                yield s1[i], s2[j]
                 i += 1
                 j += 1
         else:
-            yield (s1[i], s2[j])
+            yield s1[i], s2[j]
             i += 1
             j += 1
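
The only change in seq_align is dropping the redundant parentheses around the yielded tuples; `yield a, b` produces exactly the same tuple as `yield (a, b)`, so callers are unaffected. A minimal illustration (not part of the repository):

def pairs_with_parens():
    yield (1, 'a')


def pairs_without_parens():
    yield 1, 'a'  # same tuple, only the redundant parentheses are gone


assert list(pairs_with_parens()) == list(pairs_without_parens()) == [(1, 'a')]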

@@ -3,17 +3,21 @@ from __future__ import division
 import unicodedata
 from typing import Tuple
 
+from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 
-from qurator.dinglehopper.edit_distance import distance
+from .edit_distance import distance
+from .extracted_text import ExtractedText
 
 
-def character_error_rate_n(reference, compared) -> Tuple[float, int]:
+@multimethod
+def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     """
     Compute character error rate.
 
     :return: character error rate and length of the reference
     """
 
     d = distance(reference, compared)
     n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
 

@@ -26,6 +30,11 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
     # XXX Should we really count newlines here?
 
 
+@multimethod
+def character_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
+    return character_error_rate_n(reference.text, compared.text)
+
+
 def character_error_rate(reference, compared) -> float:
     """
     Compute character error rate.
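
These hunks appear to belong to the character error rate module. The @multimethod decorator from the multimethod package registers several functions under one name and dispatches on the annotated argument types, which is how character_error_rate_n can now accept either plain strings or ExtractedText objects without isinstance checks. A standalone sketch of the dispatch mechanism (illustrative, not project code):

from multimethod import multimethod


@multimethod
def describe(x: int):
    return 'int: {}'.format(x)


@multimethod
def describe(x: str):
    return 'str: {}'.format(x)


# The call site stays the same; the argument type picks the implementation.
assert describe(42) == 'int: 42'
assert describe('42') == 'str: 42'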

@@ -3,16 +3,20 @@ import os
 import click
 from jinja2 import Environment, FileSystemLoader
 from markupsafe import escape
+from uniseg.graphemecluster import grapheme_clusters
 
-from qurator.dinglehopper import *
+from .character_error_rate import character_error_rate_n
+from .word_error_rate import word_error_rate_n, words_normalized
+from .align import seq_align
+from .extracted_text import ExtractedText
+from .ocr_files import extract
 
 
-def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
     gtx = ''
     ocrx = ''
 
-    def format_thing(t, css_classes=None):
+    def format_thing(t, css_classes=None, id_=None):
         if t is None:
             html_t = none
             css_classes += ' ellipsis'

@@ -21,19 +25,51 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
         else:
             html_t = escape(t)
 
+        html_custom_attrs = ""
+
+        # Set Bootstrap tooltip to the segment id
+        if id_:
+            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
+
         if css_classes:
-            return '<span class="{css_classes}">{html_t}</span>'.format(css_classes=css_classes, html_t=html_t)
+            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
         else:
             return '{html_t}'.format(html_t=html_t)
 
-    for k, (g, o) in enumerate(align(gt_things, ocr_things)):
-        if g == o:
-            css_classes = None
-        else:
-            css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+    if isinstance(gt_in, ExtractedText):
+        if not isinstance(ocr_in, ExtractedText):
+            raise TypeError()
+        # XXX splitting should be done in ExtractedText
+        gt_things = list(grapheme_clusters(gt_in.text))
+        ocr_things = list(grapheme_clusters(ocr_in.text))
+    else:
+        gt_things = gt_in
+        ocr_things = ocr_in
+
+
+    g_pos = 0
+    o_pos = 0
+    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
+        css_classes = None
+        gt_id = None
+        ocr_id = None
+        if g != o:
+            css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+            if isinstance(gt_in, ExtractedText):
+                gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
+                ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
+                # Deletions and inserts only produce one id + None, UI must
+                # support this, i.e. display for the one id produced
 
-        gtx += joiner + format_thing(g, css_classes)
-        ocrx += joiner + format_thing(o, css_classes)
+        gtx += joiner + format_thing(g, css_classes, gt_id)
+        ocrx += joiner + format_thing(o, css_classes, ocr_id)
+
+        if g is not None:
+            g_pos += len(g)
+        if o is not None:
+            o_pos += len(o)
 
     return \
         '''

@@ -51,20 +87,17 @@ def process(gt, ocr, report_prefix, *, metrics=True):
     Click on a wrapper.
     """
 
-    gt_text = text(gt)
-    ocr_text = text(ocr)
-
-    gt_text = substitute_equivalences(gt_text)
-    ocr_text = substitute_equivalences(ocr_text)
+    gt_text = extract(gt)
+    ocr_text = extract(ocr)
 
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
     wer, n_words = word_error_rate_n(gt_text, ocr_text)
 
-    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
+    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
 
     gt_words = words_normalized(gt_text)
     ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
+    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
 
 
 def json_float(value):
     """Convert a float value to an JSON float.
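
In the report generator, format_thing gained an id_ parameter: when a segment id is known for a differing grapheme or word, it is attached to the generated span as a Bootstrap tooltip. A simplified, self-contained restatement of that formatting logic and the markup it produces (assumptions: css class prefix 'c', a hypothetical segment id 'r0'):

from markupsafe import escape


def format_thing(t, css_classes=None, id_=None):
    # Condensed version of the helper shown in the hunk above.
    html_t = escape(t)
    html_custom_attrs = ''
    if id_:
        html_custom_attrs = 'data-toggle="tooltip" title="{}"'.format(id_)
    if css_classes:
        return '<span class="{}" {}>{}</span>'.format(css_classes, html_custom_attrs, html_t)
    return '{}'.format(html_t)


print(format_thing('x', css_classes='cdiff0 diff', id_='r0'))
# <span class="cdiff0 diff" data-toggle="tooltip" title="r0">x</span>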

@@ -5,8 +5,11 @@ from functools import partial, lru_cache
 from typing import Sequence, Tuple
 
 import numpy as np
+from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 
+from .extracted_text import ExtractedText
+
 
 def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
     """Compute the matrix commonly computed to produce the Levenshtein distance.

@@ -69,15 +72,21 @@ def levenshtein_matrix_cache_clear():
     _levenshtein_matrix.cache_clear()
 
 
-def distance(s1, s2):
+@multimethod
+def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
 
     Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
     clusters. This should be the correct way to compare two Unicode strings.
     """
-    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
-    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
-    return levenshtein(s1, s2)
+    seq1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
+    seq2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
+    return levenshtein(seq1, seq2)
+
+
+@multimethod
+def distance(s1: ExtractedText, s2: ExtractedText):
+    return distance(s1.text, s2.text)
 
 
 def seq_editops(seq1, seq2):

@@ -116,7 +125,11 @@ def seq_editops(seq1, seq2):
 
 
 def editops(word1, word2):
-    # XXX Note that this returns indices to the _grapheme clusters_, not characters!
+    """
+    Return sequence of edit operations transforming one string to another.
+
+    Note that this returns indices to the _grapheme clusters_, not characters!
+    """
     word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
     word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
     return seq_editops(word1, word2)
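
distance() compares NFC-normalized grapheme clusters rather than raw code points, so two visually identical strings in different Unicode normal forms have edit distance zero. A small usage sketch against the module shown above:

import unicodedata

from qurator.dinglehopper.edit_distance import distance

# 'Schlyñ' with a combining tilde (NFD) vs. the precomposed ñ (NFC)
nfd = unicodedata.normalize('NFD', 'Schlyñ')
nfc = unicodedata.normalize('NFC', 'Schlyñ')

assert nfd != nfc                 # different code point sequences
assert distance(nfd, nfc) == 0    # equal after normalization to grapheme clusters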

qurator/dinglehopper/extracted_text.py (new file, 118 lines)

@@ -0,0 +1,118 @@
import enum
import re
import unicodedata
from contextlib import suppress
from itertools import repeat
from typing import Optional

import attr

from .substitute_equivalences import substitute_equivalences


class Normalization(enum.Enum):
    NFC = 1
    NFC_MUFI = 2  # TODO
    NFC_SBB = 3


def normalize(text, normalization):
    if normalization == Normalization.NFC:
        return unicodedata.normalize('NFC', text)
    if normalization == Normalization.NFC_MUFI:
        raise NotImplementedError()
    if normalization == Normalization.NFC_SBB:
        return substitute_equivalences(text)
    else:
        raise ValueError()


# XXX hack
def normalize_sbb(t):
    return normalize(t, Normalization.NFC_SBB)


@attr.s(frozen=True)
class ExtractedText:
    """
    Extracted text

    Objects of this class are guaranteed to be a. always in their normalization and
    b. in NFC.
    """
    segment_id = attr.ib(type=Optional[str])

    @segment_id.validator
    def check(self, _, value):
        if value is None:
            return
        if not re.match(r'[\w\d_-]+', value):
            raise ValueError('Malformed segment id "{}"'.format(value))

    # An object contains either
    # a. _text itself
    # b. or segments (ExtractedText) and a joiner

    segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
    joiner = attr.ib(type=Optional[str])
    _text = attr.ib(type=Optional[str])

    @segments.validator
    def check(self, _, value):
        if value is not None and self._text is not None:
            raise ValueError("Can't have both segments and text")

    @_text.validator
    def check(self, _, value):
        if value is not None and self.segments is not None:
            raise ValueError("Can't have both segments and text")
        if value is not None and unicodedata.normalize('NFC', value) != value:
            raise ValueError('String "{}" is not in NFC.'.format(value))
        if value is not None and normalize(value, self.normalization) != value:
            raise ValueError('String "{}" is not normalized.'.format(value))

    normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)

    @property
    def text(self):
        if self._text is not None:
            if self._text == '':
                return None
            else:
                return self._text
        else:
            return self.joiner.join(s.text for s in self.segments)

    _segment_id_for_pos = None

    def segment_id_for_pos(self, pos):
        # Calculate segment ids once, on the first call
        if not self._segment_id_for_pos:
            segment_id_for_pos = []
            for s in self.segments:
                segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
                segment_id_for_pos.extend(repeat(None, len(self.joiner)))
            segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
            # This is frozen, so we have to jump through the hoop:
            object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
            assert self._segment_id_for_pos

        return self._segment_id_for_pos[pos]

    @classmethod
    def from_text_segment(cls, text_segment, nsmap):
        """Build an ExtractedText from a PAGE content text element"""

        segment_id = text_segment.attrib['id']
        segment_text = None
        with suppress(AttributeError):
            segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
            segment_text = segment_text or ''
            segment_text = normalize_sbb(segment_text)  # FIXME hardcoded SBB normalization
        segment_text = segment_text or ''
        return cls(segment_id, None, None, segment_text)

    @classmethod
    def from_str(cls, text, normalization=Normalization.NFC_SBB):
        normalized_text = normalize(text, normalization)
        return cls(None, None, None, normalized_text, normalization=normalization)
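
A quick orientation for the new class (the test file added further below exercises the same behaviour): a tree of segments joined by a fixed joiner yields a flat text, and segment_id_for_pos maps a character position of that flat text back to the segment it came from, with None for positions inside the joiner.

from qurator.dinglehopper.extracted_text import ExtractedText

text = ExtractedText(None, [
    ExtractedText('s0', None, None, 'foo'),
    ExtractedText('s1', None, None, 'bar'),
], ' ', None)

assert text.text == 'foo bar'
assert text.segment_id_for_pos(0) == 's0'   # inside the first segment
assert text.segment_id_for_pos(3) is None   # the joiner between segments
assert text.segment_id_for_pos(4) == 's1'   # inside the second segment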

@@ -1,14 +1,16 @@
 from __future__ import division, print_function
 
+from typing import Generator
 from warnings import warn
 
-from lxml import etree as ET
 import sys
+
+from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
 
+from .extracted_text import ExtractedText, normalize_sbb
 
-def alto_namespace(tree):
+
+def alto_namespace(tree: ET.ElementTree) -> str:
     """Return the ALTO namespace used in the given ElementTree.
 
     This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not

@@ -21,17 +23,22 @@ def alto_namespace(tree):
         raise ValueError('Not an ALTO tree')
 
 
-def alto_text(tree):
-    """Extract text from the given ALTO ElementTree."""
-
+def alto_extract_lines(tree: ET.ElementTree) -> Generator[ExtractedText, None, None]:
     nsmap = {'alto': alto_namespace(tree)}
+    for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap):
+        line_id = line.attrib.get('ID')
+        line_text = ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
+        yield ExtractedText(line_id, None, None, normalize_sbb(line_text))
+        # FIXME hardcoded SBB normalization
 
-    lines = (
-        ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
-        for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
-    text_ = '\n'.join(lines)
 
-    return text_
+def alto_extract(tree: ET.ElementTree()) -> ExtractedText:
+    """Extract text from the given ALTO ElementTree."""
+    return ExtractedText(None, list(alto_extract_lines(tree)), '\n', None)
+
+
+def alto_text(tree):
+    return alto_extract(tree).text
 
 
 def page_namespace(tree):

@@ -47,18 +54,12 @@ def page_namespace(tree):
         raise ValueError('Not a PAGE tree')
 
 
-def page_text(tree):
+def page_extract(tree):
     """Extract text from the given PAGE content ElementTree."""
 
     nsmap = {'page': page_namespace(tree)}
 
-    def region_text(region):
-        try:
-            return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
-        except AttributeError:
-            return None
-
-    region_texts = []
+    regions = []
     reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
     if reading_order is not None:
         for group in reading_order.iterfind('./*', namespaces=nsmap):

@@ -68,39 +69,56 @@ def page_text(tree):
                     region_id = region_ref_indexed.attrib['regionRef']
                     region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
                     if region is not None:
-                        region_texts.append(region_text(region))
+                        regions.append(ExtractedText.from_text_segment(region, nsmap))
                     else:
                         warn('Not a TextRegion: "%s"' % region_id)
             else:
                 raise NotImplementedError
     else:
         for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
-            region_texts.append(region_text(region))
+            regions.append(ExtractedText.from_text_segment(region, nsmap))
 
-    # XXX Does a file have to have regions etc.? region vs lines etc.
     # Filter empty region texts
-    region_texts = (t for t in region_texts if t)
+    regions = [r for r in regions if r.text is not None]
 
-    text_ = '\n'.join(region_texts)
+    return ExtractedText(None, regions, '\n', None)
 
-    return text_
 
+def page_text(tree):
+    return page_extract(tree).text
 
-def text(filename):
-    """Read the text from the given file.
+
+def plain_extract(filename):
+    with open(filename, 'r') as f:
+        return ExtractedText(
+            None,
+            [ExtractedText('line %d' % no, None, None, line) for no, line in enumerate(f.readlines())],
+            '\n',
+            None
+        )
+
+
+def plain_text(filename):
+    return plain_extract(filename).text
+
+
+def extract(filename):
+    """Extract the text from the given file.
 
     Supports PAGE, ALTO and falls back to plain text.
     """
 
     try:
         tree = ET.parse(filename)
     except XMLSyntaxError:
-        with open(filename, 'r') as f:
-            return f.read()
+        return plain_extract(filename)
     try:
-        return page_text(tree)
+        return page_extract(tree)
     except ValueError:
-        return alto_text(tree)
+        return alto_extract(tree)
+
+
+def text(filename):
+    return extract(filename).text
 
 
 if __name__ == '__main__':
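
The public entry point of this module is now extract(), which returns an ExtractedText (PAGE regions or ALTO lines as segments, '\n' as joiner) and falls back to a line-per-segment plain text reading; text() remains as a thin wrapper that returns only the joined string. A hedged usage sketch, assuming some PAGE or ALTO file at the hypothetical path 'ocr.xml':

from qurator.dinglehopper.ocr_files import extract

ocr = extract('ocr.xml')            # hypothetical input file
print(ocr.text)                     # joined, normalized text
print(ocr.segment_id_for_pos(0))    # id of the region/line containing the first character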

@@ -7,8 +7,8 @@ from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
 from pkg_resources import resource_string
 
-from qurator.dinglehopper.cli import process as cli_process
-from qurator.dinglehopper.edit_distance import levenshtein_matrix_cache_clear
+from .cli import process as cli_process
+from .edit_distance import levenshtein_matrix_cache_clear
 
 OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
 

@@ -1,21 +1,15 @@
 import unicodedata
 
 
-def substitute_equivalences(s):
-
-    # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
-    # It might make sense to use different rules for GT and for the different OCR
+def unjoin_ligatures(s):
+    """Unjoin ligatures, i.e. ﬀ becomes ff."""
     equivalences = {
-        '': 'ü',
         '': 'ſſ',
         "\ueba7": 'ſſi',  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
-        '': 'ä',
         '': 'ch',
-        '==': '–',  # → en-dash
-        '—': '–',  # em-dash → en-dash
         '': 'ck',
         '': 'll',
-        '': 'ö',
         '': 'ſi',
         '': 'ſt',
         'ﬁ': 'fi',

@@ -23,12 +17,7 @@ def substitute_equivalences(s):
         'ﬂ': 'fl',
         'ﬃ': 'ffi',
         '': 'ct',
-        '’': '\'',
-        '⸗': '-',
         '': 'tz',  # MUFI: LATIN SMALL LIGATURE TZ
-        'aͤ': 'ä',  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
-        'oͤ': 'ö',  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
-        'uͤ': 'ü',  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
         '\uf532': 'as',  # eMOP: Latin small ligature as
         '\uf533': 'is',  # eMOP: Latin small ligature is
         '\uf534': 'us',  # eMOP: Latin small ligature us

@@ -37,10 +26,32 @@
         '\uE8BF': 'q&',  # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
         '\uEBA5': 'ſp',  # MUFI: LATIN SMALL LIGATURE LONG S P
         'ﬆ': 'st',  # U+FB06 LATIN SMALL LIGATURE ST
-        '\uF50E': 'q́'  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
     }
 
     s = unicodedata.normalize('NFC', s)
     for fr, to in equivalences.items():
         s = s.replace(fr, to)
     return s
+
+
+def substitute_equivalences(s):
+    # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
+    # It might make sense to use different rules for GT and for the different OCR
+    equivalences = {
+        '': 'ü',
+        '': 'ä',
+        '==': '–',  # → en-dash
+        '—': '–',  # em-dash → en-dash
+        '': 'ö',
+        '’': '\'',
+        '⸗': '-',
+        'aͤ': 'ä',  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
+        'oͤ': 'ö',  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
+        'uͤ': 'ü',  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
+        '\uF50E': 'q́'  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
+    }
+
+    s = unicodedata.normalize('NFC', s)
+    s = unjoin_ligatures(s)
+    for fr, to in equivalences.items():
+        s = s.replace(fr, to)
+    return s
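
Both functions follow the same pattern: NFC-normalize, then apply plain string replacements from a table; the split merely separates ligature unjoining from the more opinionated OCR-D/Tesseract/Calamari equivalences. A minimal sketch of the pattern with a made-up table (not the project's exact mapping):

import unicodedata


def apply_equivalences(s, equivalences):
    # Normalize first, then straightforward string replacement, as above.
    s = unicodedata.normalize('NFC', s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s


assert apply_equivalences('ﬁnden', {'ﬁ': 'fi'}) == 'finden'
assert apply_equivalences('Buch⸗', {'⸗': '-'}) == 'Buch-'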

@@ -1,14 +1,15 @@
 function find_diff_class(classes) {
-    return classes.split(/\s+/).find(x => x.match(/.diff\d.*/));
+    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
 }
 
 $(document).ready(function() {
+    /* Enable Bootstrap tooltips */
+    $('[data-toggle="tooltip"]').tooltip();
+
     $('.diff').mouseover(function() {
-        let c = find_diff_class($(this).attr('class'))
-        $('.' + c).addClass('diff-highlight')
+        find_diff_class($(this).attr('class')).addClass('diff-highlight');
     });
     $('.diff').mouseout(function() {
-        let c = find_diff_class($(this).attr('class'))
-        $('.' + c).removeClass('diff-highlight')
+        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
     });
 });

qurator/dinglehopper/tests/extracted_text_test.py (new file, 68 lines)

@@ -0,0 +1,68 @@
import unicodedata
import pytest
from uniseg.graphemecluster import grapheme_clusters
from collections import namedtuple

from .. import seq_align, ExtractedText


def test_text():
    test1 = ExtractedText(None, [
        ExtractedText('s0', None, None, 'foo'),
        ExtractedText('s1', None, None, 'bar'),
        ExtractedText('s2', None, None, 'bazinga')
    ], ' ', None)

    assert test1.text == 'foo bar bazinga'
    assert test1.segment_id_for_pos(0) == 's0'
    assert test1.segment_id_for_pos(3) is None
    assert test1.segment_id_for_pos(10) == 's2'


def test_normalization_check():
    with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
        ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
    assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))


AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')


def test_align():
    """
    Test aligning by character while retaining segment id info

    The difficulty here is that aligning should work on grapheme clusters,
    not Python characters.
    """

    test1 = ExtractedText(None, [
        ExtractedText('s0', None, None, 'foo'),
        ExtractedText('s1', None, None, 'bar'),
        ExtractedText('s2', None, None, 'batzinga')
    ], ' ', None)
    test2 = ExtractedText(None, [
        ExtractedText('x0', None, None, 'foo'),
        ExtractedText('x1', None, None, 'bar'),
        ExtractedText('x2', None, None, '.'),  # extra .
        ExtractedText('x3', None, None, 'bazim̃ga'),  # deletion + different grapheme cluster, m̃ also is two Python characters
    ], ' ', None)

    left_pos = 0; right_pos = 0; alignment = []
    for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)):
        left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
        right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
        el = AlignmentElement(left, right, left_id, right_id)
        alignment.append(el)
        if left is not None:
            left_pos += len(left)
        if right is not None:
            right_pos += len(right)

    print('test1: {}'.format(test1.text))
    print('test2: {}'.format(test2.text))

    assert alignment[0] == ('f', 'f', 's0', 'x0')
    assert alignment[8] == (None, '.', None, 'x2')
    assert alignment[12] == ('t', None, 's2', None)
    assert alignment[15] == ('n', 'm̃', 's2', 'x3')

@@ -78,7 +78,8 @@ def test_lines():
 
 
 def test_lines_similar():
-    """Test comparing list of lines while using a "weaker equivalence".
+    """
+    Test comparing list of lines while using a "weaker equivalence".
 
     This mainly serves as documentation.
     """

@@ -88,7 +89,14 @@ def test_lines_similar():
             self._string = string
 
         def __eq__(self, other):
-            return distance(self._string, other._string) < 2  # XXX NOT the final version
+            # Just an example!
+            min_len = min(len(self._string), len(other._string))
+            if min_len > 0:
+                normalized_distance = distance(self._string, other._string)/min_len
+                similar = normalized_distance < 0.1
+            else:
+                similar = False
+            return similar
 
         def __ne__(self, other):
             return not self.__eq__(other)

@@ -106,3 +114,6 @@ def test_lines_similar():
     left, right = unzip(result)
     assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')]
     assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
+
+    # Test __eq__ (i.e. is it a substitution or a similar string?)
+    assert list(left)[0] == list(right)[0]
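
The reworked __eq__ in the SimilarString example declares two lines equal when their edit distance is small relative to the shorter line, instead of using a fixed absolute threshold. The criterion in isolation, with the 0.1 threshold used in the test:

from qurator.dinglehopper.edit_distance import distance


def similar(a, b, threshold=0.1):
    min_len = min(len(a), len(b))
    if min_len == 0:
        return False
    return distance(a, b) / min_len < threshold


# 'This is a line.' vs 'This is a ljne.': 1 edit over 15 characters ≈ 0.067 < 0.1
assert similar('This is a line.', 'This is a ljne.')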

@@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_align_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
-    # → 4 elements in the alignment should be different.
+    # → 2 elements in the alignment should be different, the ligature is
+    # (currently) not counted due to normalization.
     # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
 
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
 
     result = list(align(gt, ocr))
-    assert sum(left != right for left, right in result) == 4
+    for left, right in result:
+        if left != right:
+            print(left, right)
+    assert sum(left != right for left, right in result) == 2

@@ -4,6 +4,7 @@ import os
 
 import pytest
 from lxml import etree as ET
+from uniseg.graphemecluster import grapheme_clusters
 
 from .. import character_error_rate, page_text, alto_text
 

@@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_character_error_rate_between_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
+    # The fi ligature does not count.
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311)  # 2 TextRegions, 1 \n
+
+    gt_len = len(list(grapheme_clusters(gt)))
+    expected_cer = 2/gt_len
+
+    assert character_error_rate(gt, ocr) == expected_cer
 
 
 @pytest.mark.integration
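
The expected CER is now derived from the reference length in grapheme clusters instead of a hard-coded character count. Spelled out, the denominator is obtained the same way character_error_rate_n computes it:

import unicodedata

from uniseg.graphemecluster import grapheme_clusters


def expected_cer(gt_text, n_errors):
    # Reference length in grapheme clusters of the NFC-normalized text.
    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', gt_text))))
    return n_errors / n

# In the test above: two differing graphemes → expected_cer(gt, 2) == 2/gt_len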

@@ -1,4 +1,3 @@
-import os
 import json
 
 import pytest

@@ -10,14 +9,17 @@ from ..cli import process
 def test_cli_json(tmp_path):
     """Test that the cli/process() yields a loadable JSON report"""
 
-    # XXX Path.__str__() is necessary for Python 3.5
     with working_directory(str(tmp_path)):
         with open('gt.txt', 'w') as gtf:
             gtf.write('AAAAA')
         with open('ocr.txt', 'w') as ocrf:
             ocrf.write('AAAAB')
 
+        with open('gt.txt', 'r') as gtf:
+            print(gtf.read())
         process('gt.txt', 'ocr.txt', 'report')
+        with open('report.json', 'r') as jsonf:
+            print(jsonf.read())
         with open('report.json', 'r') as jsonf:
             j = json.load(jsonf)
             assert j['cer'] == pytest.approx(0.2)

@@ -26,7 +28,6 @@ def test_cli_json(tmp_path):
 def test_cli_json_cer_is_infinity(tmp_path):
     """Test that the cli/process() yields a loadable JSON report when CER == inf"""
 
-    # XXX Path.__str__() is necessary for Python 3.5
     with working_directory(str(tmp_path)):
         with open('gt.txt', 'w') as gtf:
             gtf.write('')  # Empty to yield CER == inf

@@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_distance_between_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
+    # Due to normalization, we don't count the ligature.
+    # → 2 differences
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert distance(gt, ocr) == 4
+    assert distance(gt, ocr) == 2
 
 
 @pytest.mark.integration

@@ -1,12 +1,10 @@
 import os
-import re
 import shutil
 import json
 import sys
 from pathlib import Path
 
 from click.testing import CliRunner
-import pytest
 from .util import working_directory
 
 

@@ -18,8 +16,6 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 def test_ocrd_cli(tmp_path):
     """Test OCR-D interface"""
 
-    # XXX Path.str() is necessary for Python 3.5
-
     # Copy test workspace
     test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162'
     test_workspace_dir = tmp_path / 'test_ocrd_cli'

@@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 
 @pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. → 3 changed words
+    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. So we have 3 changed words,
+    # the ligature does not count → 2 errors
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
 
     gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4  # Manually verified word count per line
     assert len(list(words(gt))) == gt_word_count
 
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert word_error_rate(gt, ocr) == 3/gt_word_count
+    assert word_error_rate(gt, ocr) == 2/gt_word_count
 
 
 @pytest.mark.integration

@@ -6,7 +6,8 @@ import textwrap
 
 import pytest
 
-from .. import alto_namespace, alto_text, page_namespace, page_text, text
+from .util import working_directory
+from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
 
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 

@@ -49,27 +50,51 @@ def test_page_namespace():
 def test_page_test():
     tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
     result = page_text(tree)
+
+    # We are currently normalizing on extraction, so the text is normalized.
+    #
+    # expected = textwrap.dedent("""\
+    # ber die vielen Sorgen wegen deelben vergaß
+    # Hartkopf, der Frau Amtmnnin das ver⸗
+    # ſproene zu berliefern. — Ein Erpreer
+    # wurde an ihn abgeſit, um ihn ums Him⸗
+    # melswien zu ſagen, daß er das Verſproene
+    # glei den Augenbli berbringen mte, die
+    # Frau Amtmnnin htte auf ihn verlaen,
+    # und nun wßte e nit, was e anfangen
+    # ſote. Den Augenbli ſote er kommen,
+    # ſon vergieng e in ihrer Ang. — Die
+    # Ge wren ſon angekommen, und es fehlte
+    # ihr do no an aem. —
+    # Hartkopf mußte er bennen, und
+    # endli na langem Nadenken fiel es ihm er
+    # wieder ein. — Er langte den Zettel aus dem
+    # Accisbue heraus, und ſagte ſeiner Frau, daß
+    # e das, was da wre, herbeyſaffen mte.
+    # Jndeß mangelten do einige Generalia, die
+    # alſo wegfielen. — Hartkopf gieng ſelb
+    # mit und berbrate es. —""")
     expected = textwrap.dedent("""\
-        ber die vielen Sorgen wegen deelben vergaß
-        Hartkopf, der Frau Amtmnnin das ver⸗
-        ſproene zu berliefern. — Ein Erpreer
-        wurde an ihn abgeſit, um ihn ums Him⸗
-        melswien zu ſagen, daß er das Verſproene
-        glei den Augenbli berbringen mte, die
-        Frau Amtmnnin htte auf ihn verlaen,
-        und nun wßte e nit, was e anfangen
-        ſote. Den Augenbli ſote er kommen,
-        ſon vergieng e in ihrer Ang. — Die
-        Ge wren ſon angekommen, und es fehlte
-        ihr do no an aem. —
-        Hartkopf mußte er bennen, und
-        endli na langem Nadenken fiel es ihm er
-        wieder ein. — Er langte den Zettel aus dem
-        Accisbue heraus, und ſagte ſeiner Frau, daß
-        e das, was da wre, herbeyſaffen mte.
-        Jndeß mangelten do einige Generalia, die
-        alſo wegfielen. — Hartkopf gieng ſelb
-        mit und berbrate es. —""")
+        über die vielen Sorgen wegen deſſelben vergaß
+        Hartkopf, der Frau Amtmännin das ver-
+        ſprochene zu überliefern. – Ein Erpreſſer
+        wurde an ihn abgeſchickt, um ihn ums Him-
+        melswillen zu ſagen, daß er das Verſprochene
+        gleich den Augenblick überbringen möchte, die
+        Frau Amtmännin hätte ſich auf ihn verlaſſen,
+        und nun wüßte ſie nicht, was ſie anfangen
+        ſollte. Den Augenblick ſollte er kommen,
+        ſonſt vergieng ſie in ihrer Angſt. – Die
+        Gäſte wären ſchon angekommen, und es fehlte
+        ihr doch noch an allem. –
+        Hartkopf mußte ſich erſt beſinnen, und
+        endlich nach langem Nachdenken fiel es ihm erſt
+        wieder ein. – Er langte den Zettel aus dem
+        Accisbuche heraus, und ſagte ſeiner Frau, daß
+        ſie das, was da wäre, herbeyſchaffen möchte.
+        Jndeß mangelten doch einige Generalia, die
+        alſo wegfielen. – Hartkopf gieng ſelbſt
+        mit und überbrachte es. –""")
     assert result == expected
 
 

@@ -92,7 +117,8 @@ def test_page_order():
     tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
     result = page_text(tree)
 
-    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
+    print(result)
+    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)
 
 
 def test_page_mixed_regions():

@@ -106,5 +132,15 @@ def test_page_mixed_regions():
 
 def test_text():
     assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
-    assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
+    assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
     assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
+
+
+def test_plain(tmp_path):
+    with working_directory(str(tmp_path)):
+        with open('ocr.txt', 'w') as ocrf:
+            ocrf.write('AAAAB')
+
+        result = plain_text('ocr.txt')
+        expected = 'AAAAB'
+        assert result == expected

@@ -21,8 +21,8 @@ def diffprint(x, y):
         _diffprint(x, y)
 
 
-def unzip(l):
-    return zip(*l)
+def unzip(an_iterable_of_tuples):
+    return zip(*an_iterable_of_tuples)
 
 
 class working_directory:

@@ -1,14 +1,19 @@
 from __future__ import division
 
 import unicodedata
-from typing import Tuple
+from typing import Tuple, Iterable
+from multimethod import multimethod
 
 import uniseg.wordbreak
 
 from .edit_distance import levenshtein
+from . import ExtractedText
 
 
-def words(s):
+@multimethod
+def words(s: str):
+    """Extract words from a string"""
+
     # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
     # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
     old_word_break = uniseg.wordbreak.word_break

@@ -41,17 +46,37 @@ def words(s):
         yield word
 
 
-def words_normalized(s):
+@multimethod
+def words(s: ExtractedText):
+    return words(s.text)
+
+
+@multimethod
+def words_normalized(s: str):
     return words(unicodedata.normalize('NFC', s))
 
 
-def word_error_rate_n(reference, compared) -> Tuple[float, int]:
-    if isinstance(reference, str):
-        reference_seq = list(words_normalized(reference))
-        compared_seq = list(words_normalized(compared))
-    else:
-        reference_seq = list(reference)
-        compared_seq = list(compared)
+@multimethod
+def words_normalized(s: ExtractedText):
+    return words_normalized(s.text)
+
+
+@multimethod
+def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
+    reference_seq = list(words_normalized(reference))
+    compared_seq = list(words_normalized(compared))
+    return word_error_rate_n(reference_seq, compared_seq)
+
+
+@multimethod
+def word_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
+    return word_error_rate_n(reference.text, compared.text)
+
+
+@multimethod
+def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
+    reference_seq = list(reference)
+    compared_seq = list(compared)
 
     d = levenshtein(reference_seq, compared_seq)
     n = len(reference_seq)
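
As with the character error rate, word_error_rate_n is now resolved by argument type: strings are tokenized via words_normalized, ExtractedText is unwrapped to its .text, and any other iterable is treated as an already-tokenized sequence. A short usage sketch with made-up sample strings:

from qurator.dinglehopper.word_error_rate import word_error_rate_n

# Strings are tokenized and normalized internally.
wer, n_words = word_error_rate_n('ein kleiner Test', 'ein kleiner Tost')
print(wer, n_words)   # one of three reference words differs → ~0.33, 3

# Pre-tokenized sequences hit the Iterable overload and are used as-is.
wer, n_words = word_error_rate_n(['ein', 'kleiner', 'Test'],
                                 ['ein', 'kleiner', 'Tost'])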

@@ -6,3 +6,5 @@ numpy
 colorama
 MarkupSafe
 ocrd >= 2.13.1
+attrs
+multimethod == 1.3  # latest version to officially support Python 3.5