Small corrections

pull/47/head
Benjamin Rosemann 4 years ago
parent b24d8d5664
commit 0dd5fc0ee5

@ -46,8 +46,8 @@ def flexible_character_accuracy(
Reference: contains steps 1-7 of the flexible character accuracy algorithm. Reference: contains steps 1-7 of the flexible character accuracy algorithm.
:param gt: The ground truth text. :param gt: The ground truth ExtractedText object.
:param ocr: The text to compare the ground truth with. :param ocr: The ExtractedText object to compare the ground truth with.
:return: Score between 0 and 1 and match objects. :return: Score between 0 and 1 and match objects.
""" """
return flexible_character_accuracy(gt.text, ocr.text) return flexible_character_accuracy(gt.text, ocr.text)
@ -66,11 +66,11 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]:
best_score = -float("inf") best_score = -float("inf")
best_matches = [] best_matches = []
# TODO: this should be configurable # TODO: should this be configurable?
combinations = product( combinations = product(
range(15, 31, 5), range(0, 24, 3), range(0, 4, 1), range(0, 6, 1) range(15, 31, 5), range(0, 24, 3), range(0, 4, 1), range(0, 6, 1)
) )
# TODO: place to parallelize the algorithm # TODO: place to parallelize the algorithm?
for (edit_dist, length_diff, offset, length) in combinations: for (edit_dist, length_diff, offset, length) in combinations:
coef = Coefficients( coef = Coefficients(
edit_dist=edit_dist, length_diff=length_diff, offset=offset, length=length edit_dist=edit_dist, length_diff=length_diff, offset=offset, length=length
@ -89,7 +89,7 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]:
def match_with_coefficients(gt: str, ocr: str, coef: Coefficients) -> List[Match]: def match_with_coefficients(gt: str, ocr: str, coef: Coefficients) -> List[Match]:
"""Match ground truth with ocr and considers a given set of coefficients. """Match ground truth with ocr and consider a given set of coefficients.
Reference: contains steps 1 - 6 of the flexible character accuracy algorithm. Reference: contains steps 1 - 6 of the flexible character accuracy algorithm.
@ -128,7 +128,8 @@ def match_longest_gt_lines(
"""Find the best match for the longest line(s) in ground truth. """Find the best match for the longest line(s) in ground truth.
The longest lines in ground truth are matched against lines in ocr to find the The longest lines in ground truth are matched against lines in ocr to find the
best matching pair. This pair is then either considered a match on full line best matching pair. This pair is then either considered a match on a full line
or the line(s) are split and the non-matching parts are added back to the list.
Reference: contains steps 3 and 4 of the flexible character accuracy algorithm. Reference: contains steps 3 and 4 of the flexible character accuracy algorithm.
@ -139,11 +140,12 @@ def match_longest_gt_lines(
return best_match return best_match
# Step 3 of the flexible character accuracy algorithm (variation). # Step 3 of the flexible character accuracy algorithm (variation).
# Instead of the longest line we take all longest lines with equal length. # We do not only take the longest line from ground truth but decide on a length
length = min(gt_lines[0].length, ocr_lines[0].length) # threshold and take all lines from ground truth bigger than the threshold.
for gt_line in takewhile(lambda line: line.length >= length, gt_lines): length_threshold = min(gt_lines[0].length, ocr_lines[0].length) - 1
for gt_line in takewhile(lambda line: line.length > length_threshold, gt_lines):
match, ocr_line = match_gt_line(gt_line, ocr_lines, coef) match, ocr_line = match_gt_line(gt_line, ocr_lines, coef)
score = 0 if not match else character_accuracy(match.dist) score = -float("inf") if not match else character_accuracy(match.dist)
if score > best_score: if score > best_score:
best_score, best_match, best_gt, best_ocr = score, match, gt_line, ocr_line best_score, best_match, best_gt, best_ocr = score, match, gt_line, ocr_line
# early breaking: we only need one perfect fit # early breaking: we only need one perfect fit
@ -191,34 +193,17 @@ def match_gt_line(
return best_match, best_ocr return best_match, best_ocr
def remove_or_split(original: "Part", match: "Part", lines: List["Part"]) -> bool: @lru_cache(maxsize=10000)
"""Removes the matched line or splits it into parts.
Reference: contains step 4 of the flexible character accuracy algorithm.
:return: True if the line was split.
"""
splitted = False
del lines[lines.index(original)]
if match.length < original.length:
lines.extend(original.split(match))
# sorting for ocr is not mentioned in the paper, but is used as tie breaking =)
lines.sort(key=lambda x: x.length, reverse=True)
splitted = True
return splitted
@lru_cache(maxsize=1000000)
def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]:
"""Matches two lines searching for a local alignment. """Matches two lines searching for a naive local alignment.
The shorter line is moved along the longer line The shorter line is moved along the longer line
until the editing distance is minimized. until the editing distance is minimized.
Reference: see figure 2 in the paper. Reference: see figure 2 in doi:10.1016/j.patrec.2020.02.003.
TODO: make distance function configurable? TODO: make distance function configurable?
TODO: rethink @lru_cache TODO: use @cache annotation in Python 3.9?
:return: Match object if one is found. :return: Match object if one is found.
""" """
@ -273,14 +258,14 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]:
return best_match return best_match
@lru_cache(maxsize=1000000) @lru_cache(maxsize=10000)
def distance(gt: "Part", ocr: "Part") -> Match: def distance(gt: "Part", ocr: "Part") -> Match:
"""Calculate the editing distance between the two lines. """Calculate the editing distance between the two lines.
Using the already available `editops()` function with the Levenshtein distance. Using the already available `editops()` function with the Levenshtein distance.
TODO: replace with @cache annotation in Python 3.9 TODO: use @cache annotation in Python 3.9?
TODO: rethink @lru_cache TODO: wait for qurator-spk/dinglehopper#48 for efficient editops.
:return: Match object containing the lines and the editing operations. :return: Match object containing the lines and the editing operations.
""" """
@ -300,7 +285,7 @@ def score_edit_distance(dist: Distance) -> int:
return dist.delete + dist.insert + 2 * dist.replace return dist.delete + dist.insert + 2 * dist.replace
@lru_cache(1000000) @lru_cache(10000)
def calculate_penalty( def calculate_penalty(
gt_length: int, gt_length: int,
ocr_length: int, ocr_length: int,
@ -336,7 +321,6 @@ def character_accuracy_for_matches(matches: List[Match]) -> float:
"""Character accuracy of a full text represented by a list of matches. """Character accuracy of a full text represented by a list of matches.
See other `character_accuracy` for details. See other `character_accuracy` for details.
""" """
agg = reduce( agg = reduce(
lambda acc, match: acc + Counter(match.dist._asdict()), matches, Counter() lambda acc, match: acc + Counter(match.dist._asdict()), matches, Counter()
@ -355,7 +339,7 @@ def character_accuracy(edits: Distance) -> float:
Errors are replacements, deletes and inserts. Errors are replacements, deletes and inserts.
Note that is is possible to have more errors than characters in which case the Note that it is possible to have more errors than characters in which case the
character accuracy turns negative. character accuracy turns negative.
Comparing two empty strings (having no edits) results in a character accuracy of 1. Comparing two empty strings (having no edits) results in a character accuracy of 1.
@ -391,10 +375,30 @@ def initialize_lines(text: str) -> List["Part"]:
return lines return lines
def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List]]: def remove_or_split(original: "Part", match: "Part", lines: List["Part"]) -> bool:
"""Removes the matched line or splits it into parts.
Reference: contains step 4 of the flexible character accuracy algorithm.
:return: True if the line was split.
"""
splitted = False
del lines[lines.index(original)]
if match.length < original.length:
lines.extend(original.split(match))
# sorting for ocr is not mentioned in the paper, but is used as tie breaking =)
lines.sort(key=lambda x: x.length, reverse=True)
splitted = True
return splitted
def split_matches(
matches: List[Match], linesep="\n"
) -> Tuple[List[str], List[str], List[List]]:
"""Extracts text segments and editing operations in separate lists. """Extracts text segments and editing operations in separate lists.
:param matches: List of match objects. :param matches: List of match objects.
:param linesep: Character(s) used for line separation.
:return: List of ground truth segments, ocr segments and editing operations. :return: List of ground truth segments, ocr segments and editing operations.
""" """
matches = sorted(matches, key=lambda m: m.gt.line + m.gt.start / 10000) matches = sorted(matches, key=lambda m: m.gt.line + m.gt.start / 10000)
@ -402,9 +406,9 @@ def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List
gt, ocr, ops = [], [], [] gt, ocr, ops = [], [], []
for match in matches: for match in matches:
if match.gt.line > line: if match.gt.line > line:
gt.append("\n") gt.append(linesep)
ocr.append("\n") ocr.append(linesep)
ops.append([]) ops.extend([[]] * len(linesep))
line = match.gt.line line = match.gt.line
gt.append(match.gt.text) gt.append(match.gt.text)
ocr.append(match.ocr.text) ocr.append(match.ocr.text)

@ -80,7 +80,7 @@ SIMPLE_EDITS = [
def extended_case_to_text(gt, ocr): def extended_case_to_text(gt, ocr):
"""Generate sentence from reading order encoding. """Generate sentence from reading order encoding.
See figure 4 in 10.1016/j.patrec.2020.02.003 See figure 4 in 10.1016/j.patrec.2020.02.003.
""" """
sentence = ( sentence = (
"Eight", "Eight",
@ -159,7 +159,7 @@ def test_flexible_character_accuracy_xml(gt, ocr, first_line_score, all_line_sco
], ],
) )
def test_flexible_character_accuracy(config, ocr): def test_flexible_character_accuracy(config, ocr):
"""Tests from figure 3 in the paper.""" """Tests from figure 3 in doi:10.1016/j.patrec.2020.02.003."""
gt = ( gt = (
'"I have\nno special\ntalent.\n' '"I have\nno special\ntalent.\n'
'I am only\npassionately\ncurious."\n' 'I am only\npassionately\ncurious."\n'
@ -199,7 +199,7 @@ def test_flexible_character_accuracy(config, ocr):
def test_flexible_character_accuracy_extended( def test_flexible_character_accuracy_extended(
gt, ocr, first_line_score, all_line_score gt, ocr, first_line_score, all_line_score
): ):
"""Tests from figure 4 in the paper.""" """Tests from figure 4 in doi:10.1016/j.patrec.2020.02.003."""
gt_sentence, ocr_sentence = extended_case_to_text(gt, ocr) gt_sentence, ocr_sentence = extended_case_to_text(gt, ocr)
result, _ = flexible_character_accuracy(gt_sentence, ocr_sentence) result, _ = flexible_character_accuracy(gt_sentence, ocr_sentence)
assert result == pytest.approx(all_line_score, abs=0.001) assert result == pytest.approx(all_line_score, abs=0.001)

@ -6,14 +6,13 @@ from lxml import etree as ET
from .. import distance, page_text, extract from .. import distance, page_text, extract
from .. import flexible_character_accuracy, split_matches from .. import flexible_character_accuracy, split_matches
data_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "data", "table-order"
)
@pytest.mark.parametrize("file", ["table-order-0002.xml", "table-no-reading-order.xml"]) @pytest.mark.parametrize("file", ["table-order-0002.xml", "table-no-reading-order.xml"])
@pytest.mark.integration @pytest.mark.integration
def test_fac_ignoring_reading_order(file): def test_fac_ignoring_reading_order(file):
data_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "data", "table-order"
)
expected = "1\n2\n3\n4\n5\n6\n7\n8\n9" expected = "1\n2\n3\n4\n5\n6\n7\n8\n9"
gt = page_text(ET.parse(os.path.join(data_dir, "table-order-0001.xml"))) gt = page_text(ET.parse(os.path.join(data_dir, "table-order-0001.xml")))
@ -42,6 +41,9 @@ def test_fac_ignoring_reading_order(file):
) )
@pytest.mark.integration @pytest.mark.integration
def test_reading_order_settings(file, expected_text): def test_reading_order_settings(file, expected_text):
data_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "data", "table-order"
)
if "table-unordered.xml" == file: if "table-unordered.xml" == file:
with pytest.raises(NotImplementedError): with pytest.raises(NotImplementedError):
page_text(ET.parse(os.path.join(data_dir, file))) page_text(ET.parse(os.path.join(data_dir, file)))

Loading…
Cancel
Save