From 44bd4b5eda29a59f5f02dd8ceb2eef39d21d924d Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 2 Jan 2024 20:38:40 +0100 Subject: [PATCH 1/2] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d0ae66d..4f3562a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,26 +11,26 @@ repos: - id: check-ast - repo: https://github.com/psf/black - rev: 23.10.0 + rev: 23.12.1 hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.1 + rev: v0.1.10 hooks: - args: - - --fix - - --exit-non-zero-on-fix + - --fix + - --exit-non-zero-on-fix id: ruff - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.6.1 + rev: v1.8.0 hooks: - additional_dependencies: - - types-setuptools + - types-setuptools id: mypy - repo: https://gitlab.com/vojko.pribudic/pre-commit-update - rev: v0.1.0 + rev: v0.1.1 hooks: - id: pre-commit-update From c1681551af19922c8fa4164ea76cccdde832c708 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 3 Jan 2024 19:21:53 +0100 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=90=9B=20Fix=20generating=20word=20di?= =?UTF-8?q?fferences?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py index e542697..99403ba 100644 --- a/src/dinglehopper/cli.py +++ b/src/dinglehopper/cli.py @@ -122,9 +122,11 @@ def process( gt_text = extract(gt, textequiv_level=textequiv_level) ocr_text = extract(ocr, textequiv_level=textequiv_level) - gt_words = words_normalized(gt_text) - ocr_words = words_normalized(ocr_text) + gt_words: list = list(words_normalized(gt_text)) + ocr_words: list = list(words_normalized(ocr_text)) + assert isinstance(gt_text, ExtractedText) + assert isinstance(ocr_text, ExtractedText) cer, n_characters = character_error_rate_n(gt_text, ocr_text) char_diff_report, diff_c = gen_diff_report( gt_text, @@ -136,6 +138,10 @@ def process( differences=differences, ) + # {gt,ocr}_words must not be a generator, so we don't drain it for the differences + # report. + assert isinstance(gt_words, list) + assert isinstance(ocr_words, list) wer, n_words = word_error_rate_n(gt_words, ocr_words) word_diff_report, diff_w = gen_diff_report( gt_words,