From 020b283cc38916ceb6f1fc07b7a1882f1c2afdeb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Apr 2023 19:51:09 +0200 Subject: [PATCH 01/17] update to GH release archive as model URL --- sbb_binarize/ocrd-tool.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sbb_binarize/ocrd-tool.json b/sbb_binarize/ocrd-tool.json index 9148309..7d2e6ac 100644 --- a/sbb_binarize/ocrd-tool.json +++ b/sbb_binarize/ocrd-tool.json @@ -31,15 +31,15 @@ "type": "archive", "path_in_archive": "model_2020_01_16", "size": 562917559, - "description": "default models provided by github.com/qurator-spk" + "description": "default models provided by github.com/qurator-spk (SavedModel format)" }, { - "url": "https://github.com/apacha/sbb_binarization/releases/download/pre-trained-models/model_2021_03_09.zip", + "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip", "name": "default-2021-03-09", "type": "archive", "path_in_archive": ".", - "size": 133693693, - "description": "updated default models provided by github.com/qurator-spk" + "size": 133230419, + "description": "updated default models provided by github.com/qurator-spk (SavedModel format)" } ] } From 5d5579957d9e1edea0666e7c7848b9e1c6fde555 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Apr 2023 22:14:25 +0200 Subject: [PATCH 02/17] update to GH release archive as model URL (older model) --- sbb_binarize/ocrd-tool.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sbb_binarize/ocrd-tool.json b/sbb_binarize/ocrd-tool.json index 7d2e6ac..beb74a1 100644 --- a/sbb_binarize/ocrd-tool.json +++ b/sbb_binarize/ocrd-tool.json @@ -26,11 +26,11 @@ }, "resources": [ { - "url": "https://github.com/apacha/sbb_binarization/releases/download/pre-trained-models/model_2020_01_16.zip", + "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip", "name": "default", 
"type": "archive", - "path_in_archive": "model_2020_01_16", - "size": 562917559, + "path_in_archive": "saved_model_2020_01_16", + "size": 563147331, "description": "default models provided by github.com/qurator-spk (SavedModel format)" }, { From 8a12d40769c1b3ac9e520b1e593ad43543f832d3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Apr 2023 22:41:01 +0200 Subject: [PATCH 03/17] fix region level --- sbb_binarize/ocrd_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sbb_binarize/ocrd_cli.py b/sbb_binarize/ocrd_cli.py index 44a001f..45c9fee 100644 --- a/sbb_binarize/ocrd_cli.py +++ b/sbb_binarize/ocrd_cli.py @@ -124,7 +124,7 @@ class SbbBinarizeProcessor(Processor): LOG.warning("Page '%s' contains no text/table regions", page_id) for region in regions: region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh, feature_filter='binarized') - region_image_bin = cv2pil(binarizer.run(image=pil2cv(region_image))) + region_image_bin = cv2pil(self.binarizer.run(image=pil2cv(region_image))) region_image_bin_path = self.workspace.save_image_file( region_image_bin, "%s_%s.IMG-BIN" % (file_id, region.id), @@ -139,7 +139,7 @@ class SbbBinarizeProcessor(Processor): LOG.warning("Page '%s' contains no text lines", page_id) for region_id, line in region_line_tuples: line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') - line_image_bin = cv2pil(binarizer.run(image=pil2cv(line_image))) + line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image))) line_image_bin_path = self.workspace.save_image_file( line_image_bin, "%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id), From 5b1634d6b3fe3c15268fc31ba6564f0b260d38cf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Apr 2023 22:55:12 +0200 Subject: [PATCH 04/17] improve/update tests --- Makefile | 32 ++++++++++++++++++++++++-------- README.md | 32 ++++++++++++++++++++++++-------- 2 
files changed, 48 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 7c26e8b..02fef8a 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,17 @@ # BEGIN-EVAL makefile-parser --make-help Makefile +.PHONY: help install help: @echo "" @echo " Targets" @echo "" @echo " install Install with pip" - @echo " model Downloads the pre-trained models from qurator-data.de" + @echo " models Downloads the pre-trained models from qurator-data.de" @echo " test Run tests" + @echo " clean Remove copies/results in test/assets" @echo "" @echo " Variables" @echo "" - @echo " MODEL_DIR Directory to store models" # END-EVAL @@ -19,11 +20,26 @@ install: pip install . # Downloads the pre-trained models from qurator-data.de -.PHONY: model -model: - ocrd resmgr download --allow-uninstalled --location cwd ocrd-sbb-binarize default +.PHONY: models +models: + ocrd resmgr download ocrd-sbb-binarize "*" + +repo/assets: + git submodule update --init repo/assets + +# Setup test data +test/assets: repo/assets + @mkdir -p $@ + cp -r -t $@ repo/assets/data/* # Run tests -test: model - ocrd-sbb-binarize -m repo/assets/data/kant_aufklaerung_1784/data/mets.xml -I OCR-D-IMG -O BIN -P model default - ocrd-sbb-binarize -m repo/assets/data/kant_aufklaerung_1784-page-region/data/mets.xml -I OCR-D-IMG -O BIN -P model default -P operation_level region +.PHONY: test +test: test/assets models + ocrd-sbb-binarize -m test/assets/kant_aufklaerung_1784/data/mets.xml -I OCR-D-IMG -O BIN -P model default + ocrd-sbb-binarize -m test/assets/kant_aufklaerung_1784/data/mets.xml -I OCR-D-IMG -O BIN2 -P model default-2021-03-09 + ocrd-sbb-binarize -m test/assets/kant_aufklaerung_1784-page-region/data/mets.xml -g phys_0001 -I OCR-D-GT-SEG-REGION -O BIN -P model default -P operation_level region + ocrd-sbb-binarize -m test/assets/kant_aufklaerung_1784-page-region/data/mets.xml -g phys_0001 -I OCR-D-GT-SEG-REGION -O BIN2 -P model default-2021-03-09 -P operation_level region + +.PHONY: clean +clean: + -$(RM) -fr 
test/assets diff --git a/README.md b/README.md index 8ec4e4b..c150683 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Clone the repository, enter it and run ### Models -Pre-trained models in `HDF5` format can be downloaded from here: +Pre-trained models in HDF5 format can be downloaded from here: https://qurator-data.de/sbb_binarization/ @@ -26,6 +26,11 @@ We also provide a Tensorflow `saved_model` via Huggingface: https://huggingface.co/SBB/sbb_binarization +With [OCR-D](https://ocr-d.de/), you can use the [Resource Manager](Tensorflow SavedModel) to deploy models, e.g. + + ocrd resmgr download ocrd-sbb-binarize "*" + + ## Usage ```sh @@ -39,11 +44,22 @@ Images containing a lot of border noise (black pixels) should be cropped beforeh ### Example -```sh -sbb_binarize -m /path/to/model/ myimage.tif myimage-bin.tif -``` -To use the [OCR-D](https://ocr-d.de/) interface: -```sh -ocrd-sbb-binarize --overwrite -I INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model "/var/lib/sbb_binarization" -``` + sbb_binarize -m /path/to/model/ myimage.tif myimage-bin.tif + + +To use the [OCR-D](https://ocr-d.de/en/spec/cli) interface: + + ocrd-sbb-binarize -I INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model default + + +## Testing + +For simple smoke tests, the following will +- download models +- download test data +- run the OCR-D wrapper (on page and region level): + + + make model + make test From 9a6730ef3aa5cc2b77c25e453cc627b25188700d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Apr 2023 22:58:33 +0200 Subject: [PATCH 05/17] add CI :fingers_crossed: --- .github/workflows/test.yml | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..4146b0c --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,41 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of 
Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: CLI Tests + +on: [push, pull_request] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.7', '3.8', '3.9'] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Lint with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Install package + run: make install + - name: Cache models + uses: actions/cache@v3 + with: + key: detectron-models + path: /home/runner/.local/share/ocrd-resources/ocrd-sbb-binarize/* + - name: Install dependencies for test + # also downloads models, if not already present + run: make models test/assets + - name: Run tests + run: make test + From 30e467cd4c1ebbc7960be38264034b99b3eb26bf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 12 Apr 2023 23:28:34 +0200 Subject: [PATCH 06/17] update CircleCI --- .circleci/config.yml | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5aeda5c..747fbb7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,16 +2,19 @@ version: 2 jobs: - build-python37: - docker: - - image: python:3.7 + build-python: + parameters: + python-version: + type: string + docker: + - image: cimg/python:<< parameters.python-version >> steps: - checkout - restore_cache: keys: - ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} - run: make install - - run: make model + - 
run: make models - save_cache: key: ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} paths: @@ -19,27 +22,13 @@ jobs: - run: git submodule update --init - run: make test - build-python38: - docker: - - image: python:3.8 - steps: - - checkout - - restore_cache: - keys: - - ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} - - run: make install - - run: make model - - save_cache: - key: ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} - paths: - ocrd-resources - - run: git submodule update --init - - run: make test workflows: version: 2 build: jobs: - - build-python37 - - build-python38 + - build-python + matrix: + parameters: + python-version: ['3.7', '3.8', '3.9', '3.10'] From ac9bebf2b55faff1ee4fb7a4a13a942235ca5402 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 12 Apr 2023 23:31:23 +0200 Subject: [PATCH 07/17] Update Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 02fef8a..a5be943 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ models: ocrd resmgr download ocrd-sbb-binarize "*" repo/assets: - git submodule update --init repo/assets + git submodule update --init # Setup test data test/assets: repo/assets From 237900f69d5902c926bb893bd2b309d83bfb8a6e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 12 Apr 2023 23:41:03 +0200 Subject: [PATCH 08/17] GHA CI: add debug session --- .github/workflows/test.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4146b0c..8b90582 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,8 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.7', '3.8', '3.9'] - + python-version: ['3.7', '3.8', '3.9', '3.10'] steps: - uses: 
actions/checkout@v3 - name: Set up Python @@ -31,11 +30,14 @@ jobs: - name: Cache models uses: actions/cache@v3 with: - key: detectron-models + key: models path: /home/runner/.local/share/ocrd-resources/ocrd-sbb-binarize/* - name: Install dependencies for test # also downloads models, if not already present run: make models test/assets - name: Run tests run: make test + - name: Setup upterm session when failure + if: failure() + uses: lhotari/action-upterm@v1 From b79ea56b227f05083add4e40e97f9810057b81ee Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 12 Apr 2023 23:46:52 +0200 Subject: [PATCH 09/17] GHA: clone submodules --- .github/workflows/test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8b90582..17491a2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,6 +32,8 @@ jobs: with: key: models path: /home/runner/.local/share/ocrd-resources/ocrd-sbb-binarize/* + - name: Clone submodules + run: git submodule update --init - name: Install dependencies for test # also downloads models, if not already present run: make models test/assets From ad7164f8e93c11fafd1235caaa86468488f601fd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 12 Apr 2023 23:49:04 +0200 Subject: [PATCH 10/17] Updated config.yml --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 747fbb7..f86b4e1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ jobs: parameters: python-version: type: string - docker: + docker: - image: cimg/python:<< parameters.python-version >> steps: - checkout @@ -27,7 +27,7 @@ workflows: version: 2 build: jobs: - - build-python + - build-python: matrix: parameters: python-version: ['3.7', '3.8', '3.9', '3.10'] From 
f0ae1b77d95cfd119c1084ad248a49a96b8e882b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 12 Apr 2023 23:54:16 +0200 Subject: [PATCH 11/17] CircleCI: try to fix syntax --- .circleci/config.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f86b4e1..edd6b61 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,7 +1,6 @@ -version: 2 +version: 2.1 jobs: - build-python: parameters: python-version: @@ -24,7 +23,6 @@ jobs: workflows: - version: 2 build: jobs: - build-python: From 0b3d07a4c43cc3370dfe1d1be9436cff6434c671 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 13 Apr 2023 10:50:06 +0200 Subject: [PATCH 12/17] add badges --- README.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c150683..677d3c4 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,10 @@ > Binarization for document images +[![pip release](https://img.shields.io/pypi/v/sbb-binarization.svg)](https://pypi.org/project/sbb-binarization/) +[![CircleCI test](https://circleci.com/gh/qurator-spk/sbb_binarization.svg?style=svg)](https://circleci.com/gh/qurator-spk/sbb_binarization) +[![GHAction test](https://github.com/qurator-spk/sbb_binarization/actions/workflows/test.yml/badge.svg)](https://github.com/qurator-spk/sbb_binarization/actions/workflows/test.yml) + ## Examples @@ -22,9 +26,10 @@ Pre-trained models in HDF5 format can be downloaded from here: https://qurator-data.de/sbb_binarization/ -We also provide a Tensorflow `saved_model` via Huggingface: +We also provide models in Tensorflow SavedModel format via Huggingface and Github release assets: https://huggingface.co/SBB/sbb_binarization +https://github.com/qurator-spk/sbb_binarization/releases With [OCR-D](https://ocr-d.de/), you can use the [Resource Manager](Tensorflow SavedModel) to deploy models, e.g. 
@@ -59,7 +64,7 @@ For simple smoke tests, the following will - download models - download test data - run the OCR-D wrapper (on page and region level): - - - make model - make test + + make models + make test + \ No newline at end of file From 6a28ece9410f43a32c15781e7d92b88215d39504 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 13 Apr 2023 10:50:16 +0200 Subject: [PATCH 13/17] update --- CHANGELOG.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ec8078..fd95e15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,17 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased -## [0.0.10] - 2022-10-24 +Fixed: + + * repaired `operation_level=region` (typo) + +Changed: + + * Trained models loadable and registered in SavedModel format + * Test both models, deployed normally (not in CWD) + * Test input with actual regions in `operation_level=region` + +## [0.0.11] - 2022-10-24 Added: From 7d8f293f2fc7f20aa4319816171ae8a3022751ea Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 13 Apr 2023 11:07:10 +0200 Subject: [PATCH 14/17] fix standalone CLI version_option --- sbb_binarize/cli.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sbb_binarize/cli.py b/sbb_binarize/cli.py index ddfbde6..16145bb 100644 --- a/sbb_binarize/cli.py +++ b/sbb_binarize/cli.py @@ -6,7 +6,7 @@ from click import command, option, argument, version_option, types from .sbb_binarize import SbbBinarizer @command() -@version_option() +@version_option(package_name="sbb-binarization") @option('--model-dir', '-m', type=types.Path(exists=True, file_okay=False), required=True, help='directory containing models for prediction') @argument('input_image') @argument('output_image') diff --git a/setup.py b/setup.py index 7ab6e02..437730c 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( author='Vahid Rezanezhad', 
url='https://github.com/qurator-spk/sbb_binarization', license='Apache License 2.0', - packages=find_packages(exclude=('tests', 'docs')), + packages=find_packages(exclude=('test', 'repo')), include_package_data=True, package_data={'': ['*.json', '*.yml', '*.yaml']}, install_requires=install_requires, From 0bcb171240937c135a0d0d41a02fe2b6632a3243 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 13 Apr 2023 12:54:02 +0200 Subject: [PATCH 15/17] standalone cv2.imwrite: use uint8 / bilevel --- sbb_binarize/sbb_binarize.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sbb_binarize/sbb_binarize.py b/sbb_binarize/sbb_binarize.py index 0c87ca0..5533b8d 100644 --- a/sbb_binarize/sbb_binarize.py +++ b/sbb_binarize/sbb_binarize.py @@ -255,9 +255,11 @@ class SbbBinarizer: img_fin = (res[:, :] == 0) * 255 img_last = img_last + img_fin - kernel = np.ones((5, 5), np.uint8) img_last[:, :][img_last[:, :] > 0] = 255 img_last = (img_last[:, :] == 0) * 255 if save: - cv2.imwrite(save, img_last) + cv2.imwrite(save, img_last.astype(np.uint8), [ + cv2.IMWRITE_PNG_BILEVEL, 1, + cv2.IMWRITE_JPEG_QUALITY, 100 + ]) return img_last From b1bcc7f0422ea5e7d5348a7629bbe5e6170ad8b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 13 Apr 2023 12:55:07 +0200 Subject: [PATCH 16/17] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd95e15..161ed31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
Fixed: * repaired `operation_level=region` (typo) + * repaired standalone CLI `--version` (pkg name) + * repaired standalone CLI channel format (uint8) Changed: From 01fc36a9604426131865d452271e9193697afa36 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 13 Apr 2023 21:12:34 +0200 Subject: [PATCH 17/17] make git submodule update depend on subdir --- .circleci/config.yml | 1 - .github/workflows/test.yml | 2 -- Makefile | 6 +++--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index edd6b61..919d30f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -18,7 +18,6 @@ jobs: key: ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} paths: ocrd-resources - - run: git submodule update --init - run: make test diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 17491a2..8b90582 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,8 +32,6 @@ jobs: with: key: models path: /home/runner/.local/share/ocrd-resources/ocrd-sbb-binarize/* - - name: Clone submodules - run: git submodule update --init - name: Install dependencies for test # also downloads models, if not already present run: make models test/assets diff --git a/Makefile b/Makefile index a5be943..e4f5b87 100644 --- a/Makefile +++ b/Makefile @@ -24,13 +24,13 @@ install: models: ocrd resmgr download ocrd-sbb-binarize "*" -repo/assets: +repo/assets/data: git submodule update --init # Setup test data -test/assets: repo/assets +test/assets: repo/assets/data @mkdir -p $@ - cp -r -t $@ repo/assets/data/* + cp -r -t $@ $