diff --git a/.circleci/config.yml b/.circleci/config.yml index 5aeda5c..919d30f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,45 +1,31 @@ -version: 2 +version: 2.1 jobs: - - build-python37: + build-python: + parameters: + python-version: + type: string docker: - - image: python:3.7 + - image: cimg/python:<< parameters.python-version >> steps: - checkout - restore_cache: keys: - ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} - run: make install - - run: make model + - run: make models - save_cache: key: ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} paths: ocrd-resources - - run: git submodule update --init - run: make test - build-python38: - docker: - - image: python:3.8 - steps: - - checkout - - restore_cache: - keys: - - ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} - - run: make install - - run: make model - - save_cache: - key: ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} - paths: - ocrd-resources - - run: git submodule update --init - - run: make test workflows: - version: 2 build: jobs: - - build-python37 - - build-python38 + - build-python: + matrix: + parameters: + python-version: ['3.7', '3.8', '3.9', '3.10'] diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..8b90582 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,43 @@ +# This workflow will install Python dependencies, run tests and lint with a matrix of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: CLI Tests + +on: [push, pull_request] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.7', '3.8', '3.9', '3.10'] + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Lint 
with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Install package + run: make install + - name: Cache models + uses: actions/cache@v3 + with: + key: models + path: /home/runner/.local/share/ocrd-resources/ocrd-sbb-binarize/* + - name: Install dependencies for test + # also downloads models, if not already present + run: make models test/assets + - name: Run tests + run: make test + - name: Setup upterm session when failure + if: failure() + uses: lhotari/action-upterm@v1 + diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ec8078..161ed31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,19 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased -## [0.0.10] - 2022-10-24 +Fixed: + + * repaired `operation_level=region` (typo) + * repaired standalone CLI `--version` (pkg name) + * repaired standalone CLI channel format (uint8) + +Changed: + + * Trained models loadable and registered in SavedModel format + * Test both models, deployed normally (not in CWD) + * Test input with actual regions in `operation_level=region` + +## [0.0.11] - 2022-10-24 Added: diff --git a/Makefile b/Makefile index 7c26e8b..e4f5b87 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,17 @@ # BEGIN-EVAL makefile-parser --make-help Makefile +.PHONY: help install help: @echo "" @echo " Targets" @echo "" @echo " install Install with pip" - @echo " model Downloads the pre-trained models from qurator-data.de" + @echo " models Downloads the pre-trained models from qurator-data.de" @echo " test Run tests" + @echo " clean Remove copies/results in test/assets" @echo "" @echo " Variables" @echo "" - @echo " MODEL_DIR Directory to store models" # END-EVAL @@ -19,11 +20,26 @@ 
install: pip install . # Downloads the pre-trained models from qurator-data.de -.PHONY: model -model: - ocrd resmgr download --allow-uninstalled --location cwd ocrd-sbb-binarize default +.PHONY: models +models: + ocrd resmgr download ocrd-sbb-binarize "*" + +repo/assets/data: + git submodule update --init + +# Setup test data +test/assets: repo/assets/data + @mkdir -p $@ + cp -r -t $@ $ Binarization for document images +[![pip release](https://img.shields.io/pypi/v/sbb-binarization.svg)](https://pypi.org/project/sbb-binarization/) +[![CircleCI test](https://circleci.com/gh/qurator-spk/sbb_binarization.svg?style=svg)](https://circleci.com/gh/qurator-spk/sbb_binarization) +[![GHAction test](https://github.com/qurator-spk/sbb_binarization/actions/workflows/test.yml/badge.svg)](https://github.com/qurator-spk/sbb_binarization/actions/workflows/test.yml) + ## Examples @@ -18,13 +22,19 @@ Clone the repository, enter it and run ### Models -Pre-trained models in `HDF5` format can be downloaded from here: +Pre-trained models in HDF5 format can be downloaded from here: https://qurator-data.de/sbb_binarization/ -We also provide a Tensorflow `saved_model` via Huggingface: +We also provide models in Tensorflow SavedModel format via Huggingface and Github release assets: https://huggingface.co/SBB/sbb_binarization +https://github.com/qurator-spk/sbb_binarization/releases + +With [OCR-D](https://ocr-d.de/), you can use the [Resource Manager](https://ocr-d.de/en/models) to deploy models, e.g. 
+ + ocrd resmgr download ocrd-sbb-binarize "*" + ## Usage @@ -39,11 +49,22 @@ Images containing a lot of border noise (black pixels) should be cropped beforeh ### Example -```sh -sbb_binarize -m /path/to/model/ myimage.tif myimage-bin.tif -``` -To use the [OCR-D](https://ocr-d.de/) interface: -```sh -ocrd-sbb-binarize --overwrite -I INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model "/var/lib/sbb_binarization" -``` + sbb_binarize -m /path/to/model/ myimage.tif myimage-bin.tif + + +To use the [OCR-D](https://ocr-d.de/en/spec/cli) interface: + + ocrd-sbb-binarize -I INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model default + + +## Testing + +For simple smoke tests, the following will +- download models +- download test data +- run the OCR-D wrapper (on page and region level): + + make models + make test + \ No newline at end of file diff --git a/sbb_binarize/cli.py b/sbb_binarize/cli.py index ddfbde6..16145bb 100644 --- a/sbb_binarize/cli.py +++ b/sbb_binarize/cli.py @@ -6,7 +6,7 @@ from click import command, option, argument, version_option, types from .sbb_binarize import SbbBinarizer @command() -@version_option() +@version_option(package_name="sbb-binarization") @option('--model-dir', '-m', type=types.Path(exists=True, file_okay=False), required=True, help='directory containing models for prediction') @argument('input_image') @argument('output_image') diff --git a/sbb_binarize/ocrd-tool.json b/sbb_binarize/ocrd-tool.json index 9148309..beb74a1 100644 --- a/sbb_binarize/ocrd-tool.json +++ b/sbb_binarize/ocrd-tool.json @@ -26,20 +26,20 @@ }, "resources": [ { - "url": "https://github.com/apacha/sbb_binarization/releases/download/pre-trained-models/model_2020_01_16.zip", + "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip", "name": "default", "type": "archive", - "path_in_archive": "model_2020_01_16", - "size": 562917559, - "description": "default models provided by github.com/qurator-spk" + "path_in_archive": 
"saved_model_2020_01_16", + "size": 563147331, + "description": "default models provided by github.com/qurator-spk (SavedModel format)" }, { - "url": "https://github.com/apacha/sbb_binarization/releases/download/pre-trained-models/model_2021_03_09.zip", + "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip", "name": "default-2021-03-09", "type": "archive", "path_in_archive": ".", - "size": 133693693, - "description": "updated default models provided by github.com/qurator-spk" + "size": 133230419, + "description": "updated default models provided by github.com/qurator-spk (SavedModel format)" } ] } diff --git a/sbb_binarize/ocrd_cli.py b/sbb_binarize/ocrd_cli.py index 44a001f..45c9fee 100644 --- a/sbb_binarize/ocrd_cli.py +++ b/sbb_binarize/ocrd_cli.py @@ -124,7 +124,7 @@ class SbbBinarizeProcessor(Processor): LOG.warning("Page '%s' contains no text/table regions", page_id) for region in regions: region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh, feature_filter='binarized') - region_image_bin = cv2pil(binarizer.run(image=pil2cv(region_image))) + region_image_bin = cv2pil(self.binarizer.run(image=pil2cv(region_image))) region_image_bin_path = self.workspace.save_image_file( region_image_bin, "%s_%s.IMG-BIN" % (file_id, region.id), @@ -139,7 +139,7 @@ class SbbBinarizeProcessor(Processor): LOG.warning("Page '%s' contains no text lines", page_id) for region_id, line in region_line_tuples: line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') - line_image_bin = cv2pil(binarizer.run(image=pil2cv(line_image))) + line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image))) line_image_bin_path = self.workspace.save_image_file( line_image_bin, "%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id), diff --git a/sbb_binarize/sbb_binarize.py b/sbb_binarize/sbb_binarize.py index 0c87ca0..5533b8d 100644 --- 
a/sbb_binarize/sbb_binarize.py +++ b/sbb_binarize/sbb_binarize.py @@ -255,9 +255,11 @@ class SbbBinarizer: img_fin = (res[:, :] == 0) * 255 img_last = img_last + img_fin - kernel = np.ones((5, 5), np.uint8) img_last[:, :][img_last[:, :] > 0] = 255 img_last = (img_last[:, :] == 0) * 255 if save: - cv2.imwrite(save, img_last) + cv2.imwrite(save, img_last.astype(np.uint8), [ + cv2.IMWRITE_PNG_BILEVEL, 1, + cv2.IMWRITE_JPEG_QUALITY, 100 + ]) return img_last diff --git a/setup.py b/setup.py index 7ab6e02..437730c 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( author='Vahid Rezanezhad', url='https://github.com/qurator-spk/sbb_binarization', license='Apache License 2.0', - packages=find_packages(exclude=('tests', 'docs')), + packages=find_packages(exclude=('test', 'repo')), include_package_data=True, package_data={'': ['*.json', '*.yml', '*.yaml']}, install_requires=install_requires,