Merge pull request #59 from bertsky/change-model-url

update to GH release archive as model URL
1 year ago · 7afe7574b5
parent ae45802f61 01fc36a960
commit 7afe7574b5
10 changed files with 136 additions and 56 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -1,45 +1,31 @@
-version: 2
+version: 2.1

 jobs:
-
-  build-python37:
+  build-python:
+    parameters:
+      python-version:
+        type: string
    docker:
-      - image: python:3.7
+      - image: cimg/python:<< parameters.python-version >>
    steps:
      - checkout
      - restore_cache:
          keys:
            - ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }}
      - run: make install
-      - run: make model
+      - run: make models
      - save_cache:
          key: ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }}
          paths:
            ocrd-resources
-      - run: git submodule update --init
      - run: make test

-  build-python38:
-    docker:
-      - image: python:3.8
-    steps:
-      - checkout
-      - restore_cache:
-          keys:
-            - ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }}
-      - run: make install
-      - run: make model
-      - save_cache:
-          key: ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }}
-          paths:
-            ocrd-resources
-      - run: git submodule update --init
-      - run: make test

 workflows:
-  version: 2
  build:
    jobs:
-      - build-python37
-      - build-python38
+      - build-python:
+          matrix:
+            parameters:
+              python-version: ['3.7', '3.8', '3.9', '3.10']

--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -0,0 +1,43 @@
+# This workflow will install Python dependencies, lint and run tests across a matrix of Python versions
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: CLI Tests
+
+on: [push, pull_request]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.7', '3.8', '3.9', '3.10']
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version:  ${{ matrix.python-version }}
+    - name: Lint with flake8
+      run: |
+        pip install flake8
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Install package
+      run: make install
+    - name: Cache models
+      uses: actions/cache@v3
+      with:
+        key: models
+        path: /home/runner/.local/share/ocrd-resources/ocrd-sbb-binarize/*
+    - name: Install dependencies for test
+      # also downloads models, if not already present
+      run: make models test/assets
+    - name: Run tests
+      run: make test
+    - name: Setup upterm session when failure
+      if: failure()
+      uses: lhotari/action-upterm@v1  
+
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,7 +4,19 @@ Versioned according to [Semantic Versioning](http://semver.org/).

 ## Unreleased

-## [0.0.10] - 2022-10-24
+Fixed:
+
+  * repaired `operation_level=region` (typo)
+  * repaired standalone CLI `--version` (pkg name)
+  * repaired standalone CLI channel format (uint8)
+
+Changed:
+
+  * Trained models loadable and registered in SavedModel format
+  * Test both models, deployed normally (not in CWD)
+  * Test input with actual regions in `operation_level=region`
+
+## [0.0.11] - 2022-10-24

 Added:

--- a/Makefile
+++ b/Makefile
@ -1,16 +1,17 @@
 # BEGIN-EVAL makefile-parser --make-help Makefile

+.PHONY: help install
 help:
 	@echo ""
 	@echo "  Targets"
 	@echo ""
 	@echo "    install  Install with pip"
-	@echo "    model    Downloads the pre-trained models from qurator-data.de"
+	@echo "    models   Downloads the pre-trained models from qurator-data.de"
 	@echo "    test     Run tests"
+	@echo "    clean    Remove copies/results in test/assets"
 	@echo ""
 	@echo "  Variables"
 	@echo ""
-	@echo "    MODEL_DIR  Directory to store models"

 # END-EVAL

@ -19,11 +20,26 @@ install:
 	pip install .

 # Downloads the pre-trained models via the OCR-D resource manager
-.PHONY: model
-model:
-	ocrd resmgr download --allow-uninstalled --location cwd ocrd-sbb-binarize default
+.PHONY: models
+models:
+	ocrd resmgr download ocrd-sbb-binarize "*"
+
+repo/assets/data:
+	git submodule update --init
+
+# Setup test data
+test/assets: repo/assets/data
+	@mkdir -p $@
+	cp -r -t $@ $</*

 # Run tests
-test: model
-	ocrd-sbb-binarize -m repo/assets/data/kant_aufklaerung_1784/data/mets.xml -I OCR-D-IMG -O BIN -P model default
-	ocrd-sbb-binarize -m repo/assets/data/kant_aufklaerung_1784-page-region/data/mets.xml -I OCR-D-IMG -O BIN -P model default -P operation_level region
+.PHONY: test
+test: test/assets models
+	ocrd-sbb-binarize -m test/assets/kant_aufklaerung_1784/data/mets.xml -I OCR-D-IMG -O BIN -P model default
+	ocrd-sbb-binarize -m test/assets/kant_aufklaerung_1784/data/mets.xml -I OCR-D-IMG -O BIN2 -P model default-2021-03-09
+	ocrd-sbb-binarize -m test/assets/kant_aufklaerung_1784-page-region/data/mets.xml -g phys_0001 -I OCR-D-GT-SEG-REGION -O BIN -P model default -P operation_level region
+	ocrd-sbb-binarize -m test/assets/kant_aufklaerung_1784-page-region/data/mets.xml -g phys_0001 -I OCR-D-GT-SEG-REGION -O BIN2 -P model default-2021-03-09 -P operation_level region
+
+.PHONY: clean
+clean:
+	-$(RM) -fr test/assets
--- a/README.md
+++ b/README.md
@ -2,6 +2,10 @@

 > Binarization for document images

+[![pip release](https://img.shields.io/pypi/v/sbb-binarization.svg)](https://pypi.org/project/sbb-binarization/)
+[![CircleCI test](https://circleci.com/gh/qurator-spk/sbb_binarization.svg?style=svg)](https://circleci.com/gh/qurator-spk/sbb_binarization)
+[![GHAction test](https://github.com/qurator-spk/sbb_binarization/actions/workflows/test.yml/badge.svg)](https://github.com/qurator-spk/sbb_binarization/actions/workflows/test.yml)
+
 ## Examples

 <img src="https://user-images.githubusercontent.com/952378/63592437-e433e400-c5b1-11e9-9c2d-889c6e93d748.jpg" width="180"><img src="https://user-images.githubusercontent.com/952378/63592435-e433e400-c5b1-11e9-88e4-3e441b61fa67.jpg" width="180"><img src="https://user-images.githubusercontent.com/952378/63592440-e4cc7a80-c5b1-11e9-8964-2cd1b22c87be.jpg" width="220"><img src="https://user-images.githubusercontent.com/952378/63592438-e4cc7a80-c5b1-11e9-86dc-a9e9f8555422.jpg" width="220">
@ -18,13 +22,19 @@ Clone the repository, enter it and run

 ### Models

-Pre-trained models in  `HDF5` format can be downloaded from here:   
+Pre-trained models in HDF5 format can be downloaded from here:

 https://qurator-data.de/sbb_binarization/

-We also provide a Tensorflow `saved_model` via Huggingface:
+We also provide models in TensorFlow SavedModel format via Hugging Face and GitHub release assets:

 https://huggingface.co/SBB/sbb_binarization
+https://github.com/qurator-spk/sbb_binarization/releases
+
+With [OCR-D](https://ocr-d.de/), you can use the [Resource Manager](https://ocr-d.de/en/models) to deploy models, e.g.
+
+    ocrd resmgr download ocrd-sbb-binarize "*"
+

 ## Usage

@ -39,11 +49,22 @@ Images containing a lot of border noise (black pixels) should be cropped beforeh

 ### Example

-```sh
-sbb_binarize -m /path/to/model/ myimage.tif myimage-bin.tif
-```

-To use the [OCR-D](https://ocr-d.de/) interface:
-```sh
-ocrd-sbb-binarize --overwrite -I INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model "/var/lib/sbb_binarization"
-```
+    sbb_binarize -m /path/to/model/ myimage.tif myimage-bin.tif
+
+
+To use the [OCR-D](https://ocr-d.de/en/spec/cli) interface:
+
+    ocrd-sbb-binarize -I INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model default
+
+
+## Testing
+
+For simple smoke tests, the following will
+- download models
+- download test data
+- run the OCR-D wrapper (on page and region level):
+    
+        make models
+        make test
+    
--- a/sbb_binarize/cli.py
+++ b/sbb_binarize/cli.py
@ -6,7 +6,7 @@ from click import command, option, argument, version_option, types
 from .sbb_binarize import SbbBinarizer

@command()
-@version_option()
+@version_option(package_name="sbb-binarization")
@option('--model-dir', '-m', type=types.Path(exists=True, file_okay=False), required=True, help='directory containing models for prediction')
@argument('input_image')
@argument('output_image')
--- a/sbb_binarize/ocrd-tool.json
+++ b/sbb_binarize/ocrd-tool.json
@ -26,20 +26,20 @@
      },
      "resources": [
        {
-          "url": "https://github.com/apacha/sbb_binarization/releases/download/pre-trained-models/model_2020_01_16.zip",
+          "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip",
          "name": "default",
          "type": "archive",
-          "path_in_archive": "model_2020_01_16",
-          "size": 562917559,
-          "description": "default models provided by github.com/qurator-spk"
+          "path_in_archive": "saved_model_2020_01_16",
+          "size": 563147331,
+          "description": "default models provided by github.com/qurator-spk (SavedModel format)"
        },
        {
-          "url": "https://github.com/apacha/sbb_binarization/releases/download/pre-trained-models/model_2021_03_09.zip",
+          "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip",
          "name": "default-2021-03-09",
          "type": "archive",
          "path_in_archive": ".",
-          "size": 133693693,
-          "description": "updated default models provided by github.com/qurator-spk"
+          "size": 133230419,
+          "description": "updated default models provided by github.com/qurator-spk (SavedModel format)"
        }
      ]
    }
--- a/sbb_binarize/ocrd_cli.py
+++ b/sbb_binarize/ocrd_cli.py
@ -124,7 +124,7 @@ class SbbBinarizeProcessor(Processor):
                    LOG.warning("Page '%s' contains no text/table regions", page_id)
                for region in regions:
                    region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh, feature_filter='binarized')
-                    region_image_bin = cv2pil(binarizer.run(image=pil2cv(region_image)))
+                    region_image_bin = cv2pil(self.binarizer.run(image=pil2cv(region_image)))
                    region_image_bin_path = self.workspace.save_image_file(
                            region_image_bin,
                            "%s_%s.IMG-BIN" % (file_id, region.id),
@ -139,7 +139,7 @@ class SbbBinarizeProcessor(Processor):
                    LOG.warning("Page '%s' contains no text lines", page_id)
                for region_id, line in region_line_tuples:
                    line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized')
-                    line_image_bin = cv2pil(binarizer.run(image=pil2cv(line_image)))
+                    line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image)))
                    line_image_bin_path = self.workspace.save_image_file(
                            line_image_bin,
                            "%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id),
--- a/sbb_binarize/sbb_binarize.py
+++ b/sbb_binarize/sbb_binarize.py
@ -255,9 +255,11 @@ class SbbBinarizer:
            img_fin = (res[:, :] == 0) * 255
            img_last = img_last + img_fin

-        kernel = np.ones((5, 5), np.uint8)
        img_last[:, :][img_last[:, :] > 0] = 255
        img_last = (img_last[:, :] == 0) * 255
        if save:
-            cv2.imwrite(save, img_last)
+            cv2.imwrite(save, img_last.astype(np.uint8), [
+                cv2.IMWRITE_PNG_BILEVEL, 1,
+                cv2.IMWRITE_JPEG_QUALITY, 100
+            ])
        return img_last
--- a/setup.py
+++ b/setup.py
@ -17,7 +17,7 @@ setup(
    author='Vahid Rezanezhad',
    url='https://github.com/qurator-spk/sbb_binarization',
    license='Apache License 2.0',
-    packages=find_packages(exclude=('tests', 'docs')),
+    packages=find_packages(exclude=('test', 'repo')),
    include_package_data=True,
    package_data={'': ['*.json', '*.yml', '*.yaml']},
    install_requires=install_requires,