From 4ceed759ad83336527280fb3892925d2311dcd9b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 12:44:47 +0100 Subject: [PATCH 1/5] processor: self.resolve_resource model in addition to SBB_BINARIZE_DATA --- requirements.txt | 2 +- sbb_binarize/ocrd_cli.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6b012f7..85fd500 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy >= 1.17.0, < 1.19.0 setuptools >= 41 opencv-python-headless -ocrd >= 2.18.0 +ocrd >= 2.22.3 keras >= 2.3.1, < 2.4 h5py < 3 tensorflow-gpu >= 1.15, < 1.16 diff --git a/sbb_binarize/ocrd_cli.py b/sbb_binarize/ocrd_cli.py index 7d9a7d5..34f16b3 100644 --- a/sbb_binarize/ocrd_cli.py +++ b/sbb_binarize/ocrd_cli.py @@ -40,15 +40,17 @@ class SbbBinarizeProcessor(Processor): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] if not(kwargs.get('show_help', None) or kwargs.get('dump_json', None) or kwargs.get('show_version')): + LOG = getLogger('processor.SbbBinarize.__init__') if not 'model' in kwargs['parameter']: raise ValueError("'model' parameter is required") model_path = Path(kwargs['parameter']['model']) if not model_path.is_absolute(): - if 'SBB_BINARIZE_DATA' in environ: + if 'SBB_BINARIZE_DATA' in environ and environ['SBB_BINARIZE_DATA']: + LOG.info("Environment variable SBB_BINARIZE_DATA is set to '%s' - prepending to model value '%s'. If you don't want this mechanism, unset the SBB_BINARIZE_DATA environment variable.", environ['SBB_BINARIZE_DATA'], model_path) model_path = Path(environ['SBB_BINARIZE_DATA']).joinpath(model_path) model_path = model_path.resolve() - if not model_path.is_dir(): - raise FileNotFoundError("Does not exist or is not a directory: %s" % model_path) + if not model_path.is_dir(): + raise FileNotFoundError("Does not exist or is not a directory: %s" % model_path) kwargs['parameter']['model'] = str(model_path) super().__init__(*args, **kwargs) @@ -61,7 +63,7 @@ class SbbBinarizeProcessor(Processor): assert_file_grp_cardinality(self.output_file_grp, 1) oplevel = self.parameter['operation_level'] - model_path = self.parameter['model'] # pylint: disable=attribute-defined-outside-init + model_path = self.resolve_resource(self.parameter['model']) binarizer = SbbBinarizer(model_dir=model_path, logger=LOG) for n, input_file in enumerate(self.input_files): From c46d5d8b9cb1e1437134eebdc96de8ed924905d0 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 13:54:42 +0100 Subject: [PATCH 2/5] fix test {level-of-operation,operation_level} --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 95ddbfe..7d96752 100644 --- a/Makefile +++ b/Makefile @@ -33,4 +33,4 @@ models.tar.gz: # Run tests test: model cd repo/assets/data/kant_aufklaerung_1784/data; ocrd-sbb-binarize -I OCR-D-IMG -O BIN -P model $(MODEL_DIR) - cd repo/assets/data/kant_aufklaerung_1784-page-region/data; ocrd-sbb-binarize -I OCR-D-IMG -O BIN -P model $(MODEL_DIR) -P level-of-operation region + cd repo/assets/data/kant_aufklaerung_1784-page-region/data; ocrd-sbb-binarize -I OCR-D-IMG -O BIN -P model $(MODEL_DIR) -P operation_level region From fc6baf589717979d02d2094a9bec20e759ff1952 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 14:01:55 +0100 Subject: [PATCH 3/5] use resmgr for model download --- .circleci/config.yml | 18 ++++++++---------- Makefile | 17 +++++------------ 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ca93957..8f02829 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -9,14 +9,13 @@ jobs: - checkout - restore_cache: keys: - - model-cache + - ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} + - run: make install - run: make model - save_cache: - key: model-cache + key: ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} paths: - models.tar.gz - models - - run: make install + ocrd-resources - run: git submodule update --init - run: make test @@ -27,14 +26,13 @@ jobs: - checkout - restore_cache: keys: - - model-cache + - ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} + - run: make install - run: make model - save_cache: - key: model-cache + key: ocrd-resources-{{ checksum "requirements.txt" }}-{{ checksum "Makefile" }} paths: - models.tar.gz - models - - run: make install + ocrd-resources - run: git submodule update --init - run: make test diff --git a/Makefile b/Makefile index 7d96752..8c44a32 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,3 @@ -# Directory to store models -MODEL_DIR = $(PWD)/models - # BEGIN-EVAL makefile-parser --make-help Makefile help: @@ -22,15 +19,11 @@ install: pip install . # Downloads the pre-trained models from qurator-data.de -model: $(MODEL_DIR)/model1_bin.h5 - -$(MODEL_DIR)/model1_bin.h5: models.tar.gz - tar xf models.tar.gz - -models.tar.gz: - wget 'https://qurator-data.de/sbb_binarization/models.tar.gz' +.PHONY: model +model: + ocrd resmgr download --allow-uninstalled --location cwd ocrd-sbb-binarize default # Run tests test: model - cd repo/assets/data/kant_aufklaerung_1784/data; ocrd-sbb-binarize -I OCR-D-IMG -O BIN -P model $(MODEL_DIR) - cd repo/assets/data/kant_aufklaerung_1784-page-region/data; ocrd-sbb-binarize -I OCR-D-IMG -O BIN -P model $(MODEL_DIR) -P operation_level region + cd repo/assets/data/kant_aufklaerung_1784/data; ocrd-sbb-binarize -I OCR-D-IMG -O BIN -P model default + cd repo/assets/data/kant_aufklaerung_1784-page-region/data; ocrd-sbb-binarize -I OCR-D-IMG -O BIN -P model default -P operation_level region From a3b1e721b2c67348f0b850eda507995feebbe41c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 14:16:02 +0100 Subject: [PATCH 4/5] ocrd-cli: fix indentation --- sbb_binarize/ocrd_cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sbb_binarize/ocrd_cli.py b/sbb_binarize/ocrd_cli.py index 34f16b3..7f88dfc 100644 --- a/sbb_binarize/ocrd_cli.py +++ b/sbb_binarize/ocrd_cli.py @@ -48,9 +48,9 @@ class SbbBinarizeProcessor(Processor): if 'SBB_BINARIZE_DATA' in environ and environ['SBB_BINARIZE_DATA']: LOG.info("Environment variable SBB_BINARIZE_DATA is set to '%s' - prepending to model value '%s'. If you don't want this mechanism, unset the SBB_BINARIZE_DATA environment variable.", environ['SBB_BINARIZE_DATA'], model_path) model_path = Path(environ['SBB_BINARIZE_DATA']).joinpath(model_path) - model_path = model_path.resolve() - if not model_path.is_dir(): - raise FileNotFoundError("Does not exist or is not a directory: %s" % model_path) + model_path = model_path.resolve() + if not model_path.is_dir(): + raise FileNotFoundError("Does not exist or is not a directory: %s" % model_path) kwargs['parameter']['model'] = str(model_path) super().__init__(*args, **kwargs) From 9636ba24c8a9d4bcc35046b31303602704ad5ce6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 14:19:22 +0100 Subject: [PATCH 5/5] fix test --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8c44a32..7c26e8b 100644 --- a/Makefile +++ b/Makefile @@ -25,5 +25,5 @@ model: # Run tests test: model - cd repo/assets/data/kant_aufklaerung_1784/data; ocrd-sbb-binarize -I OCR-D-IMG -O BIN -P model default - cd repo/assets/data/kant_aufklaerung_1784-page-region/data; ocrd-sbb-binarize -I OCR-D-IMG -O BIN -P model default -P operation_level region + ocrd-sbb-binarize -m repo/assets/data/kant_aufklaerung_1784/data/mets.xml -I OCR-D-IMG -O BIN -P model default + ocrd-sbb-binarize -m repo/assets/data/kant_aufklaerung_1784-page-region/data/mets.xml -I OCR-D-IMG -O BIN -P model default -P operation_level region