From 1715f0d8b3af730ad4f2f0ed5c09890a755fe0cd Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 13 Apr 2021 16:03:58 +0200 Subject: [PATCH 01/18] allow overriding DPI --- qurator/eynollah/eynollah.py | 11 ++++++----- qurator/eynollah/utils/pil_cv2.py | 14 ++++++++------ 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 7510654..07812b2 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -92,7 +92,8 @@ class Eynollah: curved_line=False, full_layout=False, allow_scaling=False, - headers_off=False + headers_off=False, + override_dpi=None, ): self.image_filename = image_filename self.dir_out = dir_out @@ -102,6 +103,7 @@ class Eynollah: self.full_layout = full_layout self.allow_scaling = allow_scaling self.headers_off = headers_off + self.override_dpi = override_dpi if not self.image_filename_stem: self.image_filename_stem = Path(Path(image_filename).name).stem self.plotter = None if not enable_plotting else EynollahPlotter( @@ -346,10 +348,9 @@ class Eynollah: def resize_and_enhance_image_with_column_classifier(self): self.logger.debug("enter resize_and_enhance_image_with_column_classifier") - try: - dpi = check_dpi(self.image_filename) - except: - dpi = 230 + if self.override_dpi: + return self.override_dpi + dpi = check_dpi(self.image_filename) self.logger.info("Detected %s DPI", dpi) img = self.imread() diff --git a/qurator/eynollah/utils/pil_cv2.py b/qurator/eynollah/utils/pil_cv2.py index d7cd18d..12c94c9 100644 --- a/qurator/eynollah/utils/pil_cv2.py +++ b/qurator/eynollah/utils/pil_cv2.py @@ -15,10 +15,12 @@ def pil2cv(img): return cvtColor(pil_as_np_array, color_conversion) def check_dpi(image_filename): - exif = OcrdExif(Image.open(image_filename)) - print(exif.to_xml()) - resolution = exif.resolution - if exif.resolutionUnit == 'cm': - resolution /= 2.54 - return int(resolution) + try: + exif = OcrdExif(Image.open(image_filename)) + resolution = exif.resolution + if exif.resolutionUnit == 'cm': + resolution /= 2.54 + return int(resolution) + except: + return 230 From 9db6edf51eb74e432542dbf101207c4700a682bd Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 13 Apr 2021 17:38:02 +0200 Subject: [PATCH 02/18] OCR-D CLI --- ocrd-tool.json | 1 + qurator/eynollah/eynollah.py | 7 ++-- qurator/eynollah/ocrd-tool.json | 54 ++++++++++++++++++++++++++++++ qurator/eynollah/ocrd_cli.py | 11 ++++++ qurator/eynollah/processor.py | 59 +++++++++++++++++++++++++++++++++ setup.py | 2 +- 6 files changed, 130 insertions(+), 4 deletions(-) create mode 120000 ocrd-tool.json create mode 100644 qurator/eynollah/ocrd-tool.json create mode 100644 qurator/eynollah/ocrd_cli.py create mode 100644 qurator/eynollah/processor.py diff --git a/ocrd-tool.json b/ocrd-tool.json new file mode 120000 index 0000000..5c48493 --- /dev/null +++ b/ocrd-tool.json @@ -0,0 +1 @@ +qurator/eynollah/ocrd-tool.json \ No newline at end of file diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 07812b2..d59fe89 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -80,9 +80,9 @@ class Eynollah: def __init__( self, image_filename, - image_filename_stem, - dir_out, dir_models, + image_filename_stem=None, + dir_out=None, dir_of_cropped_images=None, dir_of_layout=None, dir_of_deskewed=None, @@ -94,6 +94,7 @@ class Eynollah: allow_scaling=False, headers_off=False, override_dpi=None, + logger=None, ): self.image_filename = image_filename self.dir_out = dir_out @@ -117,7 +118,7 @@ class Eynollah: dir_out=self.dir_out, image_filename=self.image_filename, curved_line=self.curved_line) - self.logger = getLogger('eynollah') + self.logger = logger if logger else getLogger('eynollah') self.dir_models = dir_models self.model_dir_of_enhancement = dir_models + "/model_enhancement.h5" diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json new file mode 100644 index 0000000..12f067b --- /dev/null +++ b/qurator/eynollah/ocrd-tool.json @@ -0,0 +1,54 @@ +{ + "version": "0.0.1", + "git_url": "https://github.com/qurator-spk/eynollah", + "tools": { + "ocrd-eynollah-segment": { + "executable": "ocrd-eynollah-segment", + "categories": ["Layout analysis"], + "description": "Segment page into regions and lines and do reading order detection with eynollah", + "input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"], + "output_file_grp": ["OCR-D-SEG-LINE"], + "steps": ["layout/segmentation/region", "layout/segmentation/line"], + "parameters": { + "models": { + "type": "string", + "format": "file", + "cacheable": true, + "description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)", + "required": true + }, + "dpi": { + "type": "number", + "format": "float", + "description": "pixel density in dots per inch (overrides any meta-data in the images); disabled when <= 0", + "default": -1 + }, + "full_layout": { + "type": "boolean", + "default": true, + "description": "Try to detect all elements, including drop-caps and marginalia" + }, + "curved_line": { + "type": "boolean", + "default": false, + "description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectabgle bounding box of textline. This should be taken into account that with this option the tool need more time to do process." + }, + "allow_enhancement": { + "type": "boolean", + "default": true, + "description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not. If so output of resized and enhanced image and corresponding layout data will be written in out directory" + }, + "allow_scaling": { + "type": "boolean", + "default": false, + "description": "if this parameter set to true, this tool would check the scale and if needed it will scale it to perform better layout detection" + }, + "headers_off": { + "type": "boolean", + "default": false, + "description": "if this parameter set to true, this tool would ignore headers role in reading order" + } + } + } + } +} diff --git a/qurator/eynollah/ocrd_cli.py b/qurator/eynollah/ocrd_cli.py new file mode 100644 index 0000000..8929927 --- /dev/null +++ b/qurator/eynollah/ocrd_cli.py @@ -0,0 +1,11 @@ +from .processor import EynollahProcessor +from click import command +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor + +@command() +@ocrd_cli_options +def main(*args, **kwargs): + return ocrd_cli_wrap_processor(EynollahProcessor, *args, **kwargs) + +if __name__ == '__main__': + main() diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py new file mode 100644 index 0000000..07d7ab2 --- /dev/null +++ b/qurator/eynollah/processor.py @@ -0,0 +1,59 @@ +from json import loads +from pkg_resources import resource_string +from tempfile import NamedTemporaryFile +from os.path import join + +from ocrd import Processor +from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import to_xml +from ocrd_utils import ( + getLogger, + MIMETYPE_PAGE, + assert_file_grp_cardinality, + make_file_id +) + +from .eynollah import Eynollah + +OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) + +class EynollahProcessor(Processor): + + def __init__(self, *args, **kwargs): + kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] + kwargs['version'] = OCRD_TOOL['version'] + super().__init__(*args, **kwargs) + + def process(self): + LOG = getLogger('eynollah') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + for n, input_file in enumerate(self.input_files): + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %s / %s ", page_id, len(self.input_files)) + pcgts_in = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts_in) + page = pcgts_in.get_Page() + page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + file_id = make_file_id(input_file, self.output_file_grp) + with NamedTemporaryFile(buffering=0, suffix='.tif') as f: + page_image.save(f.name) + eynollah_kwargs = { + 'dir_models': self.resolve_resource(self.parameter['models']), + 'allow_enhancement': self.parameter['allow_enhancement'], + 'curved_line': self.parameter['curved_line'], + 'full_layout': self.parameter['full_layout'], + 'allow_scaling': self.parameter['allow_scaling'], + 'headers_off': self.parameter['headers_off'], + 'override_dpi': self.parameter['dpi'] if self.parameter['dpi'] > 0 else None, + 'logger': LOG, + 'image_filename': f.name} + pcgts_out = Eynollah(**eynollah_kwargs).run() + pcgts_out.get_Page().imageFilename = pcgts_in.get_Page().imageFilename + self.workspace.add_file( + ID=file_id, + file_grp=self.output_file_grp, + pageId=page_id, + mimetype=MIMETYPE_PAGE, + local_filename=join(self.output_file_grp, file_id) + '.xml', + content=to_xml(pcgts_out)) diff --git a/setup.py b/setup.py index 7988aee..c050ead 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( entry_points={ 'console_scripts': [ 'eynollah=qurator.eynollah.cli:main', - # 'ocrd-eynollah=qurator.eynollah.ocrd_cli:cli', + 'ocrd-eynollah-segment=qurator.eynollah.ocrd_cli:main', ] }, ) From 2bc34891a513606a34034e351d299b977a9b1660 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 13 Apr 2021 17:55:22 +0200 Subject: [PATCH 03/18] fix CLI call --- qurator/eynollah/cli.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/qurator/eynollah/cli.py b/qurator/eynollah/cli.py index 514853a..7cbd82f 100644 --- a/qurator/eynollah/cli.py +++ b/qurator/eynollah/cli.py @@ -117,20 +117,19 @@ def main( print("Error: You used -ep to enable plotting but set none of -sl, -sd, -sa or -si") sys.exit(1) eynollah = Eynollah( - image, - None, - out, - model, - save_images, - save_layout, - save_deskewed, - save_all, - enable_plotting, - allow_enhancement, - curved_line, - full_layout, - allow_scaling, - headers_off, + image_filename=image, + dir_out=out, + dir_models=model, + dir_of_cropped_images=save_images, + dir_of_layout=save_layout, + dir_of_deskewed=save_deskewed, + dir_of_all=save_all, + enable_plotting=enable_plotting, + allow_enhancement=allow_enhancement, + curved_line=curved_line, + full_layout=full_layout, + allow_scaling=allow_scaling, + headers_off=headers_off, ) pcgts = eynollah.run() eynollah.writer.write_pagexml(pcgts) From 8c4e9b6068b522e52cb722c65f7a1c548b0b995b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 14 Apr 2021 10:38:07 +0200 Subject: [PATCH 04/18] allow passing pcgts to eynollah and writer --- qurator/eynollah/eynollah.py | 4 +++- qurator/eynollah/processor.py | 14 +++++++------- qurator/eynollah/writer.py | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index d59fe89..c6b4096 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -95,6 +95,7 @@ class Eynollah: headers_off=False, override_dpi=None, logger=None, + pcgts=None, ): self.image_filename = image_filename self.dir_out = dir_out @@ -117,7 +118,8 @@ class Eynollah: self.writer = EynollahXmlWriter( dir_out=self.dir_out, image_filename=self.image_filename, - curved_line=self.curved_line) + curved_line=self.curved_line, + pcgts=pcgts) self.logger = logger if logger else getLogger('eynollah') self.dir_models = dir_models diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 07d7ab2..68da037 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -30,10 +30,10 @@ class EynollahProcessor(Processor): assert_file_grp_cardinality(self.output_file_grp, 1) for n, input_file in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %s / %s ", page_id, len(self.input_files)) - pcgts_in = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts_in) - page = pcgts_in.get_Page() + LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) + pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) + page = pcgts.get_Page() page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') file_id = make_file_id(input_file, self.output_file_grp) with NamedTemporaryFile(buffering=0, suffix='.tif') as f: @@ -47,13 +47,13 @@ class EynollahProcessor(Processor): 'headers_off': self.parameter['headers_off'], 'override_dpi': self.parameter['dpi'] if self.parameter['dpi'] > 0 else None, 'logger': LOG, + 'pcgts': pcgts, 'image_filename': f.name} - pcgts_out = Eynollah(**eynollah_kwargs).run() - pcgts_out.get_Page().imageFilename = pcgts_in.get_Page().imageFilename + Eynollah(**eynollah_kwargs).run() self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp, pageId=page_id, mimetype=MIMETYPE_PAGE, local_filename=join(self.output_file_grp, file_id) + '.xml', - content=to_xml(pcgts_out)) + content=to_xml(pcgts)) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 7ddfea5..5b42043 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -141,7 +141,7 @@ class EynollahXmlWriter(): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure - pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org) + pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) page = pcgts.get_Page() page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) @@ -181,7 +181,7 @@ class EynollahXmlWriter(): self.logger.debug('enter build_pagexml_full_layout') # create the file structure - pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org) + pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) page = pcgts.get_Page() page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) From b8d818ede1b7dd3e304c49d50667db7b458ce8c2 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 14 Apr 2021 10:53:32 +0200 Subject: [PATCH 05/18] writer: don't create empty PcGts at init --- qurator/eynollah/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 5b42043..317ac4e 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -30,7 +30,7 @@ class EynollahXmlWriter(): self.image_filename = image_filename self.image_filename_stem = Path(Path(image_filename).name).stem self.curved_line = curved_line - self.pcgts = pcgts if pcgts else PcGtsType() + self.pcgts = pcgts self.scale_x = None # XXX set outside __init__ self.scale_y = None # XXX set outside __init__ self.height_org = None # XXX set outside __init__ From 5e260eb4487880834b7f3bd4dcb46081661d44e3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 14 Apr 2021 11:55:44 +0200 Subject: [PATCH 06/18] setup.py: include json data --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index c050ead..a54bb58 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,9 @@ setup( namespace_packages=['qurator'], packages=find_packages(exclude=['tests']), install_requires=install_requires, + package_data={ + '': ['*.json'] + }, entry_points={ 'console_scripts': [ 'eynollah=qurator.eynollah.cli:main', From 8f7cf5d1fb6c41979861810b49395f93cdbea23f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 14 Apr 2021 11:58:12 +0200 Subject: [PATCH 07/18] setup.py: include json data --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index c050ead..a54bb58 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,9 @@ setup( namespace_packages=['qurator'], packages=find_packages(exclude=['tests']), install_requires=install_requires, + package_data={ + '': ['*.json'] + }, entry_points={ 'console_scripts': [ 'eynollah=qurator.eynollah.cli:main', From 037210b292aefaea0d26b40fce03d609992457df Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 14 Apr 2021 08:55:54 -0400 Subject: [PATCH 08/18] update writer.py --- qurator/eynollah/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 317ac4e..7069785 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -87,7 +87,7 @@ class EynollahXmlWriter(): points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) - points += ' ' + points_co += ' ' coords.set_points(points_co[:-1]) def serialize_lines_in_region(self, text_region, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, counter): From 1367f82605a01c2ca8afcfaf1c8c391fe4af01d3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 14 Apr 2021 17:31:57 +0200 Subject: [PATCH 09/18] improve ocrd-tool descriptions Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- qurator/eynollah/ocrd-tool.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json index 12f067b..76d4b7c 100644 --- a/qurator/eynollah/ocrd-tool.json +++ b/qurator/eynollah/ocrd-tool.json @@ -20,23 +20,23 @@ "dpi": { "type": "number", "format": "float", - "description": "pixel density in dots per inch (overrides any meta-data in the images); disabled when <= 0", - "default": -1 + "description": "pixel density in dots per inch (overrides any meta-data in the images); ignored if <= 0 (with fall-back 230)", + "default": 0 }, "full_layout": { "type": "boolean", "default": true, - "description": "Try to detect all elements, including drop-caps and marginalia" + "description": "Try to detect all element subtypes, including drop-caps and headings" }, "curved_line": { "type": "boolean", "default": false, - "description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectabgle bounding box of textline. This should be taken into account that with this option the tool need more time to do process." + "description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time" }, "allow_enhancement": { "type": "boolean", "default": true, - "description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not. If so output of resized and enhanced image and corresponding layout data will be written in out directory" + "description": "if the input image has less than 300 DPI, then upscale and enhance" }, "allow_scaling": { "type": "boolean", @@ -46,7 +46,7 @@ "headers_off": { "type": "boolean", "default": false, - "description": "if this parameter set to true, this tool would ignore headers role in reading order" + "description": "ignore the special role of headings during reading order detection" } } } From d40c453dadc2b2f583ea2be04789075be17e8d3f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 14 Apr 2021 17:42:37 +0200 Subject: [PATCH 10/18] check_dpi: raise exception if resolution == 1 to trigger except clause Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- qurator/eynollah/utils/pil_cv2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/qurator/eynollah/utils/pil_cv2.py b/qurator/eynollah/utils/pil_cv2.py index 12c94c9..b10ceb7 100644 --- a/qurator/eynollah/utils/pil_cv2.py +++ b/qurator/eynollah/utils/pil_cv2.py @@ -18,9 +18,10 @@ def check_dpi(image_filename): try: exif = OcrdExif(Image.open(image_filename)) resolution = exif.resolution + if resolution == 1: + raise Exception() if exif.resolutionUnit == 'cm': resolution /= 2.54 return int(resolution) except: return 230 - From 4897cefdb70769128693d00b58f3148b5702eb71 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Apr 2021 17:25:05 +0200 Subject: [PATCH 11/18] allow passing PIL image to Eynollah w/o disk I/O --- qurator/eynollah/eynollah.py | 38 +++++++++++++++------------- qurator/eynollah/plot.py | 2 -- qurator/eynollah/processor.py | 42 +++++++++++++++---------------- qurator/eynollah/utils/pil_cv2.py | 9 ++++--- qurator/eynollah/writer.py | 5 +++- tests/test_dpi.py | 5 ++-- 6 files changed, 53 insertions(+), 48 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index c6b4096..755895c 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -65,7 +65,7 @@ from .utils import ( order_of_regions, find_number_of_columns_in_document, return_boxes_of_images_by_order_of_reading_new) -from .utils.pil_cv2 import check_dpi +from .utils.pil_cv2 import check_dpi, pil2cv from .utils.xml import order_and_id_of_texts from .plot import EynollahPlotter from .writer import EynollahXmlWriter @@ -79,8 +79,9 @@ KERNEL = np.ones((5, 5), np.uint8) class Eynollah: def __init__( self, - image_filename, dir_models, + image_filename, + image_pil=None, image_filename_stem=None, dir_out=None, dir_of_cropped_images=None, @@ -97,24 +98,24 @@ class Eynollah: logger=None, pcgts=None, ): + if image_pil: + self._imgs = self._cache_images(image_pil=image_pil) + else: + self._imgs = self._cache_images(image_filename=image_filename) self.image_filename = image_filename self.dir_out = dir_out - self.image_filename_stem = image_filename_stem self.allow_enhancement = allow_enhancement self.curved_line = curved_line self.full_layout = full_layout self.allow_scaling = allow_scaling self.headers_off = headers_off self.override_dpi = override_dpi - if not self.image_filename_stem: - self.image_filename_stem = Path(Path(image_filename).name).stem self.plotter = None if not enable_plotting else EynollahPlotter( dir_of_all=dir_of_all, dir_of_deskewed=dir_of_deskewed, dir_of_cropped_images=dir_of_cropped_images, dir_of_layout=dir_of_layout, - image_filename=image_filename, - image_filename_stem=self.image_filename_stem) + image_filename_stem=Path(Path(image_filename).name).stem) self.writer = EynollahXmlWriter( dir_out=self.dir_out, image_filename=self.image_filename, @@ -133,7 +134,16 @@ class Eynollah: self.model_region_dir_p_ens = dir_models + "/model_ensemble_s.h5" self.model_textline_dir = dir_models + "/model_textline_newspapers.h5" - self._imgs = {} + def _cache_images(self, image_filename=None, image_pil=None): + ret = {} + if image_filename: + ret['img'] = cv2.imread(image_filename) + else: + ret['img'] = pil2cv(image_pil) + ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY) + for prefix in ('', '_grayscale'): + ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8) + return ret def imread(self, grayscale=False, uint8=True): key = 'img' @@ -141,16 +151,9 @@ class Eynollah: key += '_grayscale' if uint8: key += '_uint8' - if key not in self._imgs: - if grayscale: - img = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) - else: - img = cv2.imread(self.image_filename) - if uint8: - img = img.astype(np.uint8) - self._imgs[key] = img return self._imgs[key].copy() + def predict_enhancement(self, img): self.logger.debug("enter predict_enhancement") model_enhancement, session_enhancement = self.start_new_session_and_model(self.model_dir_of_enhancement) @@ -353,7 +356,7 @@ class Eynollah: self.logger.debug("enter resize_and_enhance_image_with_column_classifier") if self.override_dpi: return self.override_dpi - dpi = check_dpi(self.image_filename) + dpi = check_dpi(self.imread()) self.logger.info("Detected %s DPI", dpi) img = self.imread() @@ -1450,7 +1453,6 @@ class Eynollah: scale = 1 if is_image_enhanced: if self.allow_enhancement: - cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif", img_res) img_res = img_res.astype(np.uint8) self.get_image_and_scales(img_org, img_res, scale) else: diff --git a/qurator/eynollah/plot.py b/qurator/eynollah/plot.py index a2cf4e2..18a7c14 100644 --- a/qurator/eynollah/plot.py +++ b/qurator/eynollah/plot.py @@ -21,7 +21,6 @@ class EynollahPlotter(): dir_of_deskewed, dir_of_layout, dir_of_cropped_images, - image_filename, image_filename_stem, image_org=None, scale_x=1, @@ -31,7 +30,6 @@ class EynollahPlotter(): self.dir_of_layout = dir_of_layout self.dir_of_cropped_images = dir_of_cropped_images self.dir_of_deskewed = dir_of_deskewed - self.image_filename = image_filename self.image_filename_stem = image_filename_stem # XXX TODO hacky these cannot be set at init time self.image_org = image_org diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 68da037..cfebe72 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -14,6 +14,7 @@ from ocrd_utils import ( ) from .eynollah import Eynollah +from .utils.pil_cv2 import pil2cv OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) @@ -35,25 +36,24 @@ class EynollahProcessor(Processor): self.add_metadata(pcgts) page = pcgts.get_Page() page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + eynollah_kwargs = { + 'dir_models': self.resolve_resource(self.parameter['models']), + 'allow_enhancement': self.parameter['allow_enhancement'], + 'curved_line': self.parameter['curved_line'], + 'full_layout': self.parameter['full_layout'], + 'allow_scaling': self.parameter['allow_scaling'], + 'headers_off': self.parameter['headers_off'], + 'override_dpi': self.parameter['dpi'] if self.parameter['dpi'] > 0 else None, + 'logger': LOG, + 'pcgts': pcgts, + 'image_pil': page_image, + 'image_filename': None} + Eynollah(**eynollah_kwargs).run() file_id = make_file_id(input_file, self.output_file_grp) - with NamedTemporaryFile(buffering=0, suffix='.tif') as f: - page_image.save(f.name) - eynollah_kwargs = { - 'dir_models': self.resolve_resource(self.parameter['models']), - 'allow_enhancement': self.parameter['allow_enhancement'], - 'curved_line': self.parameter['curved_line'], - 'full_layout': self.parameter['full_layout'], - 'allow_scaling': self.parameter['allow_scaling'], - 'headers_off': self.parameter['headers_off'], - 'override_dpi': self.parameter['dpi'] if self.parameter['dpi'] > 0 else None, - 'logger': LOG, - 'pcgts': pcgts, - 'image_filename': f.name} - Eynollah(**eynollah_kwargs).run() - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=page_id, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, file_id) + '.xml', - content=to_xml(pcgts)) + self.workspace.add_file( + ID=file_id, + file_grp=self.output_file_grp, + pageId=page_id, + mimetype=MIMETYPE_PAGE, + local_filename=join(self.output_file_grp, file_id) + '.xml', + content=to_xml(pcgts)) diff --git a/qurator/eynollah/utils/pil_cv2.py b/qurator/eynollah/utils/pil_cv2.py index b10ceb7..4d35b7a 100644 --- a/qurator/eynollah/utils/pil_cv2.py +++ b/qurator/eynollah/utils/pil_cv2.py @@ -6,7 +6,7 @@ from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor, imread # from sbb_binarization def cv2pil(img): - return Image.fromarray(img.astype('uint8')) + return Image.fromarray(img) def pil2cv(img): # from ocrd/workspace.py @@ -14,14 +14,15 @@ def pil2cv(img): pil_as_np_array = np.array(img).astype('uint8') if img.mode == '1' else np.array(img) return cvtColor(pil_as_np_array, color_conversion) -def check_dpi(image_filename): +def check_dpi(img): try: - exif = OcrdExif(Image.open(image_filename)) + exif = OcrdExif(cv2pil(img)) resolution = exif.resolution if resolution == 1: raise Exception() if exif.resolutionUnit == 'cm': resolution /= 2.54 return int(resolution) - except: + except Exception as e: + print(e) return 230 diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 7069785..d9a9239 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -28,7 +28,6 @@ class EynollahXmlWriter(): self.counter = EynollahIdCounter() self.dir_out = dir_out self.image_filename = image_filename - self.image_filename_stem = Path(Path(image_filename).name).stem self.curved_line = curved_line self.pcgts = pcgts self.scale_x = None # XXX set outside __init__ @@ -36,6 +35,10 @@ class EynollahXmlWriter(): self.height_org = None # XXX set outside __init__ self.width_org = None # XXX set outside __init__ + @property + def image_filename_stem(self): + return Path(Path(self.image_filename).name).stem + def calculate_page_coords(self, cont_page): self.logger.debug('enter calculate_page_coords') points_page_print = "" diff --git a/tests/test_dpi.py b/tests/test_dpi.py index 380928d..510ffc5 100644 --- a/tests/test_dpi.py +++ b/tests/test_dpi.py @@ -1,10 +1,11 @@ +import cv2 from pathlib import Path from qurator.eynollah.utils.pil_cv2 import check_dpi from tests.base import main def test_dpi(): - fpath = Path(__file__).parent.joinpath('resources', 'kant_aufklaerung_1784_0020.tif') - assert 300 == check_dpi(str(fpath)) + fpath = str(Path(__file__).parent.joinpath('resources', 'kant_aufklaerung_1784_0020.tif')) + assert 230 == check_dpi(cv2.imread(fpath)) if __name__ == '__main__': main(__file__) From 42ccb4711d628294f45c5f501a4126a3445942f9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 21 Apr 2021 10:55:28 +0200 Subject: [PATCH 12/18] Update qurator/eynollah/ocrd-tool.json Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- qurator/eynollah/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json index 76d4b7c..01d48fa 100644 --- a/qurator/eynollah/ocrd-tool.json +++ b/qurator/eynollah/ocrd-tool.json @@ -41,7 +41,7 @@ "allow_scaling": { "type": "boolean", "default": false, - "description": "if this parameter set to true, this tool would check the scale and if needed it will scale it to perform better layout detection" + "description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)" }, "headers_off": { "type": "boolean", From 2e8a3e3bee5bb926121b5bba630270c8044f5fe4 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 21 Apr 2021 18:30:48 +0200 Subject: [PATCH 13/18] use Page.imageFilename directly for accurate DPI estimate --- qurator/eynollah/eynollah.py | 4 +--- qurator/eynollah/processor.py | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 755895c..197afe9 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -354,9 +354,7 @@ class Eynollah: def resize_and_enhance_image_with_column_classifier(self): self.logger.debug("enter resize_and_enhance_image_with_column_classifier") - if self.override_dpi: - return self.override_dpi - dpi = check_dpi(self.imread()) + dpi = self.override_dpi if self.override_dpi else check_dpi(self.imread()) self.logger.info("Detected %s DPI", dpi) img = self.imread() diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index cfebe72..2fcc08d 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -1,10 +1,14 @@ from json import loads from pkg_resources import resource_string from tempfile import NamedTemporaryFile +from pathlib import Path from os.path import join +from PIL import Image + from ocrd import Processor -from ocrd_modelfactory import page_from_file +from ocrd_modelfactory import page_from_file, exif_from_filename +from ocrd_models import OcrdFile, OcrdExif from ocrd_models.ocrd_page import to_xml from ocrd_utils import ( getLogger, @@ -35,7 +39,15 @@ class EynollahProcessor(Processor): pcgts = page_from_file(self.workspace.download_file(input_file)) self.add_metadata(pcgts) page = pcgts.get_Page() - page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + # XXX loses DPI information + # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))) + if self.parameter['dpi'] <= 0: + exif = exif_from_filename(page.imageFilename) + dpi = exif.resolution + if exif.resolutionUnit == 'cm': + dpi /= 2.54 + self.parameter['dpi'] = dpi eynollah_kwargs = { 'dir_models': self.resolve_resource(self.parameter['models']), 'allow_enhancement': self.parameter['allow_enhancement'], @@ -43,11 +55,11 @@ class EynollahProcessor(Processor): 'full_layout': self.parameter['full_layout'], 'allow_scaling': self.parameter['allow_scaling'], 'headers_off': self.parameter['headers_off'], - 'override_dpi': self.parameter['dpi'] if self.parameter['dpi'] > 0 else None, + 'override_dpi': self.parameter['dpi'], 'logger': LOG, 'pcgts': pcgts, - 'image_pil': page_image, - 'image_filename': None} + 'image_filename': page.imageFilename + } Eynollah(**eynollah_kwargs).run() file_id = make_file_id(input_file, self.output_file_grp) self.workspace.add_file( From ae0b4a825a75132708651a6205e3d2e5fbcdf48e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 22 Apr 2021 10:28:01 +0200 Subject: [PATCH 14/18] ocrd cli: catch dpi == 1, return 230 --- qurator/eynollah/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 2fcc08d..a9261f3 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -47,7 +47,7 @@ class EynollahProcessor(Processor): dpi = exif.resolution if exif.resolutionUnit == 'cm': dpi /= 2.54 - self.parameter['dpi'] = dpi + self.parameter['dpi'] = dpi if dpi != 1 else 230 eynollah_kwargs = { 'dir_models': self.resolve_resource(self.parameter['models']), 'allow_enhancement': self.parameter['allow_enhancement'], From d0b0e23ac64d0dbdb1352731ddf2ff14828e3b84 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 22 Apr 2021 12:07:14 +0200 Subject: [PATCH 15/18] do DPI calculation as part of caching images --- qurator/eynollah/eynollah.py | 7 +++++-- qurator/eynollah/processor.py | 6 ------ qurator/eynollah/utils/pil_cv2.py | 8 +++++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 197afe9..780371a 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -102,6 +102,8 @@ class Eynollah: self._imgs = self._cache_images(image_pil=image_pil) else: self._imgs = self._cache_images(image_filename=image_filename) + if override_dpi: + self.dpi = override_dpi self.image_filename = image_filename self.dir_out = dir_out self.allow_enhancement = allow_enhancement @@ -109,7 +111,6 @@ class Eynollah: self.full_layout = full_layout self.allow_scaling = allow_scaling self.headers_off = headers_off - self.override_dpi = override_dpi self.plotter = None if not enable_plotting else EynollahPlotter( dir_of_all=dir_of_all, dir_of_deskewed=dir_of_deskewed, @@ -138,8 +139,10 @@ class Eynollah: ret = {} if image_filename: ret['img'] = cv2.imread(image_filename) + self.dpi = check_dpi(image_filename) else: ret['img'] = pil2cv(image_pil) + self.dpi = check_dpi(image_pil) ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY) for prefix in ('', '_grayscale'): ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8) @@ -354,7 +357,7 @@ class Eynollah: def resize_and_enhance_image_with_column_classifier(self): self.logger.debug("enter resize_and_enhance_image_with_column_classifier") - dpi = self.override_dpi if self.override_dpi else check_dpi(self.imread()) + dpi = self.dpi self.logger.info("Detected %s DPI", dpi) img = self.imread() diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index a9261f3..58d0940 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -42,12 +42,6 @@ class EynollahProcessor(Processor): # XXX loses DPI information # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))) - if self.parameter['dpi'] <= 0: - exif = exif_from_filename(page.imageFilename) - dpi = exif.resolution - if exif.resolutionUnit == 'cm': - dpi /= 2.54 - self.parameter['dpi'] = dpi if dpi != 1 else 230 eynollah_kwargs = { 'dir_models': self.resolve_resource(self.parameter['models']), 'allow_enhancement': self.parameter['allow_enhancement'], diff --git a/qurator/eynollah/utils/pil_cv2.py b/qurator/eynollah/utils/pil_cv2.py index 4d35b7a..d6eb0f3 100644 --- a/qurator/eynollah/utils/pil_cv2.py +++ b/qurator/eynollah/utils/pil_cv2.py @@ -15,8 +15,14 @@ def pil2cv(img): return cvtColor(pil_as_np_array, color_conversion) def check_dpi(img): + if isinstance(img, Image.__class__): + pil_image = img + elif isinstance(img, str): + pil_image = Image.open(img) + else: + pil_image = cv2pil(img) try: - exif = OcrdExif(cv2pil(img)) + exif = OcrdExif(pil_image) resolution = exif.resolution if resolution == 1: raise Exception() From c7f304dcb67c82faacbad49033559e19a154e057 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 22 Apr 2021 12:31:00 +0200 Subject: [PATCH 16/18] ocrd processor: pass local filename as image_filename, ht @bertsky --- qurator/eynollah/processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 58d0940..703a4d2 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -41,7 +41,7 @@ class EynollahProcessor(Processor): page = pcgts.get_Page() # XXX loses DPI information # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))) + image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename eynollah_kwargs = { 'dir_models': self.resolve_resource(self.parameter['models']), 'allow_enhancement': self.parameter['allow_enhancement'], @@ -52,7 +52,7 @@ class EynollahProcessor(Processor): 'override_dpi': self.parameter['dpi'], 'logger': LOG, 'pcgts': pcgts, - 'image_filename': page.imageFilename + 'image_filename': image_filename } Eynollah(**eynollah_kwargs).run() file_id = make_file_id(input_file, self.output_file_grp) From ff265eee5c52647299c5e1549430ed93af6c56a9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 22 Apr 2021 12:57:04 +0200 Subject: [PATCH 17/18] cv2pil: do COLOR_BGR2RGB conversion --- qurator/eynollah/utils/pil_cv2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qurator/eynollah/utils/pil_cv2.py b/qurator/eynollah/utils/pil_cv2.py index d6eb0f3..4d180f5 100644 --- a/qurator/eynollah/utils/pil_cv2.py +++ b/qurator/eynollah/utils/pil_cv2.py @@ -1,12 +1,12 @@ from PIL import Image import numpy as np from ocrd_models import OcrdExif -from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor, imread +from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, COLOR_BGR2RGB, cvtColor, imread # from sbb_binarization def cv2pil(img): - return Image.fromarray(img) + return Image.fromarray(np.array(cvtColor(img, COLOR_BGR2RGB))) def pil2cv(img): # from ocrd/workspace.py From 6c8852eb04423317bc3793d6663f6c491a56b5d7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 22 Apr 2021 13:12:40 +0200 Subject: [PATCH 18/18] check_dpi: catch Pillow choking on faulty img, return 230 --- qurator/eynollah/utils/pil_cv2.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/qurator/eynollah/utils/pil_cv2.py b/qurator/eynollah/utils/pil_cv2.py index 4d180f5..20dc22f 100644 --- a/qurator/eynollah/utils/pil_cv2.py +++ b/qurator/eynollah/utils/pil_cv2.py @@ -15,13 +15,13 @@ def pil2cv(img): return cvtColor(pil_as_np_array, color_conversion) def check_dpi(img): - if isinstance(img, Image.__class__): - pil_image = img - elif isinstance(img, str): - pil_image = Image.open(img) - else: - pil_image = cv2pil(img) try: + if isinstance(img, Image.__class__): + pil_image = img + elif isinstance(img, str): + pil_image = Image.open(img) + else: + pil_image = cv2pil(img) exif = OcrdExif(pil_image) resolution = exif.resolution if resolution == 1: