diff --git a/ocrd-tool.json b/ocrd-tool.json new file mode 120000 index 0000000..5c48493 --- /dev/null +++ b/ocrd-tool.json @@ -0,0 +1 @@ +qurator/eynollah/ocrd-tool.json \ No newline at end of file diff --git a/qurator/eynollah/cli.py b/qurator/eynollah/cli.py index 514853a..7cbd82f 100644 --- a/qurator/eynollah/cli.py +++ b/qurator/eynollah/cli.py @@ -117,20 +117,19 @@ def main( print("Error: You used -ep to enable plotting but set none of -sl, -sd, -sa or -si") sys.exit(1) eynollah = Eynollah( - image, - None, - out, - model, - save_images, - save_layout, - save_deskewed, - save_all, - enable_plotting, - allow_enhancement, - curved_line, - full_layout, - allow_scaling, - headers_off, + image_filename=image, + dir_out=out, + dir_models=model, + dir_of_cropped_images=save_images, + dir_of_layout=save_layout, + dir_of_deskewed=save_deskewed, + dir_of_all=save_all, + enable_plotting=enable_plotting, + allow_enhancement=allow_enhancement, + curved_line=curved_line, + full_layout=full_layout, + allow_scaling=allow_scaling, + headers_off=headers_off, ) pcgts = eynollah.run() eynollah.writer.write_pagexml(pcgts) diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 0b476f6..167336f 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -65,7 +65,7 @@ from .utils import ( order_of_regions, find_number_of_columns_in_document, return_boxes_of_images_by_order_of_reading_new) -from .utils.pil_cv2 import check_dpi +from .utils.pil_cv2 import check_dpi, pil2cv from .utils.xml import order_and_id_of_texts from .plot import EynollahPlotter from .writer import EynollahXmlWriter @@ -79,10 +79,11 @@ KERNEL = np.ones((5, 5), np.uint8) class Eynollah: def __init__( self, - image_filename, - image_filename_stem, - dir_out, dir_models, + image_filename, + image_pil=None, + image_filename_stem=None, + dir_out=None, dir_of_cropped_images=None, dir_of_layout=None, dir_of_deskewed=None, @@ -92,30 +93,36 @@ class Eynollah: curved_line=False, 
full_layout=False, allow_scaling=False, - headers_off=False + headers_off=False, + override_dpi=None, + logger=None, + pcgts=None, ): + if image_pil: + self._imgs = self._cache_images(image_pil=image_pil) + else: + self._imgs = self._cache_images(image_filename=image_filename) + if override_dpi and override_dpi > 0: + self.dpi = override_dpi self.image_filename = image_filename self.dir_out = dir_out - self.image_filename_stem = image_filename_stem self.allow_enhancement = allow_enhancement self.curved_line = curved_line self.full_layout = full_layout self.allow_scaling = allow_scaling self.headers_off = headers_off - if not self.image_filename_stem: - self.image_filename_stem = Path(Path(image_filename).name).stem self.plotter = None if not enable_plotting else EynollahPlotter( dir_of_all=dir_of_all, dir_of_deskewed=dir_of_deskewed, dir_of_cropped_images=dir_of_cropped_images, dir_of_layout=dir_of_layout, - image_filename=image_filename, - image_filename_stem=self.image_filename_stem) + image_filename_stem=Path(Path(image_filename).name).stem) self.writer = EynollahXmlWriter( dir_out=self.dir_out, image_filename=self.image_filename, curved_line=self.curved_line) - self.logger = getLogger('eynollah') + curved_line=self.curved_line, + pcgts=pcgts) + self.logger = logger if logger else getLogger('eynollah') self.dir_models = dir_models self.model_dir_of_enhancement = dir_models + "/model_enhancement.h5" @@ -128,7 +135,18 @@ class Eynollah: self.model_region_dir_p_ens = dir_models + "/model_ensemble_s.h5" self.model_textline_dir = dir_models + "/model_textline_newspapers.h5" - self._imgs = {} + def _cache_images(self, image_filename=None, image_pil=None): + ret = {} + if image_filename: + ret['img'] = cv2.imread(image_filename) + self.dpi = check_dpi(image_filename) + else: + ret['img'] = pil2cv(image_pil) + self.dpi = check_dpi(image_pil) + ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY) + for prefix in ('', '_grayscale'): + ret[f'img{prefix}_uint8'] =
ret[f'img{prefix}'].astype(np.uint8) + return ret def imread(self, grayscale=False, uint8=True): key = 'img' @@ -136,16 +154,9 @@ class Eynollah: key += '_grayscale' if uint8: key += '_uint8' - if key not in self._imgs: - if grayscale: - img = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) - else: - img = cv2.imread(self.image_filename) - if uint8: - img = img.astype(np.uint8) - self._imgs[key] = img return self._imgs[key].copy() + def predict_enhancement(self, img): self.logger.debug("enter predict_enhancement") model_enhancement, session_enhancement = self.start_new_session_and_model(self.model_dir_of_enhancement) @@ -346,10 +357,7 @@ class Eynollah: def resize_and_enhance_image_with_column_classifier(self): self.logger.debug("enter resize_and_enhance_image_with_column_classifier") - try: - dpi = check_dpi(self.image_filename) - except: - dpi = 230 + dpi = self.dpi self.logger.info("Detected %s DPI", dpi) img = self.imread() @@ -1503,7 +1511,6 @@ class Eynollah: scale = 1 if is_image_enhanced: if self.allow_enhancement: - cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif", img_res) img_res = img_res.astype(np.uint8) self.get_image_and_scales(img_org, img_res, scale) else: diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json new file mode 100644 index 0000000..01d48fa --- /dev/null +++ b/qurator/eynollah/ocrd-tool.json @@ -0,0 +1,54 @@ +{ + "version": "0.0.1", + "git_url": "https://github.com/qurator-spk/eynollah", + "tools": { + "ocrd-eynollah-segment": { + "executable": "ocrd-eynollah-segment", + "categories": ["Layout analysis"], + "description": "Segment page into regions and lines and do reading order detection with eynollah", + "input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"], + "output_file_grp": ["OCR-D-SEG-LINE"], + "steps": ["layout/segmentation/region", "layout/segmentation/line"], + "parameters": { + "models": { + "type": "string", + "format": "file", + "cacheable": true, + 
"description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)", + "required": true + }, + "dpi": { + "type": "number", + "format": "float", + "description": "pixel density in dots per inch (overrides any meta-data in the images); ignored if <= 0 (with fall-back 230)", + "default": 0 + }, + "full_layout": { + "type": "boolean", + "default": true, + "description": "Try to detect all element subtypes, including drop-caps and headings" + }, + "curved_line": { + "type": "boolean", + "default": false, + "description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time" + }, + "allow_enhancement": { + "type": "boolean", + "default": true, + "description": "if the input image has less than 300 DPI, then upscale and enhance" + }, + "allow_scaling": { + "type": "boolean", + "default": false, + "description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)" + }, + "headers_off": { + "type": "boolean", + "default": false, + "description": "ignore the special role of headings during reading order detection" + } + } + } + } +} diff --git a/qurator/eynollah/ocrd_cli.py b/qurator/eynollah/ocrd_cli.py new file mode 100644 index 0000000..8929927 --- /dev/null +++ b/qurator/eynollah/ocrd_cli.py @@ -0,0 +1,11 @@ +from .processor import EynollahProcessor +from click import command +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor + +@command() +@ocrd_cli_options +def main(*args, **kwargs): + return ocrd_cli_wrap_processor(EynollahProcessor, *args, **kwargs) + +if __name__ == '__main__': + main() diff --git a/qurator/eynollah/plot.py b/qurator/eynollah/plot.py index a2cf4e2..18a7c14 100644 --- a/qurator/eynollah/plot.py +++ b/qurator/eynollah/plot.py @@ -21,7 +21,6 @@ class EynollahPlotter(): dir_of_deskewed, dir_of_layout, dir_of_cropped_images, - 
image_filename, image_filename_stem, image_org=None, scale_x=1, @@ -31,7 +30,6 @@ class EynollahPlotter(): self.dir_of_layout = dir_of_layout self.dir_of_cropped_images = dir_of_cropped_images self.dir_of_deskewed = dir_of_deskewed - self.image_filename = image_filename self.image_filename_stem = image_filename_stem # XXX TODO hacky these cannot be set at init time self.image_org = image_org diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py new file mode 100644 index 0000000..703a4d2 --- /dev/null +++ b/qurator/eynollah/processor.py @@ -0,0 +1,65 @@ +from json import loads +from pkg_resources import resource_string +from tempfile import NamedTemporaryFile +from pathlib import Path +from os.path import join + +from PIL import Image + +from ocrd import Processor +from ocrd_modelfactory import page_from_file, exif_from_filename +from ocrd_models import OcrdFile, OcrdExif +from ocrd_models.ocrd_page import to_xml +from ocrd_utils import ( + getLogger, + MIMETYPE_PAGE, + assert_file_grp_cardinality, + make_file_id +) + +from .eynollah import Eynollah +from .utils.pil_cv2 import pil2cv + +OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) + +class EynollahProcessor(Processor): + + def __init__(self, *args, **kwargs): + kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] + kwargs['version'] = OCRD_TOOL['version'] + super().__init__(*args, **kwargs) + + def process(self): + LOG = getLogger('eynollah') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + for n, input_file in enumerate(self.input_files): + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) + pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) + page = pcgts.get_Page() + # XXX loses DPI information + # page_image, _, _ = self.workspace.image_from_page(page, page_id, 
feature_filter='binarized') + image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename + eynollah_kwargs = { + 'dir_models': self.resolve_resource(self.parameter['models']), + 'allow_enhancement': self.parameter['allow_enhancement'], + 'curved_line': self.parameter['curved_line'], + 'full_layout': self.parameter['full_layout'], + 'allow_scaling': self.parameter['allow_scaling'], + 'headers_off': self.parameter['headers_off'], + 'override_dpi': self.parameter['dpi'], + 'logger': LOG, + 'pcgts': pcgts, + 'image_filename': image_filename + } + Eynollah(**eynollah_kwargs).run() + file_id = make_file_id(input_file, self.output_file_grp) + self.workspace.add_file( + ID=file_id, + file_grp=self.output_file_grp, + pageId=page_id, + mimetype=MIMETYPE_PAGE, + local_filename=join(self.output_file_grp, file_id) + '.xml', + content=to_xml(pcgts)) diff --git a/qurator/eynollah/utils/pil_cv2.py b/qurator/eynollah/utils/pil_cv2.py index d7cd18d..20dc22f 100644 --- a/qurator/eynollah/utils/pil_cv2.py +++ b/qurator/eynollah/utils/pil_cv2.py @@ -1,12 +1,12 @@ from PIL import Image import numpy as np from ocrd_models import OcrdExif -from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor, imread +from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, COLOR_BGR2RGB, cvtColor, imread # from sbb_binarization def cv2pil(img): - return Image.fromarray(img.astype('uint8')) + return Image.fromarray(np.array(cvtColor(img, COLOR_BGR2RGB))) def pil2cv(img): # from ocrd/workspace.py @@ -14,11 +14,21 @@ def pil2cv(img): pil_as_np_array = np.array(img).astype('uint8') if img.mode == '1' else np.array(img) return cvtColor(pil_as_np_array, color_conversion) -def check_dpi(image_filename): - exif = OcrdExif(Image.open(image_filename)) - print(exif.to_xml()) - resolution = exif.resolution - if exif.resolutionUnit == 'cm': - resolution /= 2.54 - return int(resolution) - +def check_dpi(img): + try: + if isinstance(img, Image.Image): +
pil_image = img + elif isinstance(img, str): + pil_image = Image.open(img) + else: + pil_image = cv2pil(img) + exif = OcrdExif(pil_image) + resolution = exif.resolution + if resolution == 1: + raise Exception() + if exif.resolutionUnit == 'cm': + resolution /= 2.54 + return int(resolution) + except Exception as e: + print(e) + return 230 diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index d5d6a13..d9a9239 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -28,14 +28,17 @@ class EynollahXmlWriter(): self.counter = EynollahIdCounter() self.dir_out = dir_out self.image_filename = image_filename - self.image_filename_stem = Path(Path(image_filename).name).stem self.curved_line = curved_line - self.pcgts = pcgts if pcgts else PcGtsType() + self.pcgts = pcgts self.scale_x = None # XXX set outside __init__ self.scale_y = None # XXX set outside __init__ self.height_org = None # XXX set outside __init__ self.width_org = None # XXX set outside __init__ + @property + def image_filename_stem(self): + return Path(Path(self.image_filename).name).stem + def calculate_page_coords(self, cont_page): self.logger.debug('enter calculate_page_coords') points_page_print = "" @@ -141,7 +144,7 @@ class EynollahXmlWriter(): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure - pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org) + pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) page = pcgts.get_Page() page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) @@ -181,7 +184,7 @@ class EynollahXmlWriter(): self.logger.debug('enter build_pagexml_full_layout') # create the file structure - pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org) + pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) page = pcgts.get_Page() 
page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) diff --git a/setup.py b/setup.py index 7988aee..a54bb58 100644 --- a/setup.py +++ b/setup.py @@ -13,10 +13,13 @@ setup( namespace_packages=['qurator'], packages=find_packages(exclude=['tests']), install_requires=install_requires, + package_data={ + '': ['*.json'] + }, entry_points={ 'console_scripts': [ 'eynollah=qurator.eynollah.cli:main', - # 'ocrd-eynollah=qurator.eynollah.ocrd_cli:cli', + 'ocrd-eynollah-segment=qurator.eynollah.ocrd_cli:main', ] }, ) diff --git a/tests/test_dpi.py b/tests/test_dpi.py index 380928d..510ffc5 100644 --- a/tests/test_dpi.py +++ b/tests/test_dpi.py @@ -1,10 +1,11 @@ +import cv2 from pathlib import Path from qurator.eynollah.utils.pil_cv2 import check_dpi from tests.base import main def test_dpi(): - fpath = Path(__file__).parent.joinpath('resources', 'kant_aufklaerung_1784_0020.tif') - assert 300 == check_dpi(str(fpath)) + fpath = str(Path(__file__).parent.joinpath('resources', 'kant_aufklaerung_1784_0020.tif')) + assert 230 == check_dpi(cv2.imread(fpath)) if __name__ == '__main__': main(__file__)