move line-gt extraction out of ocr to eynollah-training

2026-03-13 02:31:56 +01:00 · 2025-11-28 12:09:50 +01:00 · 2025-11-28 12:09:50 +01:00 · 30f9c695dc
commit 30f9c695dc
parent 951bd2fce6
4 changed files with 500 additions and 429 deletions
--- a/src/eynollah/cli/cli_ocr.py
+++ b/src/eynollah/cli/cli_ocr.py
@ -59,12 +59,6 @@ import click
    is_flag=True,
    help="if this parameter set to true, transformer ocr will be applied, otherwise cnn_rnn model.",
 )
@click.option(
    "--export_textline_images_and_text",
    "-etit/-noetit",
    is_flag=True,
    help="if this parameter set to true, images and text in xml will be exported into output dir. This files can be used for training a OCR engine.",
 )
@click.option(
    "--do_not_mask_with_textline_contour",
    "-nmtc/-mtc",
@ -76,11 +70,6 @@ import click
    "-bs",
    help="number of inference batch size. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively",
 )
@click.option(
    "--dataset_abbrevation",
    "-ds_pref",
    help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset",
 )
@click.option(
    "--min_conf_value_of_textline_text",
    "-min_conf",
@ -97,7 +86,6 @@ def ocr_cli(
    dir_out_image_text,
    overwrite,
    tr_ocr,
    export_textline_images_and_text,
    do_not_mask_with_textline_contour,
    batch_size,
    dataset_abbrevation,
@ -106,18 +94,11 @@ def ocr_cli(
    """
    Recognize text with a CNN/RNN or transformer ML model.
    """
-    assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text  -etit can not be set alongside transformer ocr -tr_ocr"
+    assert bool(image) ^ bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both."
    # FIXME: refactor: move export_textline_images_and_text out of eynollah.py
    # assert not export_textline_images_and_text or not model, "Exporting textline and text  -etit can not be set alongside model -m"
    assert not export_textline_images_and_text or not batch_size, "Exporting textline and text  -etit can not be set alongside batch size -bs"
    assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text  -etit can not be set alongside directory of bin images -dib"
    assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text  -etit can not be set alongside directory of images with predicted text -doit"
    assert bool(image) != bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both."
    from ..eynollah_ocr import Eynollah_ocr
    eynollah_ocr = Eynollah_ocr(
        model_zoo=ctx.obj.model_zoo,
        tr_ocr=tr_ocr,
        export_textline_images_and_text=export_textline_images_and_text,
        do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
        batch_size=batch_size,
        pref_of_dataset=dataset_abbrevation,
--- a/src/eynollah/eynollah_ocr.py
+++ b/src/eynollah/eynollah_ocr.py
@ -9,17 +9,13 @@ from logging import Logger, getLogger
 from typing import Optional
 from pathlib import Path
 import os
 import json
 import gc
 import sys
 import math
 import time
 from keras.layers import StringLookup
 import cv2
 import xml.etree.ElementTree as ET
 import tensorflow as tf
 from keras.models import load_model
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
 from eynollah.model_zoo import EynollahModelZoo
@ -48,11 +44,6 @@ if sys.version_info < (3, 10):
 else:
    import importlib.resources as importlib_resources
 try:
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 except ImportError:
    TrOCRProcessor = VisionEncoderDecoderModel = None
 class Eynollah_ocr:
    def __init__(
        self,
@ -60,27 +51,16 @@ class Eynollah_ocr:
        model_zoo: EynollahModelZoo,
        tr_ocr=False,
        batch_size: Optional[int]=None,
        export_textline_images_and_text: bool=False,
        do_not_mask_with_textline_contour: bool=False,
        pref_of_dataset=None,
        min_conf_value_of_textline_text : Optional[float]=None,
        logger: Optional[Logger]=None,
    ):
        self.tr_ocr = tr_ocr
        # For generating textline-image pairs for traning, move to generate_gt_for_training
        self.export_textline_images_and_text = export_textline_images_and_text
        # masking for OCR and GT generation, relevant for skewed lines and bounding boxes
        self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
        # prefix or dataset
        self.pref_of_dataset = pref_of_dataset
        self.logger = logger if logger else getLogger('eynollah.ocr')
        self.model_zoo = model_zoo
        # TODO: Properly document what 'export_textline_images_and_text' is about
        if export_textline_images_and_text:
            self.logger.info("export_textline_images_and_text was set, so no actual models are loaded")
            return
        self.min_conf_value_of_textline_text = min_conf_value_of_textline_text if min_conf_value_of_textline_text else 0.3
        self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size
@ -539,11 +519,6 @@ class Eynollah_ocr:
                                    mask_poly = mask_poly[y:y+h, x:x+w, :]
                                    img_crop = img_poly_on_img[y:y+h, x:x+w, :]
                                    if self.export_textline_images_and_text:
                                        if not self.do_not_mask_with_textline_contour:
                                            img_crop[mask_poly==0] = 255
                                    else:
                                    # print(file_name, angle_degrees, w*h,
                                    #       mask_poly[:,:,0].sum(),
                                    #       mask_poly[:,:,0].sum() /float(w*h) ,
@ -602,7 +577,6 @@ class Eynollah_ocr:
                                                        break_curved_line_into_small_pieces_and_then_merge(
                                                            img_crop, mask_poly)
                                    if not self.export_textline_images_and_text:
                                    if w_scaled < 750:#1.5*image_width:
                                        img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                            img_crop, image_height, image_width)
@ -666,31 +640,9 @@ class Eynollah_ocr:
                                                    img_crop_bin, image_height, image_width)
                                                cropped_lines_bin.append(img_fin)
                                if self.export_textline_images_and_text:
                                    if img_crop.shape[0]==0 or img_crop.shape[1]==0:
                                        pass
                                    else:
                                        if child_textlines.tag.endswith("TextEquiv"):
                                            for cheild_text in child_textlines:
                                                if cheild_text.tag.endswith("Unicode"):
                                                    textline_text = cheild_text.text
                                                    if textline_text:
                                                        base_name = os.path.join(
                                                            dir_out, file_name + '_line_' + str(indexer_textlines))
                                                        if self.pref_of_dataset:
                                                            base_name += '_' + self.pref_of_dataset
                                                        if not self.do_not_mask_with_textline_contour:
                                                            base_name += '_masked'
                                                        with open(base_name + '.txt', 'w') as text_file:
                                                            text_file.write(textline_text)
                                                        cv2.imwrite(base_name + '.png', img_crop)
                                                    indexer_textlines+=1
                    if not self.export_textline_images_and_text:
                    indexer_text_region = indexer_text_region +1
                if not self.export_textline_images_and_text:
                extracted_texts = []
                extracted_conf_value = []
--- a/src/eynollah/training/cli.py
+++ b/src/eynollah/training/cli.py
@ -8,6 +8,7 @@ from .build_model_load_pretrained_weights_and_save import build_model_load_pretr
 from .generate_gt_for_training import main as generate_gt_cli
 from .inference import main as inference_cli
 from .train import ex
 from .extract_line_gt import linegt_cli
@click.command(context_settings=dict(
        ignore_unknown_options=True,
@ -24,3 +25,4 @@ main.add_command(build_model_load_pretrained_weights_and_save)
 main.add_command(generate_gt_cli, 'generate-gt')
 main.add_command(inference_cli, 'inference')
 main.add_command(train_cli, 'train')
 main.add_command(linegt_cli, 'export_textline_images_and_text')
--- a/src/eynollah/training/extract_line_gt.py
+++ b/src/eynollah/training/extract_line_gt.py
@ -0,0 +1,136 @@
 from logging import Logger, getLogger
 from typing import Optional
 from pathlib import Path
 import os
 import click
 import cv2
 import xml.etree.ElementTree as ET
 import numpy as np
 from ..utils import is_image_filename
@click.command()
@click.option(
    "--image",
    "-i",
    'image_filename',
    help="input image filename",
    type=click.Path(exists=True, dir_okay=False),
)
@click.option(
    "--dir_in",
    "-di",
    help="directory of input images (instead of --image)",
    type=click.Path(exists=True, file_okay=False),
)
@click.option(
    "--dir_xmls",
    "-dx",
    help="directory of input PAGE-XML files (in addition to --dir_in; filename stems must match the image files, with '.xml' suffix).",
    type=click.Path(exists=True, file_okay=False),
    required=True,
)
@click.option(
    "--out",
    "-o",
    'dir_out',
    help="directory for output textline images (.png) and their transcriptions (.txt)",
    type=click.Path(exists=True, file_okay=False),
    required=True,
)
@click.option(
    "--dataset_abbrevation",
    "-ds_pref",
    'pref_of_dataset',
    help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset",
)
@click.option(
    "--do_not_mask_with_textline_contour",
    "-nmtc/-mtc",
    is_flag=True,
    help="if this parameter set to true, cropped textline images will not be masked with textline contour.",
)
def linegt_cli(
    image_filename,
    dir_in,
    dir_xmls,
    dir_out,
    pref_of_dataset,
    do_not_mask_with_textline_contour,
):
    """
    Export textline image crops and their transcriptions from PAGE-XML ground truth.

    For every input image, the PAGE-XML file with the same stem is read from
    ``dir_xmls``. For each ``TextLine`` that is a direct child of a
    ``TextRegion``, the line polygon is cropped from the image and written to
    ``dir_out`` as ``<stem>_line_<n>[_<prefix>][_masked].png`` next to a
    ``.txt`` file holding the line's ``TextEquiv/Unicode`` text.

    Args:
        image_filename: path of a single input image (mutually exclusive with ``dir_in``).
        dir_in: directory of input images (mutually exclusive with ``image_filename``).
        dir_xmls: directory holding the PAGE-XML files.
        dir_out: existing directory receiving the ``.png`` / ``.txt`` pairs.
        pref_of_dataset: optional dataset abbreviation appended to the output basename.
        do_not_mask_with_textline_contour: if set, keep the plain bounding-box
            crop instead of painting pixels outside the line polygon white.

    Raises:
        click.UsageError: if neither or both of ``image_filename`` / ``dir_in`` are given.
        click.ClickException: if an input image cannot be decoded.
    """
    # Explicit check instead of `assert`, which would be stripped under `python -O`.
    if bool(dir_in) == bool(image_filename):
        raise click.UsageError("Set either --image or --dir_in, but not both")
    if dir_in:
        ls_imgs = [os.path.join(dir_in, name) for name in filter(is_image_filename, os.listdir(dir_in))]
    else:
        ls_imgs = [image_filename]

    mask_with_contour = not do_not_mask_with_textline_contour
    for dir_img in ls_imgs:
        file_name = Path(dir_img).stem
        dir_xml = os.path.join(dir_xmls, file_name + '.xml')
        img = cv2.imread(dir_img)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising
            raise click.ClickException("Cannot read image file '%s'" % dir_img)

        root = ET.parse(dir_xml, parser=ET.XMLParser(encoding="utf-8")).getroot()
        # Namespace-qualified tag of the first TextRegion element (None if the page has none)
        region_tag = next((elem.tag for elem in root.iter() if elem.tag.endswith('TextRegion')), None)
        if region_tag is None:
            continue

        # Counts Unicode elements of exported lines; used as suffix of the output basename
        indexer_textlines = 0
        # FIXME: non recursive, use OCR-D PAGE generateDS API. Or use an existing tool for this purpose altogether
        for region in root.iter(region_tag):
            for textline in region:
                if not textline.tag.endswith("TextLine"):
                    continue
                # Crop belongs to this line only; never reuse the previous line's crop
                img_crop = None
                for child in textline:
                    if child.tag.endswith("Coords"):
                        img_crop = _crop_textline(img, child.attrib['points'], mask_with_contour)
                    elif child.tag.endswith("TextEquiv"):
                        if img_crop is None:
                            # no (usable) Coords seen before the text, nothing to pair it with
                            continue
                        for cheild_text in child:
                            if not cheild_text.tag.endswith("Unicode"):
                                continue
                            textline_text = cheild_text.text
                            if textline_text:
                                base_name = os.path.join(dir_out, file_name + '_line_' + str(indexer_textlines))
                                if pref_of_dataset:
                                    base_name += '_' + pref_of_dataset
                                if mask_with_contour:
                                    base_name += '_masked'
                                with open(base_name + '.txt', 'w', encoding='utf-8') as text_file:
                                    text_file.write(textline_text)
                                cv2.imwrite(base_name + '.png', img_crop)
                            indexer_textlines += 1


def _crop_textline(img, points, mask_with_contour):
    """
    Cut the bounding box of a PAGE ``points`` polygon out of ``img``.

    Args:
        img: image array as returned by ``cv2.imread``.
        points: value of a PAGE ``Coords/@points`` attribute ("x,y x,y ...").
        mask_with_contour: if true, set every pixel of the crop that lies
            outside the polygon to 255.

    Returns:
        A copy of the cropped image region, or ``None`` if the polygon has no
        points or its bounding box does not overlap the image.
    """
    pairs = [point.split(',') for point in points.split()]
    if not pairs:
        return None
    coords = np.array([[int(pair[0]), int(pair[1])] for pair in pairs], dtype=np.int32)
    x, y, w, h = cv2.boundingRect(coords)
    # Clip to the image so polygons reaching outside the page cannot produce wrapped slices
    img_h, img_w = img.shape[:2]
    x0, y0 = max(x, 0), max(y, 0)
    x1, y1 = min(x + w, img_w), min(y + h, img_h)
    if x1 <= x0 or y1 <= y0:
        return None
    img_crop = img[y0:y1, x0:x1].copy()
    if mask_with_contour:
        # Rasterise the polygon on a crop-sized canvas instead of a full-page one
        mask_poly = np.zeros(img_crop.shape[:2], dtype=np.uint8)
        shifted = coords - np.array([x0, y0], dtype=np.int32)
        cv2.fillPoly(mask_poly, pts=[shifted], color=1)
        img_crop[mask_poly == 0] = 255
    return img_crop