move line-gt extraction out of ocr to eynollah-training

2026-06-28 07:49:21 +02:00 · 2025-11-28 12:09:50 +01:00 · 2025-11-28 12:09:50 +01:00 · 30f9c695dc
commit 30f9c695dc
parent 951bd2fce6
4 changed files with 500 additions and 429 deletions
--- a/src/eynollah/cli/cli_ocr.py
+++ b/src/eynollah/cli/cli_ocr.py
@ -59,12 +59,6 @@ import click
    is_flag=True,
    help="if this parameter set to true, transformer ocr will be applied, otherwise cnn_rnn model.",
 )
@click.option(
    "--export_textline_images_and_text",
    "-etit/-noetit",
    is_flag=True,
    help="if this parameter set to true, images and text in xml will be exported into output dir. This files can be used for training a OCR engine.",
 )
@click.option(
    "--do_not_mask_with_textline_contour",
    "-nmtc/-mtc",
@ -76,11 +70,6 @@ import click
    "-bs",
    help="number of inference batch size. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively",
 )
@click.option(
    "--dataset_abbrevation",
    "-ds_pref",
    help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset",
 )
@click.option(
    "--min_conf_value_of_textline_text",
    "-min_conf",
@ -97,7 +86,6 @@ def ocr_cli(
    dir_out_image_text,
    overwrite,
    tr_ocr,
    export_textline_images_and_text,
    do_not_mask_with_textline_contour,
    batch_size,
    dataset_abbrevation,
@ -106,18 +94,11 @@ def ocr_cli(
    """
    Recognize text with a CNN/RNN or transformer ML model.
    """
-    assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text  -etit can not be set alongside transformer ocr -tr_ocr"
+    assert bool(image) ^ bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both."
    # FIXME: refactor: move export_textline_images_and_text out of eynollah.py
    # assert not export_textline_images_and_text or not model, "Exporting textline and text  -etit can not be set alongside model -m"
    assert not export_textline_images_and_text or not batch_size, "Exporting textline and text  -etit can not be set alongside batch size -bs"
    assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text  -etit can not be set alongside directory of bin images -dib"
    assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text  -etit can not be set alongside directory of images with predicted text -doit"
    assert bool(image) != bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both."
    from ..eynollah_ocr import Eynollah_ocr
    eynollah_ocr = Eynollah_ocr(
        model_zoo=ctx.obj.model_zoo,
        tr_ocr=tr_ocr,
        export_textline_images_and_text=export_textline_images_and_text,
        do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
        batch_size=batch_size,
        pref_of_dataset=dataset_abbrevation,
--- a/src/eynollah/eynollah_ocr.py
+++ b/src/eynollah/eynollah_ocr.py
@ -9,17 +9,13 @@ from logging import Logger, getLogger
 from typing import Optional
 from pathlib import Path
 import os
 import json
 import gc
 import sys
 import math
 import time
 from keras.layers import StringLookup
 import cv2
 import xml.etree.ElementTree as ET
 import tensorflow as tf
 from keras.models import load_model
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
 from eynollah.model_zoo import EynollahModelZoo
@ -48,11 +44,6 @@ if sys.version_info < (3, 10):
 else:
    import importlib.resources as importlib_resources
 try:
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 except ImportError:
    TrOCRProcessor = VisionEncoderDecoderModel = None
 class Eynollah_ocr:
    def __init__(
        self,
@ -60,27 +51,16 @@ class Eynollah_ocr:
        model_zoo: EynollahModelZoo,
        tr_ocr=False,
        batch_size: Optional[int]=None,
        export_textline_images_and_text: bool=False,
        do_not_mask_with_textline_contour: bool=False,
        pref_of_dataset=None,
        min_conf_value_of_textline_text : Optional[float]=None,
        logger: Optional[Logger]=None,
    ):
        self.tr_ocr = tr_ocr
        # For generating textline-image pairs for training, move to generate_gt_for_training
        self.export_textline_images_and_text = export_textline_images_and_text
        # masking for OCR and GT generation, relevant for skewed lines and bounding boxes
        self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
        # prefix of dataset
        self.pref_of_dataset = pref_of_dataset
        self.logger = logger if logger else getLogger('eynollah.ocr')
        self.model_zoo = model_zoo
        # TODO: Properly document what 'export_textline_images_and_text' is about
        if export_textline_images_and_text:
            self.logger.info("export_textline_images_and_text was set, so no actual models are loaded")
            return
        self.min_conf_value_of_textline_text = min_conf_value_of_textline_text if min_conf_value_of_textline_text else 0.3
        self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size
@ -539,40 +519,55 @@ class Eynollah_ocr:
                                    mask_poly = mask_poly[y:y+h, x:x+w, :]
                                    img_crop = img_poly_on_img[y:y+h, x:x+w, :]
-                                    if self.export_textline_images_and_text:
+                                    # print(file_name, angle_degrees, w*h,
                                    #       mask_poly[:,:,0].sum(),
                                    #       mask_poly[:,:,0].sum() /float(w*h) ,
                                    #       'didi')
                                    if angle_degrees > 3:
                                        better_des_slope = get_orientation_moments(textline_coords)
                                        img_crop = rotate_image_with_padding(img_crop, better_des_slope)
                                        if dir_in_bin is not None:
                                            img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope)
                                        mask_poly = rotate_image_with_padding(mask_poly, better_des_slope)
                                        mask_poly = mask_poly.astype('uint8')
                                        #new bounding box
                                        x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0])
                                        mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :]
                                        img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :]
                                        if not self.do_not_mask_with_textline_contour:
                                            img_crop[mask_poly==0] = 255
-                                        
+                                        if dir_in_bin is not None:
-                                    else:
+                                            img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :]
                                        # print(file_name, angle_degrees, w*h,
                                        #       mask_poly[:,:,0].sum(),
                                        #       mask_poly[:,:,0].sum() /float(w*h) ,
                                        #       'didi')
                                        if angle_degrees > 3:
                                            better_des_slope = get_orientation_moments(textline_coords)
                                            img_crop = rotate_image_with_padding(img_crop, better_des_slope)
                                            if dir_in_bin is not None:
                                                img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope)
                                            mask_poly = rotate_image_with_padding(mask_poly, better_des_slope)
                                            mask_poly = mask_poly.astype('uint8')
                                            #new bounding box
                                            x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0])
                                            mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :]
                                            img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :]
                                            if not self.do_not_mask_with_textline_contour:
-                                                img_crop[mask_poly==0] = 255
+                                                img_crop_bin[mask_poly==0] = 255
                                        if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90:
                                            if dir_in_bin is not None:
-                                                img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :]
+                                                img_crop, img_crop_bin = \
-                                                if not self.do_not_mask_with_textline_contour:
+                                                    break_curved_line_into_small_pieces_and_then_merge(
-                                                    img_crop_bin[mask_poly==0] = 255
+                                                        img_crop, mask_poly, img_crop_bin)
-                                            
+                                            else:
-                                            if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90:
+                                                img_crop, _ = \
                                                    break_curved_line_into_small_pieces_and_then_merge(
                                                        img_crop, mask_poly)
                                    else:
                                        better_des_slope = 0
                                        if not self.do_not_mask_with_textline_contour:
                                            img_crop[mask_poly==0] = 255
                                        if dir_in_bin is not None:
                                            if not self.do_not_mask_with_textline_contour:
                                                img_crop_bin[mask_poly==0] = 255
                                        if type_textregion=='drop-capital':
                                            pass
                                        else:
                                            if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90:
                                                if dir_in_bin is not None:
                                                    img_crop, img_crop_bin = \
                                                        break_curved_line_into_small_pieces_and_then_merge(
@ -581,188 +576,178 @@ class Eynollah_ocr:
                                                    img_crop, _ = \
                                                        break_curved_line_into_small_pieces_and_then_merge(
                                                            img_crop, mask_poly)
                                        else:
                                            better_des_slope = 0
                                            if not self.do_not_mask_with_textline_contour:
                                                img_crop[mask_poly==0] = 255
                                            if dir_in_bin is not None:
                                                if not self.do_not_mask_with_textline_contour:
                                                    img_crop_bin[mask_poly==0] = 255
                                            if type_textregion=='drop-capital':
                                                pass
                                            else:
                                                if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90:
                                                    if dir_in_bin is not None:
                                                        img_crop, img_crop_bin = \
                                                            break_curved_line_into_small_pieces_and_then_merge(
                                                                img_crop, mask_poly, img_crop_bin)
                                                    else:
                                                        img_crop, _ = \
                                                            break_curved_line_into_small_pieces_and_then_merge(
                                                                img_crop, mask_poly)
-                                    if not self.export_textline_images_and_text:
+                                    if w_scaled < 750:#1.5*image_width:
-                                        if w_scaled < 750:#1.5*image_width:
+                                        img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                            img_crop, image_height, image_width)
                                        cropped_lines.append(img_fin)
                                        if abs(better_des_slope) > 45:
                                            cropped_lines_ver_index.append(1)
                                        else:
                                            cropped_lines_ver_index.append(0)
                                        cropped_lines_meging_indexing.append(0)
                                        if dir_in_bin is not None:
                                            img_fin = preprocess_and_resize_image_for_ocrcnn_model(
-                                                img_crop, image_height, image_width)
+                                                img_crop_bin, image_height, image_width)
                                            cropped_lines_bin.append(img_fin)
                                    else:
                                        splited_images, splited_images_bin = return_textlines_split_if_needed(
                                            img_crop, img_crop_bin if dir_in_bin is not None else None)
                                        if splited_images:
                                            img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                splited_images[0], image_height, image_width)
                                            cropped_lines.append(img_fin)
                                            cropped_lines_meging_indexing.append(1)
                                            if abs(better_des_slope) > 45:
                                                cropped_lines_ver_index.append(1)
                                            else:
                                                cropped_lines_ver_index.append(0)
                                            img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                splited_images[1], image_height, image_width)
                                            cropped_lines.append(img_fin)
                                            cropped_lines_meging_indexing.append(-1)
                                            if abs(better_des_slope) > 45:
                                                cropped_lines_ver_index.append(1)
                                            else:
                                                cropped_lines_ver_index.append(0)
                                            if dir_in_bin is not None:
                                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                    splited_images_bin[0], image_height, image_width)
                                                cropped_lines_bin.append(img_fin)
                                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                    splited_images_bin[1], image_height, image_width)
                                                cropped_lines_bin.append(img_fin)
                                        else:
                                            img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                img_crop, image_height, image_width)
                                            cropped_lines.append(img_fin)
                                            cropped_lines_meging_indexing.append(0)
                                            if abs(better_des_slope) > 45:
                                                cropped_lines_ver_index.append(1)
                                            else:
                                                cropped_lines_ver_index.append(0)
                                            if dir_in_bin is not None:
                                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                    img_crop_bin, image_height, image_width)
                                                cropped_lines_bin.append(img_fin)
                                        else:
                                            splited_images, splited_images_bin = return_textlines_split_if_needed(
                                                img_crop, img_crop_bin if dir_in_bin is not None else None)
                                            if splited_images:
                                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                    splited_images[0], image_height, image_width)
                                                cropped_lines.append(img_fin)
                                                cropped_lines_meging_indexing.append(1)
                                                if abs(better_des_slope) > 45:
                                                    cropped_lines_ver_index.append(1)
                                                else:
                                                    cropped_lines_ver_index.append(0)
                                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                    splited_images[1], image_height, image_width)
                                                cropped_lines.append(img_fin)
                                                cropped_lines_meging_indexing.append(-1)
                                                if abs(better_des_slope) > 45:
                                                    cropped_lines_ver_index.append(1)
                                                else:
                                                    cropped_lines_ver_index.append(0)
                                                if dir_in_bin is not None:
                                                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                        splited_images_bin[0], image_height, image_width)
                                                    cropped_lines_bin.append(img_fin)
                                                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                        splited_images_bin[1], image_height, image_width)
                                                    cropped_lines_bin.append(img_fin)
                                            else:
                                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                    img_crop, image_height, image_width)
                                                cropped_lines.append(img_fin)
                                                cropped_lines_meging_indexing.append(0)
                                                if abs(better_des_slope) > 45:
                                                    cropped_lines_ver_index.append(1)
                                                else:
                                                    cropped_lines_ver_index.append(0)
                                                if dir_in_bin is not None:
                                                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                                        img_crop_bin, image_height, image_width)
                                                    cropped_lines_bin.append(img_fin)
                                if self.export_textline_images_and_text:
                                    if img_crop.shape[0]==0 or img_crop.shape[1]==0:
                                        pass
                                    else:
                                        if child_textlines.tag.endswith("TextEquiv"):
                                            for cheild_text in child_textlines:
                                                if cheild_text.tag.endswith("Unicode"):
                                                    textline_text = cheild_text.text
                                                    if textline_text:
                                                        base_name = os.path.join(
                                                            dir_out, file_name + '_line_' + str(indexer_textlines))
                                                        if self.pref_of_dataset:
                                                            base_name += '_' + self.pref_of_dataset
                                                        if not self.do_not_mask_with_textline_contour:
                                                            base_name += '_masked'
                                                        with open(base_name + '.txt', 'w') as text_file:
                                                            text_file.write(textline_text)
                                                        cv2.imwrite(base_name + '.png', img_crop)
                                                    indexer_textlines+=1
                    if not self.export_textline_images_and_text:
                        indexer_text_region = indexer_text_region +1
                if not self.export_textline_images_and_text:
                    extracted_texts = []
                    extracted_conf_value = []
                    n_iterations  = math.ceil(len(cropped_lines) / self.b_s) 
                    for i in range(n_iterations):
                        if i==(n_iterations-1):
                            n_start = i*self.b_s
                            imgs = cropped_lines[n_start:]
                            imgs = np.array(imgs)
                            imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3)
                            ver_imgs = np.array( cropped_lines_ver_index[n_start:] )
                            indices_ver = np.where(ver_imgs == 1)[0]
                            #print(indices_ver, 'indices_ver')
                            if len(indices_ver)>0:
                                imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
                                imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
                                #print(imgs_ver_flipped, 'imgs_ver_flipped')
                            else:
                                imgs_ver_flipped = None
                            if dir_in_bin is not None:
                                imgs_bin = cropped_lines_bin[n_start:]
                                imgs_bin = np.array(imgs_bin)
                                imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3)
                                if len(indices_ver)>0:
                                    imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
                                    imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
                                    #print(imgs_ver_flipped, 'imgs_ver_flipped')
-                                else:
+
-                                    imgs_bin_ver_flipped = None
+                    indexer_text_region = indexer_text_region +1
-                        else:
+                    
-                            n_start = i*self.b_s
+                extracted_texts = []
-                            n_end = (i+1)*self.b_s
+                extracted_conf_value = []
-                            imgs = cropped_lines[n_start:n_end]
+
-                            imgs = np.array(imgs).reshape(self.b_s, image_height, image_width, 3)
+                n_iterations  = math.ceil(len(cropped_lines) / self.b_s) 
                for i in range(n_iterations):
                    if i==(n_iterations-1):
                        n_start = i*self.b_s
                        imgs = cropped_lines[n_start:]
                        imgs = np.array(imgs)
                        imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3)
                        ver_imgs = np.array( cropped_lines_ver_index[n_start:] )
                        indices_ver = np.where(ver_imgs == 1)[0]
                        #print(indices_ver, 'indices_ver')
                        if len(indices_ver)>0:
                            imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
                            imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
                            #print(imgs_ver_flipped, 'imgs_ver_flipped')
-                            ver_imgs = np.array( cropped_lines_ver_index[n_start:n_end] )
+                        else:
-                            indices_ver = np.where(ver_imgs == 1)[0]
+                            imgs_ver_flipped = None
-                            #print(indices_ver, 'indices_ver')
+                        
                        if dir_in_bin is not None:
                            imgs_bin = cropped_lines_bin[n_start:]
                            imgs_bin = np.array(imgs_bin)
                            imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3)
                            if len(indices_ver)>0:
-                                imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
+                                imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
-                                imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
+                                imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
                                #print(imgs_ver_flipped, 'imgs_ver_flipped')
                            else:
-                                imgs_ver_flipped = None
+                                imgs_bin_ver_flipped = None
-
+                    else:
-                            
+                        n_start = i*self.b_s
-                            if dir_in_bin is not None:
+                        n_end = (i+1)*self.b_s
-                                imgs_bin = cropped_lines_bin[n_start:n_end]
+                        imgs = cropped_lines[n_start:n_end]
-                                imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3)
+                        imgs = np.array(imgs).reshape(self.b_s, image_height, image_width, 3)
-                                
+                        
-                                
+                        ver_imgs = np.array( cropped_lines_ver_index[n_start:n_end] )
-                                if len(indices_ver)>0:
+                        indices_ver = np.where(ver_imgs == 1)[0]
-                                    imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
+                        #print(indices_ver, 'indices_ver')
                                    imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
                                    #print(imgs_ver_flipped, 'imgs_ver_flipped')
                                else:
                                    imgs_bin_ver_flipped = None
                        self.logger.debug("processing next %d lines", len(imgs))
                        preds = self.model_zoo.get('ocr').predict(imgs, verbose=0)
                        if len(indices_ver)>0:
-                            preds_flipped = self.model_zoo.get('ocr').predict(imgs_ver_flipped, verbose=0)
+                            imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
                            imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
                            #print(imgs_ver_flipped, 'imgs_ver_flipped')
                        else:
                            imgs_ver_flipped = None
                        if dir_in_bin is not None:
                            imgs_bin = cropped_lines_bin[n_start:n_end]
                            imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3)
                            if len(indices_ver)>0:
                                imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
                                imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
                                #print(imgs_ver_flipped, 'imgs_ver_flipped')
                            else:
                                imgs_bin_ver_flipped = None
                    self.logger.debug("processing next %d lines", len(imgs))
                    preds = self.model_zoo.get('ocr').predict(imgs, verbose=0)
                    if len(indices_ver)>0:
                        preds_flipped = self.model_zoo.get('ocr').predict(imgs_ver_flipped, verbose=0)
                        preds_max_fliped = np.max(preds_flipped, axis=2 )
                        preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
                        pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character
                        masked_means_flipped = \
                            np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / \
                            np.sum(pred_max_not_unk_mask_bool_flipped, axis=1)
                        masked_means_flipped[np.isnan(masked_means_flipped)] = 0
                        preds_max = np.max(preds, axis=2 )
                        preds_max_args = np.argmax(preds, axis=2 )
                        pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
                        masked_means = \
                            np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
                            np.sum(pred_max_not_unk_mask_bool, axis=1)
                        masked_means[np.isnan(masked_means)] = 0
                        masked_means_ver = masked_means[indices_ver]
                        #print(masked_means_ver, 'pred_max_not_unk')
                        indices_where_flipped_conf_value_is_higher = \
                            np.where(masked_means_flipped > masked_means_ver)[0]
                        #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
                        if len(indices_where_flipped_conf_value_is_higher)>0:
                            indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
                            preds[indices_to_be_replaced,:,:] = \
                                preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
                    if dir_in_bin is not None:
                        preds_bin = self.model_zoo.get('ocr').predict(imgs_bin, verbose=0)
                        if len(indices_ver)>0:
                            preds_flipped = self.model_zoo.get('ocr').predict(imgs_bin_ver_flipped, verbose=0)
                            preds_max_fliped = np.max(preds_flipped, axis=2 )
                            preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
                            pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character
@ -789,212 +774,179 @@ class Eynollah_ocr:
                            #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
                            if len(indices_where_flipped_conf_value_is_higher)>0:
                                indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
-                                preds[indices_to_be_replaced,:,:] = \
+                                preds_bin[indices_to_be_replaced,:,:] = \
                                    preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
                        if dir_in_bin is not None:
                            preds_bin = self.model_zoo.get('ocr').predict(imgs_bin, verbose=0)
                            if len(indices_ver)>0:
                                preds_flipped = self.model_zoo.get('ocr').predict(imgs_bin_ver_flipped, verbose=0)
                                preds_max_fliped = np.max(preds_flipped, axis=2 )
                                preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
                                pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character
                                masked_means_flipped = \
                                    np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / \
                                    np.sum(pred_max_not_unk_mask_bool_flipped, axis=1)
                                masked_means_flipped[np.isnan(masked_means_flipped)] = 0
                                preds_max = np.max(preds, axis=2 )
                                preds_max_args = np.argmax(preds, axis=2 )
                                pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
                                masked_means = \
                                    np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
                                    np.sum(pred_max_not_unk_mask_bool, axis=1)
                                masked_means[np.isnan(masked_means)] = 0
                                masked_means_ver = masked_means[indices_ver]
                                #print(masked_means_ver, 'pred_max_not_unk')
                                indices_where_flipped_conf_value_is_higher = \
                                    np.where(masked_means_flipped > masked_means_ver)[0]
                                #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
                                if len(indices_where_flipped_conf_value_is_higher)>0:
                                    indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
                                    preds_bin[indices_to_be_replaced,:,:] = \
                                        preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
                            preds = (preds + preds_bin) / 2.
                        pred_texts = decode_batch_predictions(preds, self.model_zoo.get('num_to_char'))
-                        preds_max = np.max(preds, axis=2 )
+                        preds = (preds + preds_bin) / 2.
                        preds_max_args = np.argmax(preds, axis=2 )
                        pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
                        masked_means = \
                            np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
                            np.sum(pred_max_not_unk_mask_bool, axis=1)
-                        for ib in range(imgs.shape[0]):
+                    pred_texts = decode_batch_predictions(preds, self.model_zoo.get('num_to_char'))
                            pred_texts_ib = pred_texts[ib].replace("[UNK]", "")
                            if masked_means[ib] >= self.min_conf_value_of_textline_text:
                                extracted_texts.append(pred_texts_ib)
                                extracted_conf_value.append(masked_means[ib])
                            else:
                                extracted_texts.append("")
                                extracted_conf_value.append(0)
                    del cropped_lines
                    if dir_in_bin is not None:
                        del cropped_lines_bin
                    gc.collect()
-                    extracted_texts_merged = [extracted_texts[ind]
+                    preds_max = np.max(preds, axis=2 )
-                                              if cropped_lines_meging_indexing[ind]==0
+                    preds_max_args = np.argmax(preds, axis=2 )
-                                              else extracted_texts[ind]+" "+extracted_texts[ind+1]
+                    pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
-                                              if cropped_lines_meging_indexing[ind]==1
+                    masked_means = \
-                                              else None
+                        np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
-                                              for ind in range(len(cropped_lines_meging_indexing))]
+                        np.sum(pred_max_not_unk_mask_bool, axis=1)
                    extracted_conf_value_merged = [extracted_conf_value[ind]
                                                   if cropped_lines_meging_indexing[ind]==0
                                                   else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2.
                                                   if cropped_lines_meging_indexing[ind]==1
                                                   else None
                                                   for ind in range(len(cropped_lines_meging_indexing))]
-                    extracted_conf_value_merged = [extracted_conf_value_merged[ind_cfm]
+                    for ib in range(imgs.shape[0]):
-                                                   for ind_cfm in range(len(extracted_texts_merged))
+                        pred_texts_ib = pred_texts[ib].replace("[UNK]", "")
-                                                   if extracted_texts_merged[ind_cfm] is not None]
+                        if masked_means[ib] >= self.min_conf_value_of_textline_text:
-                    extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
+                            extracted_texts.append(pred_texts_ib)
-                    unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
+                            extracted_conf_value.append(masked_means[ib])
                    if dir_out_image_text:
                        #font_path = "Charis-7.000/Charis-Regular.ttf"  # Make sure this file exists!
                        font = importlib_resources.files(__package__) / "Charis-Regular.ttf"
                        with importlib_resources.as_file(font) as font:
                            font = ImageFont.truetype(font=font, size=40)
                        for indexer_text, bb_ind in enumerate(total_bb_coordinates):
                            x_bb = bb_ind[0]
                            y_bb = bb_ind[1]
                            w_bb = bb_ind[2]
                            h_bb = bb_ind[3]
                            font = fit_text_single_line(draw, extracted_texts_merged[indexer_text],
                                                        font.path, w_bb, int(h_bb*0.4) )
                            ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
                            text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font)
                            text_width = text_bbox[2] - text_bbox[0]
                            text_height = text_bbox[3] - text_bbox[1]
                            text_x = x_bb + (w_bb - text_width) // 2  # Center horizontally
                            text_y = y_bb + (h_bb - text_height) // 2  # Center vertically
                            # Draw the text
                            draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font)
                        image_text.save(out_image_with_text)
                    text_by_textregion = []
                    for ind in unique_cropped_lines_region_indexer:
                        ind = np.array(cropped_lines_region_indexer)==ind
                        extracted_texts_merged_un = np.array(extracted_texts_merged)[ind]
                        if len(extracted_texts_merged_un)>1:
                            text_by_textregion_ind = ""
                            next_glue = ""
                            for indt in range(len(extracted_texts_merged_un)):
                                if (extracted_texts_merged_un[indt].endswith('⸗') or
                                    extracted_texts_merged_un[indt].endswith('-') or
                                    extracted_texts_merged_un[indt].endswith('¬')):
                                    text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt][:-1]
                                    next_glue = ""
                                else:
                                    text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt]
                                    next_glue = " "
                            text_by_textregion.append(text_by_textregion_ind)
                        else:
-                            text_by_textregion.append(" ".join(extracted_texts_merged_un))
+                            extracted_texts.append("")
-                        #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion')
+                            extracted_conf_value.append(0)
                del cropped_lines
                if dir_in_bin is not None:
                    del cropped_lines_bin
                gc.collect()
                extracted_texts_merged = [extracted_texts[ind]
                                            if cropped_lines_meging_indexing[ind]==0
                                            else extracted_texts[ind]+" "+extracted_texts[ind+1]
                                            if cropped_lines_meging_indexing[ind]==1
                                            else None
                                            for ind in range(len(cropped_lines_meging_indexing))]
                extracted_conf_value_merged = [extracted_conf_value[ind]
                                                if cropped_lines_meging_indexing[ind]==0
                                                else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2.
                                                if cropped_lines_meging_indexing[ind]==1
                                                else None
                                                for ind in range(len(cropped_lines_meging_indexing))]
-                    ###index_tot_regions = []
+                extracted_conf_value_merged = [extracted_conf_value_merged[ind_cfm]
-                    ###tot_region_ref = []
+                                                for ind_cfm in range(len(extracted_texts_merged))
                                                if extracted_texts_merged[ind_cfm] is not None]
                extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
                unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
                if dir_out_image_text:
                    #font_path = "Charis-7.000/Charis-Regular.ttf"  # Make sure this file exists!
                    font = importlib_resources.files(__package__) / "Charis-Regular.ttf"
                    with importlib_resources.as_file(font) as font:
                        font = ImageFont.truetype(font=font, size=40)
                    for indexer_text, bb_ind in enumerate(total_bb_coordinates):
                        x_bb = bb_ind[0]
                        y_bb = bb_ind[1]
                        w_bb = bb_ind[2]
                        h_bb = bb_ind[3]
                        font = fit_text_single_line(draw, extracted_texts_merged[indexer_text],
                                                    font.path, w_bb, int(h_bb*0.4) )
                        ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
                        text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font)
                        text_width = text_bbox[2] - text_bbox[0]
                        text_height = text_bbox[3] - text_bbox[1]
-                    ###for jj in root1.iter(link+'RegionRefIndexed'):
+                        text_x = x_bb + (w_bb - text_width) // 2  # Center horizontally
-                        ###index_tot_regions.append(jj.attrib['index'])
+                        text_y = y_bb + (h_bb - text_height) // 2  # Center vertically
                        ###tot_region_ref.append(jj.attrib['regionRef'])
                    ###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)}
                    #id_textregions = []
                    #textregions_by_existing_ids = []
                    indexer = 0
                    indexer_textregion = 0
                    for nn in root1.iter(region_tags):
                        #id_textregion = nn.attrib['id']
                        #id_textregions.append(id_textregion)
                        #textregions_by_existing_ids.append(text_by_textregion[indexer_textregion])
                        is_textregion_text = False
                        for childtest in nn:
                            if childtest.tag.endswith("TextEquiv"):
                                is_textregion_text = True
                        if not is_textregion_text:
                            text_subelement_textregion = ET.SubElement(nn, 'TextEquiv')
                            unicode_textregion = ET.SubElement(text_subelement_textregion, 'Unicode')
-                        
+                        # Draw the text
-                        has_textline = False
+                        draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font)
-                        for child_textregion in nn:
+                    image_text.save(out_image_with_text)
-                            if child_textregion.tag.endswith("TextLine"):
+
-                                
+                text_by_textregion = []
-                                is_textline_text = False
+                for ind in unique_cropped_lines_region_indexer:
-                                for childtest2 in child_textregion:
+                    ind = np.array(cropped_lines_region_indexer)==ind
-                                    if childtest2.tag.endswith("TextEquiv"):
+                    extracted_texts_merged_un = np.array(extracted_texts_merged)[ind]
-                                        is_textline_text = True
+                    if len(extracted_texts_merged_un)>1:
-                                
+                        text_by_textregion_ind = ""
-                                
+                        next_glue = ""
-                                if not is_textline_text:
+                        for indt in range(len(extracted_texts_merged_un)):
-                                    text_subelement = ET.SubElement(child_textregion, 'TextEquiv')
+                            if (extracted_texts_merged_un[indt].endswith('⸗') or
-                                    text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}")
+                                extracted_texts_merged_un[indt].endswith('-') or
-                                    unicode_textline = ET.SubElement(text_subelement, 'Unicode')
+                                extracted_texts_merged_un[indt].endswith('¬')):
-                                    unicode_textline.text = extracted_texts_merged[indexer]
+                                text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt][:-1]
-                                else:
+                                next_glue = ""
                                    for childtest3 in child_textregion:
                                        if childtest3.tag.endswith("TextEquiv"):
                                            for child_uc in childtest3:
                                                if child_uc.tag.endswith("Unicode"):
                                                    childtest3.set('conf',
                                                                   f"{extracted_conf_value_merged[indexer]:.2f}")
                                                    child_uc.text = extracted_texts_merged[indexer]
                                indexer = indexer + 1
                                has_textline = True
                        if has_textline:
                            if is_textregion_text:
                                for child4 in nn:
                                    if child4.tag.endswith("TextEquiv"):
                                        for childtr_uc in child4:
                                            if childtr_uc.tag.endswith("Unicode"):
                                                childtr_uc.text = text_by_textregion[indexer_textregion]
                            else:
-                                unicode_textregion.text = text_by_textregion[indexer_textregion]
+                                text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt]
-                            indexer_textregion = indexer_textregion + 1
+                                next_glue = " "
                        text_by_textregion.append(text_by_textregion_ind)
                    else:
                        text_by_textregion.append(" ".join(extracted_texts_merged_un))
                    #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion')
                ###index_tot_regions = []
                ###tot_region_ref = []
                ###for jj in root1.iter(link+'RegionRefIndexed'):
                    ###index_tot_regions.append(jj.attrib['index'])
                    ###tot_region_ref.append(jj.attrib['regionRef'])
                ###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)}
                #id_textregions = []
                #textregions_by_existing_ids = []
                indexer = 0
                indexer_textregion = 0
                for nn in root1.iter(region_tags):
                    #id_textregion = nn.attrib['id']
                    #id_textregions.append(id_textregion)
                    #textregions_by_existing_ids.append(text_by_textregion[indexer_textregion])
                    is_textregion_text = False
                    for childtest in nn:
                        if childtest.tag.endswith("TextEquiv"):
                            is_textregion_text = True
                    if not is_textregion_text:
                        text_subelement_textregion = ET.SubElement(nn, 'TextEquiv')
                        unicode_textregion = ET.SubElement(text_subelement_textregion, 'Unicode')
                    has_textline = False
                    for child_textregion in nn:
                        if child_textregion.tag.endswith("TextLine"):
-                    ###sample_order  = [(id_to_order[tid], text)
+                            is_textline_text = False
-                    ###                 for tid, text in zip(id_textregions, textregions_by_existing_ids)
+                            for childtest2 in child_textregion:
-                    ###                 if tid in id_to_order]
+                                if childtest2.tag.endswith("TextEquiv"):
-                    
+                                    is_textline_text = True
-                    ##ordered_texts_sample = [text for _, text in sorted(sample_order)]
+                            
-                    ##tot_page_text = ' '.join(ordered_texts_sample)
+                            
-                    
+                            if not is_textline_text:
-                    ##for page_element in root1.iter(link+'Page'):
+                                text_subelement = ET.SubElement(child_textregion, 'TextEquiv')
-                        ##text_page = ET.SubElement(page_element, 'TextEquiv')
+                                text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}")
-                        ##unicode_textpage = ET.SubElement(text_page, 'Unicode')
+                                unicode_textline = ET.SubElement(text_subelement, 'Unicode')
-                        ##unicode_textpage.text = tot_page_text
+                                unicode_textline.text = extracted_texts_merged[indexer]
-                    
+                            else:
-                    ET.register_namespace("",name_space)
+                                for childtest3 in child_textregion:
-                    tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf-8",default_namespace=None)
+                                    if childtest3.tag.endswith("TextEquiv"):
-                    #print("Job done in %.1fs", time.time() - t0)
+                                        for child_uc in childtest3:
                                            if child_uc.tag.endswith("Unicode"):
                                                childtest3.set('conf',
                                                                f"{extracted_conf_value_merged[indexer]:.2f}")
                                                child_uc.text = extracted_texts_merged[indexer]
                            indexer = indexer + 1
                            has_textline = True
                    if has_textline:
                        if is_textregion_text:
                            for child4 in nn:
                                if child4.tag.endswith("TextEquiv"):
                                    for childtr_uc in child4:
                                        if childtr_uc.tag.endswith("Unicode"):
                                            childtr_uc.text = text_by_textregion[indexer_textregion]
                        else:
                            unicode_textregion.text = text_by_textregion[indexer_textregion]
                        indexer_textregion = indexer_textregion + 1
                ###sample_order  = [(id_to_order[tid], text)
                ###                 for tid, text in zip(id_textregions, textregions_by_existing_ids)
                ###                 if tid in id_to_order]
                ##ordered_texts_sample = [text for _, text in sorted(sample_order)]
                ##tot_page_text = ' '.join(ordered_texts_sample)
                ##for page_element in root1.iter(link+'Page'):
                    ##text_page = ET.SubElement(page_element, 'TextEquiv')
                    ##unicode_textpage = ET.SubElement(text_page, 'Unicode')
                    ##unicode_textpage.text = tot_page_text
                ET.register_namespace("",name_space)
                tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf-8",default_namespace=None)
                #print("Job done in %.1fs", time.time() - t0)
--- a/src/eynollah/training/cli.py
+++ b/src/eynollah/training/cli.py
@ -8,6 +8,7 @@ from .build_model_load_pretrained_weights_and_save import build_model_load_pretr
 from .generate_gt_for_training import main as generate_gt_cli
 from .inference import main as inference_cli
 from .train import ex
 from .extract_line_gt import linegt_cli
@click.command(context_settings=dict(
        ignore_unknown_options=True,
@ -24,3 +25,4 @@ main.add_command(build_model_load_pretrained_weights_and_save)
 main.add_command(generate_gt_cli, 'generate-gt')
 main.add_command(inference_cli, 'inference')
 main.add_command(train_cli, 'train')
 main.add_command(linegt_cli, 'export_textline_images_and_text')
--- a/src/eynollah/training/extract_line_gt.py
+++ b/src/eynollah/training/extract_line_gt.py
@ -0,0 +1,136 @@
 from logging import Logger, getLogger
 from typing import Optional
 from pathlib import Path
 import os
 import click
 import cv2
 import xml.etree.ElementTree as ET
 import numpy as np
 from ..utils import is_image_filename
def _textline_polygon(coords_element):
    """Parse the ``points`` attribute of a PAGE ``Coords`` element into an (N, 2) int array."""
    polygon = []
    # ``split()`` (instead of ``split(' ')``) tolerates repeated or trailing whitespace.
    for point in coords_element.attrib['points'].split():
        x_str, y_str = point.split(',')[:2]
        polygon.append([int(x_str), int(y_str)])
    return np.array(polygon)


def _crop_textline(img, polygon, mask_with_contour):
    """Crop the bounding box of ``polygon`` out of ``img``.

    When ``mask_with_contour`` is true, every pixel of the crop that lies
    outside the polygon is set to 255. ``img`` itself is never modified.
    """
    x, y, w, h = cv2.boundingRect(polygon)
    # Copy only the crop, so painting the mask cannot touch the page image.
    img_crop = img[y : y + h, x : x + w, :].copy()
    if mask_with_contour:
        # uint8 is sufficient for a 0/1 mask and much smaller than the float64 default.
        mask_poly = np.zeros(img.shape, dtype=np.uint8)
        mask_poly = cv2.fillPoly(mask_poly, pts=[polygon], color=(1, 1, 1))
        img_crop[mask_poly[y : y + h, x : x + w, :] == 0] = 255
    return img_crop


def _textline_texts(textline):
    """Yield the text of every ``Unicode`` child of every direct ``TextEquiv`` child of ``textline``."""
    for text_equiv in textline:
        if not text_equiv.tag.endswith("TextEquiv"):
            continue
        for unicode_element in text_equiv:
            if unicode_element.tag.endswith("Unicode"):
                yield unicode_element.text


@click.command()
@click.option(
    "--image",
    "-i",
    # Explicit parameter name: must match the ``image_filename`` argument of linegt_cli.
    'image_filename',
    help="input image filename",
    type=click.Path(exists=True, dir_okay=False),
)
@click.option(
    "--dir_in",
    "-di",
    'dir_in',
    help="directory of input images (instead of --image)",
    type=click.Path(exists=True, file_okay=False),
)
@click.option(
    "--dir_xmls",
    "-dx",
    help="directory of input PAGE-XML files (in addition to --dir_in; filename stems must match the image files, with '.xml' suffix).",
    type=click.Path(exists=True, file_okay=False),
    required=True,
)
@click.option(
    "--out",
    "-o",
    'dir_out',
    help="directory for output textline images (.png) and transcriptions (.txt)",
    type=click.Path(exists=True, file_okay=False),
    required=True,
)
@click.option(
    "--dataset_abbrevation",
    "-ds_pref",
    'pref_of_dataset',
    help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset",
)
@click.option(
    "--do_not_mask_with_textline_contour",
    "-nmtc/-mtc",
    is_flag=True,
    help="if this parameter set to true, cropped textline images will not be masked with textline contour.",
)
def linegt_cli(
    image_filename,
    dir_in,
    dir_xmls,
    dir_out,
    pref_of_dataset,
    do_not_mask_with_textline_contour,
):
    """
    Export textline images and their transcriptions from PAGE-XML ground truth.
    \f
    For every input image, the PAGE-XML file with the same stem is read from
    ``dir_xmls``. For every ``TextLine`` that is a direct child of a
    ``TextRegion`` and has a ``Coords`` child, the line is cropped from the
    image, and for every non-empty ``TextEquiv/Unicode`` text a pair
    ``<stem>_line_<n>[_<pref_of_dataset>][_masked].png`` / ``.txt`` is written
    to ``dir_out``.

    Args:
        image_filename: path of a single input image (exclusive with ``dir_in``).
        dir_in: directory of input images (exclusive with ``image_filename``).
        dir_xmls: directory holding the PAGE-XML file of every image.
        dir_out: existing directory receiving the ``.png``/``.txt`` pairs.
        pref_of_dataset: optional tag appended to every output base name.
        do_not_mask_with_textline_contour: keep the plain bounding-box crop
            instead of painting pixels outside the line polygon with 255.

    Raises:
        click.UsageError: if both or neither of ``dir_in`` and
            ``image_filename`` are given.
    """
    # Explicit check instead of ``assert``, which is stripped under ``python -O``.
    if bool(dir_in) == bool(image_filename):
        raise click.UsageError("Set either --dir_in or --image, but not both")
    if dir_in:
        ls_imgs = [os.path.join(dir_in, name) for name in os.listdir(dir_in) if is_image_filename(name)]
    else:
        ls_imgs = [image_filename]

    logger = getLogger(__name__)
    mask_with_contour = not do_not_mask_with_textline_contour

    for dir_img in ls_imgs:
        file_name = Path(dir_img).stem
        dir_xml = os.path.join(dir_xmls, file_name + '.xml')
        img = cv2.imread(dir_img)
        if img is None:
            # cv2.imread signals an unreadable file by returning None rather than raising.
            logger.warning("cannot read image '%s', skipping", dir_img)
            continue

        tree1 = ET.parse(dir_xml, parser=ET.XMLParser(encoding="utf-8"))
        root1 = tree1.getroot()
        # The namespaced tag of the first TextRegion is reused to iterate all regions.
        region_tag = next((elem.tag for elem in root1.iter() if elem.tag.endswith('TextRegion')), None)
        if region_tag is None:
            logger.warning("no TextRegion found in '%s', skipping", dir_xml)
            continue

        indexer_textlines = 0
        # FIXME: non recursive, use OCR-D PAGE generateDS API. Or use an existing tool for this purpose altogether
        for region in root1.iter(region_tag):
            for textline in region:
                if not textline.tag.endswith("TextLine"):
                    continue
                # Coords and TextEquiv are sibling children of the TextLine, so
                # they are looked up independently of each other.
                coords = next((child for child in textline if child.tag.endswith("Coords")), None)
                if coords is None:
                    continue
                img_crop = _crop_textline(img, _textline_polygon(coords), mask_with_contour)
                if img_crop.shape[0] == 0 or img_crop.shape[1] == 0:
                    continue
                for textline_text in _textline_texts(textline):
                    if textline_text:
                        base_name = os.path.join(dir_out, file_name + '_line_' + str(indexer_textlines))
                        if pref_of_dataset:
                            base_name += '_' + pref_of_dataset
                        if mask_with_contour:
                            base_name += '_masked'
                        # Explicit UTF-8 so transcriptions do not depend on the locale.
                        with open(base_name + '.txt', 'w', encoding='utf-8') as text_file:
                            text_file.write(textline_text)
                        cv2.imwrite(base_name + '.png', img_crop)
                    # The counter advances for empty transcriptions too.
                    indexer_textlines += 1