Eynollah_ocr: adapt to inference model, improve and simplify…

- drop `end_character` mechanics and `characters` model type for decoding output probability (not needed)
- drop `decode_batch_predictions()` and `num_to_char` model type (now part of the inference model)
- drop rough confidence estimation calculation (now returned precisely by the inference model)
- adapt model prediction to inference model: just omit zeros, map to bytes, filter OOV tokens and decode UTF-8 to str
- if no binarization input was provided, compute it on the fly using the `binarization` model
- also apply `min_conf_value_of_textline_text` (as for TrOCR)
- batch over the entire page instead of region-wise (which underfilled batches)
- simplify and avoid copied redundant code
- rename `extracted_conf_value_merged` → `extracted_confs_merged`
- move `batched()` from `utils.utils_ocr` to `utils`
- drop `utils_ocr.distortion_free_resize()` (not needed)
- simplify `utils_ocr.break_curved_line_into_small_pieces_and_then_merge()`
- drop `utils_ocr.return_textline_contour_with_added_box_coordinate()` and `utils_ocr.return_rnn_cnn_ocr_of_given_textlines()` (not needed)
2026-08-03 09:22:32 +02:00 · 2026-06-02 21:20:06 +02:00 · 2026-06-02 21:20:06 +02:00 · 8ffc4ed8d3
commit 8ffc4ed8d3
parent a391ee24e6
3 changed files with 206 additions and 631 deletions
--- a/src/eynollah/eynollah_ocr.py
+++ b/src/eynollah/eynollah_ocr.py
@ -19,27 +19,29 @@ from ocrd_utils import polygon_from_points, xywh_from_polygon
 from .eynollah import Eynollah
 from .model_zoo import EynollahModelZoo
-from .utils import is_image_filename
+from .utils import (
    is_image_filename,
    batched,
    pairwise,
 )
 from .utils.font import get_font
 from .utils.xml import etree_namespace_for_element_tag
 from .utils.resize import resize_image
 from .utils.utils_ocr import (
    break_curved_line_into_small_pieces_and_then_merge,
    decode_batch_predictions,
    fit_text_single_line,
    get_contours_and_bounding_boxes,
    get_orientation_moments,
    preprocess_and_resize_image_for_ocrcnn_model,
    return_textlines_split_if_needed,
    rotate_image_with_padding,
    batched,
 )
 # TODO: refine typing
@dataclass
 class EynollahOcrResult:
    extracted_texts_merged: List
-    extracted_conf_value_merged: Optional[List]
+    extracted_confs_merged: Optional[List]
    cropped_lines_region_indexer: List
    total_bb_coordinates:List
@ -73,10 +75,8 @@ class Eynollah_ocr(Eynollah):
                                       device=device)
        else:
            self.model_zoo.load_models('ocr',
-                                       'num_to_char',
+                                       'binarization',
                                       'characters',
                                       device=device)
            self.end_character = len(self.model_zoo.get('characters')) + 2
    @property
    def device(self):
@ -95,8 +95,6 @@ class Eynollah_ocr(Eynollah):
        cropped_lines = []
        cropped_lines_region_indexer = []
        cropped_lines_meging_indexing = []
        extracted_texts = []
        extracted_confs = []
        for n_region, region in enumerate(page_tree.getroot().iter('{%s}TextRegion' % page_ns)):
            for n_line, line in enumerate(region.iter('{%s}TextLine' % page_ns)):
@ -139,7 +137,8 @@ class Eynollah_ocr(Eynollah):
                        cropped_lines.append(img_crop)
                        cropped_lines_meging_indexing.append(0)
-
+        extracted_texts = []
        extracted_confs = []
        self.logger.debug("processing %d lines for %d regions",
                          len(cropped_lines), len(set(cropped_lines_region_indexer)))
        for imgs in batched(cropped_lines, self.b_s):
@ -157,6 +156,10 @@ class Eynollah_ocr(Eynollah):
                conf = output.sequences_scores.exp().clamp(0.0, 1.0).tolist()
            else:
                conf = [1.0] * len(output.sequences)
            if conf < self.min_conf_value_of_textline_text:
                extracted_confs.extend(0)
                extracted_texts.extend("")
                continue
            text = self.model_zoo.get('trocr_processor').batch_decode(
                output.sequences,
                skip_special_tokens=True,
@ -179,7 +182,7 @@ class Eynollah_ocr(Eynollah):
        return EynollahOcrResult(
            extracted_texts_merged=extracted_texts_merged,
-            extracted_conf_value_merged=extracted_confs_merged,
+            extracted_confs_merged=extracted_confs_merged,
            cropped_lines_region_indexer=cropped_lines_region_indexer,
            total_bb_coordinates=total_bb_coordinates,
        )
@ -196,362 +199,163 @@ class Eynollah_ocr(Eynollah):
    ) -> EynollahOcrResult:
        total_bb_coordinates = []
-
+        cropped_lines_rgb = []
        cropped_lines = []
        img_crop_bin = None
        imgs_bin = None
        imgs_bin_ver_flipped = None
        cropped_lines_bin = []
        cropped_lines_ver_index = []
        cropped_lines_region_indexer = []
        cropped_lines_meging_indexing = []
-        indexer_text_region = 0
+        img_rgb = img # cosmetic
-        for nn in page_tree.getroot().iter(f'{{{page_ns}}}TextRegion'):
+        if img_bin is None:
-            try:
+            # run ad-hoc binarization
-                type_textregion = nn.attrib['type']
+            self.logger.info("running binarization for ensemble input")
-            except:
+            img_bin = self.do_prediction(True, img, self.model_zoo.get("binarization"),
-                type_textregion = 'paragraph'
+                                         n_batch_inference=5)
-            for child_textregion in nn:
+            img_bin = np.repeat(img_bin[:, :, np.newaxis], 3, axis=2)
-                if child_textregion.tag.endswith("TextLine"):
+            img_bin = 255 * (img_bin == 0).astype(np.uint8)
                    for child_textlines in child_textregion:
                        if child_textlines.tag.endswith("Coords"):
                            cropped_lines_region_indexer.append(indexer_text_region)
                            p_h=child_textlines.attrib['points'].split(' ')
                            textline_coords =  np.array( [ [int(x.split(',')[0]),
                                                            int(x.split(',')[1]) ]
                                                        for x in p_h] )
-                            x,y,w,h = cv2.boundingRect(textline_coords)
+        for n_region, region in enumerate(page_tree.getroot().iter('{%s}TextRegion' % page_ns)):
            type_textregion = region.attrib.get('type', 'paragraph')
            for n_line, line in enumerate(region.iter('{%s}TextLine' % page_ns)):
                cropped_lines_region_indexer.append(n_region)
-                            angle_radians = math.atan2(h, w)
+                coords = line.find('{%s}Coords' % page_ns)
-                            # Convert to degrees
+                if coords is None:
-                            angle_degrees = math.degrees(angle_radians)
+                    self.logger.warning("region '%s' line '%s' has no Coords", region.attrib['id'], line.attrib['id'])
-                            if type_textregion=='drop-capital':
+                    continue
-                                angle_degrees = 0
+                poly = np.array(polygon_from_points(coords.attrib['points'])).astype(int)
                cont = poly[:, np.newaxis]
                xywh = xywh_from_polygon(poly)
                x, y, w, h = xywh['x'], xywh['y'], xywh['w'], xywh['h']
-                            total_bb_coordinates.append([x,y,w,h])
+                angle_radians = math.atan2(h, w)
                angle_degrees = math.degrees(angle_radians)
                if type_textregion=='drop-capital':
                    angle_degrees = 0
-                            w_scaled = w *  image_height/float(h)
+                total_bb_coordinates.append([x, y, w, h])
-                            img_poly_on_img = np.copy(img)
+                w_scaled = w * image_height / float(h)
                            if img_bin:
                                img_poly_on_img_bin = np.copy(img_bin)
                                img_crop_bin = img_poly_on_img_bin[y:y+h, x:x+w, :]
-                            mask_poly = np.zeros(img.shape)
+                img_crop_rgb = img_rgb[y: y + h, x: x + w]
-                            mask_poly = cv2.fillPoly(mask_poly, pts=[textline_coords], color=(1, 1, 1))
+                img_crop_bin = img_bin[y: y + h, x: x + w]
                mask_poly = np.zeros(img_crop_rgb.shape[:2], dtype=np.uint8)
                mask_poly = cv2.fillPoly(mask_poly, pts=[cont - [x, y]], color=1)
-                            mask_poly = mask_poly[y:y+h, x:x+w, :]
+                if angle_degrees > 3:
-                            img_crop = img_poly_on_img[y:y+h, x:x+w, :]
+                    better_des_slope = get_orientation_moments(cont)
                    img_crop_rgb = rotate_image_with_padding(img_crop_rgb, better_des_slope)
                    img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope)
                    mask_poly = rotate_image_with_padding(mask_poly, better_des_slope)
                    # get new bounding box
                    x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly)
                    img_crop_rgb = img_crop_rgb[y_n: y_n + h_n, x_n: x_n + w_n]
                    img_crop_bin = img_crop_bin[y_n: y_n + h_n, x_n: x_n + w_n]
                    mask_poly = mask_poly[y_n: y_n + h_n, x_n: x_n + w_n]
                else:
                    better_des_slope = 0
-                            # print(file_name, angle_degrees, w*h,
+                if not self.do_not_mask_with_textline_contour:
-                            #       mask_poly[:,:,0].sum(),
+                    img_crop_rgb[mask_poly == 0] = 255 # FIXME: or median color?
-                            #       mask_poly[:,:,0].sum() /float(w*h) ,
+                    img_crop_bin[mask_poly == 0] = 255
                            #       'didi')
-                            if angle_degrees > 3:
+                if (type_textregion !='drop-capital' and
-                                better_des_slope = get_orientation_moments(textline_coords)
+                    mask_poly.sum() < 0.50 * mask_poly.size and
                    w_scaled > 90):
-                                img_crop = rotate_image_with_padding(img_crop, better_des_slope)
+                    img_crop_rgb, img_crop_bin = \
-                                if img_bin:
+                        break_curved_line_into_small_pieces_and_then_merge(
-                                    img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope)
+                            img_crop_rgb, img_crop_bin, mask_poly)
-                                mask_poly = rotate_image_with_padding(mask_poly, better_des_slope)
+                if w_scaled < 750:#1.5*image_width:
-                                mask_poly = mask_poly.astype('uint8')
+                    img_crop_split_rgb = img_crop_split_bin = None
                else:
                    img_crop_split_rgb, img_crop_split_bin = return_textlines_split_if_needed(
                        img_crop_rgb, img_crop_bin)
                if img_crop_split_rgb:
                    cropped_lines_rgb.extend(img_crop_split_rgb)
                    cropped_lines_bin.extend(img_crop_split_bin)
                    if abs(better_des_slope) > 45:
                        cropped_lines_ver_index.append(1)
                        cropped_lines_ver_index.append(1)
                    else:
                        cropped_lines_ver_index.append(0)
                        cropped_lines_ver_index.append(0)
                    cropped_lines_meging_indexing.append(1)
                    cropped_lines_meging_indexing.append(-1)
                else:
                    cropped_lines_rgb.append(img_crop_rgb)
                    cropped_lines_bin.append(img_crop_bin)
                    if abs(better_des_slope) > 45:
                        cropped_lines_ver_index.append(1)
                    else:
                        cropped_lines_ver_index.append(0)
                    cropped_lines_meging_indexing.append(0)
-                                #new bounding box
+        cropped_lines_rgb = [preprocess_and_resize_image_for_ocrcnn_model(img, image_height, image_width)
-                                x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0])
+                             for img in cropped_lines_rgb]
-                                
+        cropped_lines_bin = [preprocess_and_resize_image_for_ocrcnn_model(img, image_height, image_width)
-                                mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :]
+                             for img in cropped_lines_bin]
                                img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :]
                                if not self.do_not_mask_with_textline_contour:
                                    img_crop[mask_poly==0] = 255
                                if img_bin:
                                    img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :]
                                    if not self.do_not_mask_with_textline_contour:
                                        img_crop_bin[mask_poly==0] = 255
                                if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90:
                                    if img_bin:
                                        img_crop, img_crop_bin = \
                                            break_curved_line_into_small_pieces_and_then_merge(
                                                img_crop, mask_poly, img_crop_bin)
                                    else:
                                        img_crop, _ = \
                                            break_curved_line_into_small_pieces_and_then_merge(
                                                img_crop, mask_poly)
                            else:
                                better_des_slope = 0
                                if not self.do_not_mask_with_textline_contour:
                                    img_crop[mask_poly==0] = 255
                                if img_bin:
                                    if not self.do_not_mask_with_textline_contour:
                                        img_crop_bin[mask_poly==0] = 255
                                if type_textregion=='drop-capital':
                                    pass
                                else:
                                    if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90:
                                        if img_bin:
                                            img_crop, img_crop_bin = \
                                                break_curved_line_into_small_pieces_and_then_merge(
                                                    img_crop, mask_poly, img_crop_bin)
                                        else:
                                            img_crop, _ = \
                                                break_curved_line_into_small_pieces_and_then_merge(
                                                    img_crop, mask_poly)
                            if w_scaled < 750:#1.5*image_width:
                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                    img_crop, image_height, image_width)
                                cropped_lines.append(img_fin)
                                if abs(better_des_slope) > 45:
                                    cropped_lines_ver_index.append(1)
                                else:
                                    cropped_lines_ver_index.append(0)
                                cropped_lines_meging_indexing.append(0)
                                if img_bin:
                                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                        img_crop_bin, image_height, image_width)
                                    cropped_lines_bin.append(img_fin)
                            else:
                                splited_images, splited_images_bin = return_textlines_split_if_needed(
                                    img_crop, img_crop_bin if img_bin else None)
                                if splited_images:
                                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                        splited_images[0], image_height, image_width)
                                    cropped_lines.append(img_fin)
                                    cropped_lines_meging_indexing.append(1)
                                    if abs(better_des_slope) > 45:
                                        cropped_lines_ver_index.append(1)
                                    else:
                                        cropped_lines_ver_index.append(0)
                                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                        splited_images[1], image_height, image_width)
                                    cropped_lines.append(img_fin)
                                    cropped_lines_meging_indexing.append(-1)
                                    if abs(better_des_slope) > 45:
                                        cropped_lines_ver_index.append(1)
                                    else:
                                        cropped_lines_ver_index.append(0)
                                    if img_bin:
                                        img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                            splited_images_bin[0], image_height, image_width)
                                        cropped_lines_bin.append(img_fin)
                                        img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                            splited_images_bin[1], image_height, image_width)
                                        cropped_lines_bin.append(img_fin)
                                else:
                                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                        img_crop, image_height, image_width)
                                    cropped_lines.append(img_fin)
                                    cropped_lines_meging_indexing.append(0)
                                    if abs(better_des_slope) > 45:
                                        cropped_lines_ver_index.append(1)
                                    else:
                                        cropped_lines_ver_index.append(0)
                                    if img_bin:
                                        img_fin = preprocess_and_resize_image_for_ocrcnn_model(
                                            img_crop_bin, image_height, image_width)
                                        cropped_lines_bin.append(img_fin)
            indexer_text_region = indexer_text_region +1
        extracted_texts = []
-        extracted_conf_value = []
+        extracted_confs = []
        self.logger.debug("processing %d lines for %d regions",
                          len(cropped_lines_rgb), len(set(cropped_lines_region_indexer)))
        cropped_lines = zip(cropped_lines_rgb, cropped_lines_bin, cropped_lines_ver_index)
        for batch in batched(cropped_lines, self.b_s):
            imgs_rgb, imgs_bin, ver_index = zip(*batch)
            ver_index = np.array(ver_index)
            imgs_rgb = np.stack(imgs_rgb)
            imgs_bin = np.stack(imgs_bin)
            imgs_rgb_ver = imgs_rgb[ver_index > 0, ::-1, ::-1]
            imgs_bin_ver = imgs_bin[ver_index > 0, ::-1, ::-1]
-        n_iterations  = math.ceil(len(cropped_lines) / self.b_s) 
+            # inference model now yields (char-bytes, line-prob) instead of vocidx-softmax
            # (so ctc_decode and inverse StringLookup are included)
            # also, the model now expects a secondary binary input image
            preds, probs = self.model_zoo.get('ocr').predict((imgs_rgb, imgs_bin), verbose=0)
-        # FIXME: copy pasta
+            if ver_index.any():
-        for i in range(n_iterations):
+                preds_ver, probs_ver = self.model_zoo.get('ocr').predict((imgs_rgb_ver, imgs_bin_ver), verbose=0)
-            if i==(n_iterations-1):
+                flipped_ver_is_better = np.flatnonzero(probs_ver > probs[ver_index > 0])
-                n_start = i*self.b_s
+                if len(flipped_ver_is_better):
-                imgs = cropped_lines[n_start:]
+                    self.logger.info("%d skewed lines perform better when flipped", len(flipped_ver_is_better))
-                imgs = np.array(imgs)
+                    preds[ver_index > 0][flipped_ver_is_better] = preds_ver[flipped_ver_is_better]
-                imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3)
+                    probs[ver_index > 0][flipped_ver_is_better] = probs_ver[flipped_ver_is_better]
-                ver_imgs = np.array( cropped_lines_ver_index[n_start:] )
+            def nooov(x):
-                indices_ver = np.where(ver_imgs == 1)[0]
+                return x != b'[UNK]'
-                
+            for pred, prob in zip(preds, probs):
-                #print(indices_ver, 'indices_ver')
+                if prob < self.min_conf_value_of_textline_text:
                if len(indices_ver)>0:
                    imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
                    imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
                    #print(imgs_ver_flipped, 'imgs_ver_flipped')
                else:
                    imgs_ver_flipped = None
                if img_bin:
                    imgs_bin = cropped_lines_bin[n_start:]
                    imgs_bin = np.array(imgs_bin)
                    imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3)
                    if len(indices_ver)>0:
                        imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
                        imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
                        #print(imgs_ver_flipped, 'imgs_ver_flipped')
                    else:
                        imgs_bin_ver_flipped = None
            else:
                n_start = i*self.b_s
                n_end = (i+1)*self.b_s
                imgs = cropped_lines[n_start:n_end]
                imgs = np.array(imgs).reshape(self.b_s, image_height, image_width, 3)
                ver_imgs = np.array( cropped_lines_ver_index[n_start:n_end] )
                indices_ver = np.where(ver_imgs == 1)[0]
                #print(indices_ver, 'indices_ver')
                if len(indices_ver)>0:
                    imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
                    imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
                    #print(imgs_ver_flipped, 'imgs_ver_flipped')
                else:
                    imgs_ver_flipped = None
                if img_bin:
                    imgs_bin = cropped_lines_bin[n_start:n_end]
                    imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3)
                    if len(indices_ver)>0:
                        imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
                        imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
                        #print(imgs_ver_flipped, 'imgs_ver_flipped')
                    else:
                        imgs_bin_ver_flipped = None
            self.logger.debug("processing next %d lines", len(imgs))
            preds = self.model_zoo.get('ocr').predict(imgs, verbose=0)
            if len(indices_ver)>0:
                preds_flipped = self.model_zoo.get('ocr').predict(imgs_ver_flipped, verbose=0)
                preds_max_fliped = np.max(preds_flipped, axis=2 )
                preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
                pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character
                masked_means_flipped = \
                    np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / \
                    np.sum(pred_max_not_unk_mask_bool_flipped, axis=1)
                masked_means_flipped[np.isnan(masked_means_flipped)] = 0
                preds_max = np.max(preds, axis=2 )
                preds_max_args = np.argmax(preds, axis=2 )
                pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
                masked_means = \
                    np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
                    np.sum(pred_max_not_unk_mask_bool, axis=1)
                masked_means[np.isnan(masked_means)] = 0
                masked_means_ver = masked_means[indices_ver]
                #print(masked_means_ver, 'pred_max_not_unk')
                indices_where_flipped_conf_value_is_higher = \
                    np.where(masked_means_flipped > masked_means_ver)[0]
                #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
                if len(indices_where_flipped_conf_value_is_higher)>0:
                    indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
                    preds[indices_to_be_replaced,:,:] = \
                        preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
            if img_bin:
                preds_bin = self.model_zoo.get('ocr').predict(imgs_bin, verbose=0)
                if len(indices_ver)>0:
                    preds_flipped = self.model_zoo.get('ocr').predict(imgs_bin_ver_flipped, verbose=0)
                    preds_max_fliped = np.max(preds_flipped, axis=2 )
                    preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
                    pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character
                    masked_means_flipped = \
                        np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / \
                        np.sum(pred_max_not_unk_mask_bool_flipped, axis=1)
                    masked_means_flipped[np.isnan(masked_means_flipped)] = 0
                    preds_max = np.max(preds, axis=2 )
                    preds_max_args = np.argmax(preds, axis=2 )
                    pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
                    masked_means = \
                        np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
                        np.sum(pred_max_not_unk_mask_bool, axis=1)
                    masked_means[np.isnan(masked_means)] = 0
                    masked_means_ver = masked_means[indices_ver]
                    #print(masked_means_ver, 'pred_max_not_unk')
                    indices_where_flipped_conf_value_is_higher = \
                        np.where(masked_means_flipped > masked_means_ver)[0]
                    #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
                    if len(indices_where_flipped_conf_value_is_higher)>0:
                        indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
                        preds_bin[indices_to_be_replaced,:,:] = \
                            preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
                preds = (preds + preds_bin) / 2.
            pred_texts = decode_batch_predictions(preds, self.model_zoo.get('num_to_char'))
            preds_max = np.max(preds, axis=2 )
            preds_max_args = np.argmax(preds, axis=2 )
            pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
            masked_means = \
                np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
                np.sum(pred_max_not_unk_mask_bool, axis=1)
            for ib in range(imgs.shape[0]):
                pred_texts_ib = pred_texts[ib].replace("[UNK]", "")
                if masked_means[ib] >= self.min_conf_value_of_textline_text:
                    extracted_texts.append(pred_texts_ib)
                    extracted_conf_value.append(masked_means[ib])
                else:
                    extracted_texts.append("")
-                    extracted_conf_value.append(0)
+                    extracted_confs.append(0)
-        del cropped_lines
+                else:
                    text = b''.join(
                        filter(nooov,
                               map(bytes,
                                   (filter(None, char)
                                    for char in pred.tolist())))).decode('utf-8')
                    extracted_texts.append(text)
                    extracted_confs.append(prob)
        del cropped_lines_rgb
        del cropped_lines_bin
        gc.collect()
        extracted_texts_merged = [extracted_texts[ind]
-                                    if cropped_lines_meging_indexing[ind]==0
+                                  if cropped_lines_meging_indexing[ind] == 0
-                                    else extracted_texts[ind]+" "+extracted_texts[ind+1]
+                                  else extracted_texts[ind] + " " + extracted_texts[ind + 1]
-                                    if cropped_lines_meging_indexing[ind]==1
+                                  for ind in range(len(cropped_lines_meging_indexing))
-                                    else None
+                                  if cropped_lines_meging_indexing[ind] >= 0]
-                                    for ind in range(len(cropped_lines_meging_indexing))]
+        extracted_confs_merged = [extracted_confs[ind]
-        
+                                  if cropped_lines_meging_indexing[ind] == 0
-        extracted_conf_value_merged = [extracted_conf_value[ind]  # type: ignore
+                                  else 0.5 * (extracted_confs[ind] + extracted_confs[ind + 1])
-                                        if cropped_lines_meging_indexing[ind]==0
+                                  for ind in range(len(cropped_lines_meging_indexing))
-                                        else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2.
+                                  if cropped_lines_meging_indexing[ind] >= 0]
                                        if cropped_lines_meging_indexing[ind]==1
                                        else None
                                        for ind in range(len(cropped_lines_meging_indexing))]
        extracted_conf_value_merged: List[float] = [extracted_conf_value_merged[ind_cfm]
                                        for ind_cfm in range(len(extracted_texts_merged))
                                        if extracted_texts_merged[ind_cfm] is not None]
        extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
        return EynollahOcrResult(
            extracted_texts_merged=extracted_texts_merged,
-            extracted_conf_value_merged=extracted_conf_value_merged,
+            extracted_confs_merged=extracted_confs_merged,
            cropped_lines_region_indexer=cropped_lines_region_indexer,
            total_bb_coordinates=total_bb_coordinates,
        )
@ -569,7 +373,7 @@ class Eynollah_ocr(Eynollah):
        cropped_lines_region_indexer = result.cropped_lines_region_indexer
        total_bb_coordinates = result.total_bb_coordinates
        extracted_texts_merged = result.extracted_texts_merged
-        extracted_conf_value_merged = result.extracted_conf_value_merged
+        extracted_confs_merged = result.extracted_confs_merged
        unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
        if out_image_with_text:
@ -646,8 +450,8 @@ class Eynollah_ocr(Eynollah):
                    if not is_textline_text:
                        text_subelement = ET.SubElement(child_textregion, 'TextEquiv')
-                        if extracted_conf_value_merged:
+                        if extracted_confs_merged:
-                            text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}")
+                            text_subelement.set('conf', f"{extracted_confs_merged[indexer]:.2f}")
                        unicode_textline = ET.SubElement(text_subelement, 'Unicode')
                        unicode_textline.text = extracted_texts_merged[indexer]
                    else:
@ -655,8 +459,8 @@ class Eynollah_ocr(Eynollah):
                            if childtest3.tag.endswith("TextEquiv"):
                                for child_uc in childtest3:
                                    if child_uc.tag.endswith("Unicode"):
-                                        if extracted_conf_value_merged:
+                                        if extracted_confs_merged:
-                                            childtest3.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}")
+                                            childtest3.set('conf', f"{extracted_confs_merged[indexer]:.2f}")
                                        child_uc.text = extracted_texts_merged[indexer]
                    indexer = indexer + 1
--- a/src/eynollah/utils/init.py
+++ b/src/eynollah/utils/init.py
@ -2,6 +2,7 @@ from typing import Iterable, List, Tuple
 from logging import getLogger
 import time
 import math
 from itertools import islice
 try:
    import matplotlib.pyplot as plt
@ -33,6 +34,11 @@ def pairwise(iterable):
        yield a, b
        a = b
 def batched(iterable, n):
    """Yield consecutive tuples of at most ``n`` items drawn from ``iterable``.

    The last tuple may hold fewer than ``n`` items. Iteration stops as soon
    as an empty tuple would be produced, so an empty input yields nothing.
    """
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            return
        yield chunk
 def return_multicol_separators_x_start_end(
        regions_without_separators, peak_points, top, bot,
        x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some):
--- a/src/eynollah/utils/utils_ocr.py
+++ b/src/eynollah/utils/utils_ocr.py
@ -1,6 +1,5 @@
 import math
 import copy
 from itertools import islice
 import numpy as np
 import cv2
@ -11,6 +10,7 @@ from scipy.signal import find_peaks
 from scipy.ndimage import gaussian_filter1d
 from PIL import Image, ImageDraw, ImageFont
 from . import pairwise
 from .resize import resize_image
@ -42,45 +42,6 @@ def decode_batch_predictions(pred, num_to_char, max_len = 128):
        output.append(d)
    return output
 def distortion_free_resize(image, img_size):
    """Resize ``image`` into a ``(w, h)`` frame without distorting it, then rotate it.

    The image is resized with its aspect ratio preserved so that it fits into
    ``h`` rows by ``w`` columns, padded on both spatial axes up to exactly
    that size (the extra pixel goes to the top/left when the padding is odd),
    and finally has its first two axes swapped and is flipped left-right.

    Args:
        image: rank-3 image tensor (the paddings and the transpose permutation
            both address three axes).
        img_size: pair ``(w, h)`` giving the target width and height.

    Returns:
        The padded, transposed and flipped tensor.
    """
    # Imported lazily so that TensorFlow is only loaded when this function is called.
    import tensorflow as tf
    w, h = img_size
    image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True)
    # Check the amount of padding that is needed on each axis.
    pad_height = h - tf.shape(image)[0]
    pad_width = w - tf.shape(image)[1]
    # Only necessary if you want to do same amount of padding on both sides.
    # NOTE(review): these branch on tensor values with a Python `if`;
    # presumably this is only ever run eagerly - confirm.
    if pad_height % 2 != 0:
        height = pad_height // 2
        pad_height_top = height + 1
        pad_height_bottom = height
    else:
        pad_height_top = pad_height_bottom = pad_height // 2
    if pad_width % 2 != 0:
        width = pad_width // 2
        pad_width_left = width + 1
        pad_width_right = width
    else:
        pad_width_left = pad_width_right = pad_width // 2
    # Pad rows and columns only; the last (channel) axis is left untouched.
    image = tf.pad(
        image,
        paddings=[
            [pad_height_top, pad_height_bottom],
            [pad_width_left, pad_width_right],
            [0, 0],
        ],
    )
    # Swap the row and column axes, then mirror the result horizontally.
    image = tf.transpose(image, (1, 0, 2))
    image = tf.image.flip_left_right(image)
    return image
 def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image):
    width = np.shape(textline_image)[1]
    height = np.shape(textline_image)[0]
@ -263,254 +224,58 @@ def return_splitting_point_of_image(image_to_spliited):
    return np.sort(peaks_sort_4)
-def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, img_bin_curved=None):
+def break_curved_line_into_small_pieces_and_then_merge(img_rgb_curved, img_bin_curved, mask_curved):
-    peaks_4 = return_splitting_point_of_image(img_curved)
+    peaks_4 = return_splitting_point_of_image(img_rgb_curved)
-    if len(peaks_4)>0:
+    if len(peaks_4):
        imgs_tot = []
-        
+        for left, right in pairwise([None] + peaks_4 + [None]):
-        for ind in range(len(peaks_4)+1):
+            img_rgb = img_rgb_curved[:, left: right]
-            if ind==0:
+            img_bin = img_bin_curved[:, left: right]
-                img = img_curved[:, :peaks_4[ind], :]
+            mask = mask_curved[:, left: right]
                if img_bin_curved is not None:
                    img_bin = img_bin_curved[:, :peaks_4[ind], :]
                mask = mask_curved[:, :peaks_4[ind], :]
            elif ind==len(peaks_4):
                img = img_curved[:, peaks_4[ind-1]:, :]
                if img_bin_curved is not None:
                    img_bin = img_bin_curved[:, peaks_4[ind-1]:, :]
                mask = mask_curved[:, peaks_4[ind-1]:, :]
            else:
                img = img_curved[:, peaks_4[ind-1]:peaks_4[ind], :]
                if img_bin_curved is not None:
                    img_bin = img_bin_curved[:, peaks_4[ind-1]:peaks_4[ind], :]
                mask = mask_curved[:, peaks_4[ind-1]:peaks_4[ind], :]
            or_ma = get_orientation_moments_of_mask(mask)
-            
+            imgs_tot.append([img_rgb, img_bin, mask, or_ma])
            if img_bin_curved is not None:
                imgs_tot.append([img, mask, or_ma, img_bin] )
            else:
                imgs_tot.append([img, mask, or_ma] )
        w_tot_des_list = []
-        w_tot_des = 0
+        imgs_rgb_deskewed_list = []
        imgs_deskewed_list = []
        imgs_bin_deskewed_list = []
-        for ind in range(len(imgs_tot)):
+        for img_rgb_in, img_bin_in, mask_in, ori_in in imgs_tot:
-            img_in = imgs_tot[ind][0]
+            if abs(ori_in) < 45:
-            mask_in = imgs_tot[ind][1]
+                img_rgb_in_des = rotate_image_with_padding(img_rgb_in, ori_in, border_value=(255,255,255) )
-            ori_in = imgs_tot[ind][2]
+                img_bin_in_des = rotate_image_with_padding(img_bin_in, ori_in, border_value=(255,255,255) )
            if img_bin_curved is not None:
                img_bin_in = imgs_tot[ind][3]
            if abs(ori_in)<45:
                img_in_des = rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) )
                if img_bin_curved is not None:
                    img_bin_in_des = rotate_image_with_padding(img_bin_in, ori_in, border_value=(255,255,255) )
                mask_in_des = rotate_image_with_padding(mask_in, ori_in)
-                mask_in_des = mask_in_des.astype('uint8')
+                # get new bounding box
-                
+                x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_in_des)
-                #new bounding box
+                if w_n and h_n:
-                x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_in_des[:,:,0])
+                    img_rgb_in_des = img_rgb_in_des[y_n: y_n + h_n, x_n: x_n + w_n]
-                
+                    img_bin_in_des = img_bin_in_des[y_n: y_n + h_n, x_n: x_n + w_n]
                if w_n==0 or h_n==0:
                    img_in_des = np.copy(img_in)
                    if img_bin_curved is not None:
                        img_bin_in_des = np.copy(img_bin_in)
                    w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) )
                    if w_relative==0:
                        w_relative = img_in_des.shape[1]
                    img_in_des = resize_image(img_in_des, 32, w_relative)
                    if img_bin_curved is not None:
                        img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative)
                else:
-                    mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :]
+                    img_rgb_in_des = np.copy(img_rgb_in)
                    img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :]
                    if img_bin_curved is not None:
                        img_bin_in_des = img_bin_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :]
                    w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) )
                    if w_relative==0:
                        w_relative = img_in_des.shape[1]
                    img_in_des = resize_image(img_in_des, 32, w_relative)
                    if img_bin_curved is not None:
                        img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative)
            else:
                img_in_des = np.copy(img_in)
                if img_bin_curved is not None:
                    img_bin_in_des = np.copy(img_bin_in)
-                w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) )
+            else:
-                if w_relative==0:
+                img_rgb_in_des = np.copy(img_rgb_in)
-                    w_relative = img_in_des.shape[1]
+                img_bin_in_des = np.copy(img_bin_in)
                img_in_des = resize_image(img_in_des, 32, w_relative)
                if img_bin_curved is not None:
                    img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative)
-            w_tot_des+=img_in_des.shape[1]
+            h, w = img_rgb_in_des.shape[:2]
-            w_tot_des_list.append(img_in_des.shape[1])
+            new_h = 32
-            imgs_deskewed_list.append(img_in_des)
+            new_w = 32 * w // h
-            if img_bin_curved is not None:
+            new_w = new_w or w
-                imgs_bin_deskewed_list.append(img_bin_in_des)
+            img_rgb_in_des = resize_image(img_rgb_in_des, new_h, new_w)
            img_bin_in_des = resize_image(img_bin_in_des, new_h, new_w)
            w_tot_des_list.append(new_w)
            imgs_rgb_deskewed_list.append(img_rgb_in_des)
            imgs_bin_deskewed_list.append(img_bin_in_des)
-            
+        img_rgb_final_deskewed = np.ones((new_h, sum(w_tot_des_list), 3)) * 255
-
+        img_bin_final_deskewed = np.ones((new_h, sum(w_tot_des_list), 3)) * 255
        img_final_deskewed = np.zeros((32, w_tot_des, 3))+255
        if img_bin_curved is not None:
            img_bin_final_deskewed = np.zeros((32, w_tot_des, 3))+255
        else:
            img_bin_final_deskewed = None
        w_indexer = 0
        for ind in range(len(w_tot_des_list)):
-            img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:]
+            w_indexer2 = w_indexer + w_tot_des_list[ind]
-            if img_bin_curved is not None:
+            img_rgb_final_deskewed[:, w_indexer: w_indexer2] = imgs_rgb_deskewed_list[ind]
-                img_bin_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_bin_deskewed_list[ind][:,:,:]
+            img_bin_final_deskewed[:, w_indexer: w_indexer2] = imgs_bin_deskewed_list[ind]
-            w_indexer = w_indexer+w_tot_des_list[ind]
+            w_indexer = w_indexer2
-        return img_final_deskewed, img_bin_final_deskewed
+        return img_rgb_final_deskewed, img_bin_final_deskewed
    else:
-        return img_curved, img_bin_curved
+        return img_rgb_curved, img_bin_curved
 def return_textline_contour_with_added_box_coordinate(textline_contour,  box_ind):
    """Shift a contour in place by the offsets of its box and return it.

    ``box_ind[2]`` is added to component 0 and ``box_ind[0]`` to component 1
    of the last axis of ``textline_contour`` (presumably the x and y
    coordinates - confirm against the callers). The input array itself is
    modified and returned; no copy is made.
    """
    # (component of the last contour axis, position of its offset in box_ind)
    for component, box_pos in ((0, 2), (1, 0)):
        textline_contour[:, :, component] += box_ind[box_pos]
    return textline_contour
 def return_rnn_cnn_ocr_of_given_textlines(image,
                                          all_found_textline_polygons,
                                          all_box_coord,
                                          prediction_model,
                                          b_s_ocr, num_to_char,
                                          curved_line=False):
    """Recognise the text of every text line and return it grouped by region.

    Each text line polygon is cropped from ``image``, pixels outside the
    polygon are set to 255, and the crop is preprocessed to a fixed
    32 x 512 input. Lines that would be 640 pixels or wider at that height
    are handed to ``return_textlines_split_if_needed``; if it returns pieces,
    the first two are recognised separately and their texts are joined with
    a single space. All crops are run through ``prediction_model`` in batches
    of ``b_s_ocr`` and decoded with ``decode_batch_predictions``.

    Args:
        image: page image, indexed as ``[row, column, channel]``.
        all_found_textline_polygons: one sequence of text line polygons per
            text region; an empty sequence gets a placeholder crop.
        all_box_coord: per-region box coordinates; entries ``[2]`` and ``[0]``
            are added to the polygon coordinates when ``curved_line`` is false.
        prediction_model: object with a ``predict(imgs, verbose=0)`` method.
        b_s_ocr: batch size used for prediction.
        num_to_char: passed through to ``decode_batch_predictions``.
        curved_line: if true, polygons are used as given (no box offset is
            added and negative coordinates are not clamped).

    Returns:
        A list with one entry per region index, each a list of the
        recognised texts of that region's lines (with ``"[UNK]"`` removed).
    """
    max_len = 512  # not referenced again in this function
    padding_token = 299  # not referenced again in this function
    image_width = 512#max_len * 4
    image_height = 32
    ind_tot = 0  # not referenced again in this function
    #cv2.imwrite('./img_out.png', image_page)
    ocr_all_textlines = []
    # One entry per text line (not per crop): the index of the region it belongs to.
    cropped_lines_region_indexer = []
    # One entry per crop: 0 = whole line, 1 = first half of a split line,
    # -1 = second half (merged into the preceding entry below).
    cropped_lines_meging_indexing = []
    cropped_lines = []
    indexer_text_region = 0
    for indexing, ind_poly_first in enumerate(all_found_textline_polygons):
        #ocr_textline_in_textregion = []
        if len(ind_poly_first)==0:
            # Region without text lines: feed a constant placeholder image so
            # that the region still gets one (predicted) text entry.
            cropped_lines_region_indexer.append(indexer_text_region)
            cropped_lines_meging_indexing.append(0)
            img_fin = np.ones((image_height, image_width, 3))*1
            cropped_lines.append(img_fin)
        else:
            for indexing2, ind_poly in enumerate(ind_poly_first):
                cropped_lines_region_indexer.append(indexer_text_region)
                if not curved_line:
                    # Work on a copy because the helper shifts the contour in place.
                    ind_poly = copy.deepcopy(ind_poly)
                    box_ind = all_box_coord[indexing]
                    ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind)
                    #print(ind_poly_copy)
                    ind_poly[ind_poly<0] = 0
                x, y, w, h = cv2.boundingRect(ind_poly)
                # Width the line would have once scaled to the model input height.
                w_scaled = w *  image_height/float(h)
                mask_poly = np.zeros(image.shape)
                # Copy the page so that whitening the crop below does not alter `image`.
                img_poly_on_img = np.copy(image)
                mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1))
                mask_poly = mask_poly[y:y+h, x:x+w, :]
                img_crop = img_poly_on_img[y:y+h, x:x+w, :]
                # Set everything outside the polygon to 255.
                img_crop[mask_poly==0] = 255
                if w_scaled < 640:#1.5*image_width:
                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width)
                    cropped_lines.append(img_fin)
                    cropped_lines_meging_indexing.append(0)
                else:
                    # Long line: try to split it.
                    # NOTE(review): presumably the helper returns two pieces, or
                    # something falsy if no split is possible - confirm.
                    splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None)
                    if splited_images:
                        img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0],
                                                                               image_height,
                                                                               image_width)
                        cropped_lines.append(img_fin)
                        cropped_lines_meging_indexing.append(1)
                        img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1],
                                                                               image_height,
                                                                               image_width)
                        cropped_lines.append(img_fin)
                        cropped_lines_meging_indexing.append(-1)
                    else:
                        img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop,
                                                                               image_height,
                                                                               image_width)
                        cropped_lines.append(img_fin)
                        cropped_lines_meging_indexing.append(0)
        indexer_text_region+=1
    extracted_texts = []
    n_iterations  = math.ceil(len(cropped_lines) / b_s_ocr) 
    for i in range(n_iterations):
        if i==(n_iterations-1):
            # Last batch: may hold fewer than b_s_ocr crops.
            n_start = i*b_s_ocr
            imgs = cropped_lines[n_start:]
            imgs = np.array(imgs)
            imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3)
        else:
            n_start = i*b_s_ocr
            n_end = (i+1)*b_s_ocr
            imgs = cropped_lines[n_start:n_end]
            imgs = np.array(imgs).reshape(b_s_ocr, image_height, image_width, 3)
        preds = prediction_model.predict(imgs, verbose=0)
        pred_texts = decode_batch_predictions(preds, num_to_char)
        for ib in range(imgs.shape[0]):
            pred_texts_ib = pred_texts[ib].replace("[UNK]", "")
            extracted_texts.append(pred_texts_ib)
    # Re-join split lines: a first half (1) takes the text of the following
    # crop, separated by a space; the second half (-1) becomes None and is
    # dropped right after.
    extracted_texts_merged = [extracted_texts[ind]
                              if cropped_lines_meging_indexing[ind]==0
                              else extracted_texts[ind]+" "+extracted_texts[ind+1]
                              if cropped_lines_meging_indexing[ind]==1
                              else None
                              for ind in range(len(cropped_lines_meging_indexing))]
    extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
    unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
    ocr_all_textlines = []
    # After merging there is one text per line, aligned with
    # cropped_lines_region_indexer, so a boolean mask selects each region's lines.
    for ind in unique_cropped_lines_region_indexer:
        ocr_textline_in_textregion = []
        extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind]
        for  it_ind, text_textline in enumerate(extracted_texts_merged_un):
            ocr_textline_in_textregion.append(text_textline)
        ocr_all_textlines.append(ocr_textline_in_textregion)
    return ocr_all_textlines
 def batched(iterable, n):
    """Split ``iterable`` into successive tuples holding up to ``n`` items each.

    Only the final tuple can be shorter than ``n``. The generator ends when
    the underlying iterator has nothing left, so empty input gives no output.
    """
    remaining = iter(iterable)
    while True:
        group = tuple(islice(remaining, n))
        if not group:
            return
        yield group