Add a string alignment function and implement the changes needed for prediction with both bin and rgb inputs

2025-10-27 15:54:13 +01:00 · 2025-05-25 21:44:36 +02:00 · 2025-05-25 21:44:36 +02:00 · 0f154c605a
commit 0f154c605a
parent 097520bfd2
3 changed files with 107 additions and 19 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -6,3 +6,4 @@ tensorflow < 2.13
 numba <= 0.58.1
 scikit-image
 loky
 biopython
--- a/src/eynollah/eynollah.py
+++ b/src/eynollah/eynollah.py
@ -5647,6 +5647,10 @@ class Eynollah_ocr:
                                            better_des_slope = get_orientation_moments(textline_coords)
                                            img_crop = rotate_image_with_padding(img_crop, better_des_slope )
                                            if self.prediction_with_both_of_rgb_and_bin:
                                                img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope )
                                            mask_poly = rotate_image_with_padding(mask_poly, better_des_slope )
                                            mask_poly = mask_poly.astype('uint8')
@ -5658,24 +5662,33 @@ class Eynollah_ocr:
                                            img_crop[mask_poly==0] = 255
-                                            if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 100:
+                                            if self.prediction_with_both_of_rgb_and_bin:
-                                                img_crop = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly)
+                                                img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :]
                                                img_crop_bin[mask_poly==0] = 255
                                            if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 100:
                                                if self.prediction_with_both_of_rgb_and_bin:
                                                    img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin)
                                                else:
                                                    img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly)
                                            #print(file_name,w_n*h_n , mask_poly[:,:,0].sum(),  mask_poly[:,:,0].sum() /float(w_n*h_n) , 'ikiiiiii')
                                        else:
                                            img_crop[mask_poly==0] = 255
                                            if self.prediction_with_both_of_rgb_and_bin:
                                                img_crop_bin[mask_poly==0] = 255
                                            if type_textregion=='drop-capital':
                                                pass
                                            else:
                                                if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 100:
-                                                    img_crop = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly)
+                                                    if self.prediction_with_both_of_rgb_and_bin:
                                                        img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin)
                                                    else:
                                                        img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly)
                                        if self.prediction_with_both_of_rgb_and_bin:
                                            img_crop_bin[mask_poly==0] = 255
                                    if not self.export_textline_images_and_text:
                                        if w_scaled < 640:#1.5*image_width:
                                            img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width)
@ -5796,6 +5809,14 @@ class Eynollah_ocr:
                                imgs_bin = cropped_lines_bin[n_start:]
                                imgs_bin = np.array(imgs_bin)
                                imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3)
                                if len(indices_ver)>0:
                                    imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
                                    imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
                                    #print(imgs_ver_flipped, 'imgs_ver_flipped')
                                else:
                                    imgs_bin_ver_flipped = None
                        else:
                            n_start = i*self.b_s
                            n_end = (i+1)*self.b_s
@ -5819,20 +5840,23 @@ class Eynollah_ocr:
                                imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3)
                                if len(indices_ver)>0:
                                    imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
                                    imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
                                    #print(imgs_ver_flipped, 'imgs_ver_flipped')
                                else:
                                    imgs_bin_ver_flipped = None
                        preds = self.prediction_model.predict(imgs, verbose=0)
                        if len(indices_ver)>0:
                            #cv2.imwrite('flipped.png', (imgs_ver_flipped[0, :,:,:]*255).astype('uint8'))
                            #cv2.imwrite('original.png', (imgs[0, :,:,:]*255).astype('uint8'))
                            #sys.exit()
                            #print(imgs_ver_flipped.shape, 'imgs_ver_flipped.shape')
                            preds_flipped = self.prediction_model.predict(imgs_ver_flipped, verbose=0)
                            preds_max_fliped = np.max(preds_flipped, axis=2 )
                            preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
                            pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=256
                            masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1)
                            masked_means_flipped[np.isnan(masked_means_flipped)] = 0
                            #print(masked_means_flipped, 'masked_means_flipped')
                            preds_max = np.max(preds, axis=2 )
                            preds_max_args = np.argmax(preds, axis=2 )
@ -5852,6 +5876,32 @@ class Eynollah_ocr:
                                preds[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
                        if self.prediction_with_both_of_rgb_and_bin:
                            preds_bin = self.prediction_model.predict(imgs_bin, verbose=0)
                            if len(indices_ver)>0:
                                preds_flipped = self.prediction_model.predict(imgs_bin_ver_flipped, verbose=0)
                                preds_max_fliped = np.max(preds_flipped, axis=2 )
                                preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
                                pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=256
                                masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1)
                                masked_means_flipped[np.isnan(masked_means_flipped)] = 0
                                preds_max = np.max(preds, axis=2 )
                                preds_max_args = np.argmax(preds, axis=2 )
                                pred_max_not_unk_mask_bool = preds_max_args[:,:]!=256
                                masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1)
                                masked_means[np.isnan(masked_means)] = 0
                                masked_means_ver = masked_means[indices_ver]
                                #print(masked_means_ver, 'pred_max_not_unk')
                                indices_where_flipped_conf_value_is_higher = np.where(masked_means_flipped > masked_means_ver)[0]
                                #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
                                if len(indices_where_flipped_conf_value_is_higher)>0:
                                    indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
                                    preds_bin[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
                            preds = (preds + preds_bin) / 2.
                        pred_texts = decode_batch_predictions(preds, self.num_to_char)
--- a/src/eynollah/utils/utils_ocr.py
+++ b/src/eynollah/utils/utils_ocr.py
@ -5,6 +5,7 @@ from scipy.signal import find_peaks
 from scipy.ndimage import gaussian_filter1d
 import math
 from PIL import Image, ImageDraw, ImageFont
 from Bio import pairwise2
 from .resize import resize_image
 def decode_batch_predictions(pred, num_to_char, max_len = 128):
@ -252,7 +253,7 @@ def return_splitting_point_of_image(image_to_spliited):
    return np.sort(peaks_sort_4)
-def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved):
+def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, img_bin_curved=None):
    peaks_4 = return_splitting_point_of_image(img_curved)
    if len(peaks_4)>0:
        imgs_tot = []
@ -260,29 +261,44 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved):
        # Cut the line image (and its mask / optional second image) into
        # len(peaks_4)+1 vertical pieces at the splitting columns in peaks_4.
        for ind in range(len(peaks_4)+1):
            # Column range of this piece: None means "from the left edge" for
            # the first piece and "to the right edge" for the last one.
            x_start = peaks_4[ind-1] if ind > 0 else None
            x_end = peaks_4[ind] if ind < len(peaks_4) else None
            img = img_curved[:, x_start:x_end, :]
            mask = mask_curved[:, x_start:x_end, :]
            or_ma = get_orientation_moments_of_mask(mask)
            # Test against None explicitly: the truth value of a multi-element
            # NumPy array is ambiguous and `if img_bin_curved:` raises ValueError.
            # The slice must also read the `img_bin_curved` parameter (the
            # previously used name `img_curved_bin` is not defined here).
            if img_bin_curved is not None:
                img_bin = img_bin_curved[:, x_start:x_end, :]
                imgs_tot.append([img, mask, or_ma, img_bin] )
            else:
                imgs_tot.append([img, mask, or_ma] )
        # Per-piece widths, their running total, and the processed pieces
        # collected by the loop below.
        w_tot_des_list = []
        w_tot_des = 0
        imgs_deskewed_list = []
        imgs_bin_deskewed_list = []
        for ind in range(len(imgs_tot)):
            img_in = imgs_tot[ind][0]
            mask_in = imgs_tot[ind][1]
            ori_in = imgs_tot[ind][2]
            # Test against None explicitly: the truth value of a multi-element
            # NumPy array is ambiguous and `if img_bin_curved:` raises ValueError.
            if img_bin_curved is not None:
                img_bin_in = imgs_tot[ind][3]
            # Only pieces whose orientation is below 45 in absolute value are rotated.
            if abs(ori_in)<45:
                img_in_des = rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) )
                if img_bin_curved is not None:
                    img_bin_in_des = rotate_image_with_padding(img_bin_in, ori_in, border_value=(255,255,255) )
                mask_in_des = rotate_image_with_padding(mask_in, ori_in)
                mask_in_des = mask_in_des.astype('uint8')
@ -291,36 +307,52 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved):
                mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :]
                img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :]
                if img_bin_curved:
                    img_bin_in_des = img_bin_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :]
                w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) )
                if w_relative==0:
                    w_relative = img_in_des.shape[1]
                img_in_des = resize_image(img_in_des, 32, w_relative)
                if img_bin_curved:
                    img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative)
            else:
                img_in_des = np.copy(img_in)
                if img_bin_curved:
                    img_bin_in_des = np.copy(img_bin_in)
                w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) )
                if w_relative==0:
                    w_relative = img_in_des.shape[1]
                img_in_des = resize_image(img_in_des, 32, w_relative)
                if img_bin_curved:
                    img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative)
            w_tot_des+=img_in_des.shape[1]
            w_tot_des_list.append(img_in_des.shape[1])
            imgs_deskewed_list.append(img_in_des)
            if img_bin_curved:
                imgs_bin_deskewed_list.append(img_bin_in_des)
        img_final_deskewed = np.zeros((32, w_tot_des, 3))+255
        if img_bin_curved:
            img_bin_final_deskewed = np.zeros((32, w_tot_des, 3))+255
        else:
            img_bin_final_deskewed = None
        w_indexer = 0
        for ind in range(len(w_tot_des_list)):
            img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:]
            if img_bin_curved:
                img_bin_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_bin_deskewed_list[ind][:,:,:]
            w_indexer = w_indexer+w_tot_des_list[ind]
-        return img_final_deskewed
+        return img_final_deskewed, img_bin_final_deskewed
    else:
-        return img_curved
+        return img_curved, img_bin_curved
 def return_textline_contour_with_added_box_coordinate(textline_contour,  box_ind):
    textline_contour[:,0] = textline_contour[:,0] + box_ind[2]
@ -434,3 +466,8 @@ def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, pr
            ocr_textline_in_textregion.append(text_textline)
        ocr_all_textlines.append(ocr_textline_in_textregion)
    return ocr_all_textlines
 def biopython_align(str1, str2):
     """Globally align two strings and return the best alignment.

     Uses ``pairwise2.align.globalms`` with a match score of 2, a mismatch
     score of -1, and a penalty of -2 for both opening and extending a gap.

     Args:
         str1: first string.
         str2: second string.

     Returns:
         A tuple ``(seqA, seqB)`` holding the two strings of the first
         alignment returned by pairwise2. If either input is empty, the
         longer string is returned unchanged and the empty one is replaced
         by ``"-"`` repeated to the same length.
     """
     # pairwise2 returns no alignments when a sequence is empty, so indexing
     # the result would raise IndexError; build the all-gap alignment directly.
     if not str1 or not str2:
         width = max(len(str1), len(str2))
         return str1.ljust(width, "-"), str2.ljust(width, "-")
     alignments = pairwise2.align.globalms(str1, str2, 2, -1, -2, -2)
     best_alignment = alignments[0]  # first alignment in the returned list
     return best_alignment.seqA, best_alignment.seqB