Implement hyphenated textline merging in the OCR engine and fix a bug in curved textline OCR

2026-02-01 07:07:09 +01:00 · 2025-05-21 14:39:31 +02:00 · 2025-05-21 14:39:31 +02:00 · f94fc9973b
commit f94fc9973b
parent c0835665a9
1 changed files with 71 additions and 86 deletions
--- a/src/eynollah/eynollah.py
+++ b/src/eynollah/eynollah.py
@ -5500,7 +5500,6 @@ class Eynollah_ocr:
    def get_orientation_moments_of_mask(self, mask):
        mask=mask.astype('uint8')
        print(mask.shape)
        contours, _ = cv2.findContours(mask[:,:,0], cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        largest_contour = max(contours, key=cv2.contourArea) if contours else None
@ -5547,39 +5546,24 @@ class Eynollah_ocr:
    def break_curved_line_into_small_pieces_and_then_merge(self, img_curved, mask_curved):
        peaks_4 = self.return_splitting_point_of_image(img_curved)
-        
+        if len(peaks_4)>0:
        img_0 = img_curved[:, :peaks_4[0], :]
        img_1 = img_curved[:, peaks_4[0]:peaks_4[1], :]
        img_2 = img_curved[:, peaks_4[1]:peaks_4[2], :]
        img_3 = img_curved[:, peaks_4[2]:peaks_4[3], :]
        img_4 = img_curved[:, peaks_4[3]:, :]
        mask_0 = mask_curved[:, :peaks_4[0], :]
        mask_1 = mask_curved[:, peaks_4[0]:peaks_4[1], :]
        mask_2 = mask_curved[:, peaks_4[1]:peaks_4[2], :]
        mask_3 = mask_curved[:, peaks_4[2]:peaks_4[3], :]
        mask_4 = mask_curved[:, peaks_4[3]:, :]
        cv2.imwrite("split0.png", img_0)
        cv2.imwrite("split1.png", img_1)
        cv2.imwrite("split2.png", img_2)
        cv2.imwrite("split3.png", img_3)
        or_ma_0 = self.get_orientation_moments_of_mask(mask_0)
        or_ma_1 = self.get_orientation_moments_of_mask(mask_1)
        or_ma_2 = self.get_orientation_moments_of_mask(mask_2)
        or_ma_3 = self.get_orientation_moments_of_mask(mask_3)
        or_ma_4 = self.get_orientation_moments_of_mask(mask_4)
            imgs_tot = []
-        imgs_tot.append([img_0, mask_0, or_ma_0] )
+            
-        imgs_tot.append([img_1, mask_1, or_ma_1])
+            for ind in range(len(peaks_4)+1):
-        imgs_tot.append([img_2, mask_2, or_ma_2])
+                if ind==0:
-        imgs_tot.append([img_3, mask_3, or_ma_3])
+                    img = img_curved[:, :peaks_4[ind], :]
-        imgs_tot.append([img_4, mask_4, or_ma_4])
+                    mask = mask_curved[:, :peaks_4[ind], :]
                elif ind==len(peaks_4):
                    img = img_curved[:, peaks_4[ind-1]:, :]
                    mask = mask_curved[:, peaks_4[ind-1]:, :]
                else:
                    img = img_curved[:, peaks_4[ind-1]:peaks_4[ind], :]
                    mask = mask_curved[:, peaks_4[ind-1]:peaks_4[ind], :]
                or_ma = self.get_orientation_moments_of_mask(mask)
                imgs_tot.append([img, mask, or_ma] )
            w_tot_des_list = []
            w_tot_des = 0
@ -5622,22 +5606,9 @@ class Eynollah_ocr:
            for ind in range(len(w_tot_des_list)):
                img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:]
                w_indexer = w_indexer+w_tot_des_list[ind]
        #cv2.imwrite('final.png', img_final_deskewed)
        #print(or_ma_0, or_ma_1, or_ma_2, or_ma_3, or_ma_4, 'orients')
        ##cv2.imwrite("split4.png", img_curved[:, peaks_4[3]:peaks_4[4], :])
        ##cv2.imwrite("split5.png", img_curved[:, peaks_4[4]:peaks_4[5], :])
        ##cv2.imwrite("split6.png", img_curved[:, peaks_4[5]:peaks_4[6], :])
        ##cv2.imwrite("split7.png", img_curved[:, peaks_4[6]:peaks_4[7], :])
        ##cv2.imwrite("split8.png", img_curved[:, peaks_4[7]:peaks_4[8], :])
        ##cv2.imwrite("split9.png", img_curved[:, peaks_4[8]:peaks_4[9], :])
        #cv2.imwrite("split4.png", img_4)
        #sys.exit()
            return img_final_deskewed
        else:
            return img_curved
    def run(self):
        ls_imgs = os.listdir(self.dir_in)
@ -6144,7 +6115,21 @@ class Eynollah_ocr:
                    text_by_textregion = []
                    for ind in unique_cropped_lines_region_indexer:
                        extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind]
                        if len(extracted_texts_merged_un)>1:
                            text_by_textregion_ind = ""
                            next_glue = ""
                            for indt in range(len(extracted_texts_merged_un)):
                                if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-'):
                                    text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1]
                                    next_glue = ""
                                else:
                                    text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt]
                                    next_glue = " "
                            text_by_textregion.append(text_by_textregion_ind)
                        else:
                            text_by_textregion.append(" ".join(extracted_texts_merged_un))
                        #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion')
                    indexer = 0
                    indexer_textregion = 0