mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-08-29 11:59:55 +02:00
Implement hyphenated textline merging in the OCR engine and fix a bug in curved textline OCR
This commit is contained in:
parent
c0835665a9
commit
f94fc9973b
1 changed file with 71 additions and 86 deletions
|
@ -5500,7 +5500,6 @@ class Eynollah_ocr:
|
||||||
|
|
||||||
def get_orientation_moments_of_mask(self, mask):
|
def get_orientation_moments_of_mask(self, mask):
|
||||||
mask=mask.astype('uint8')
|
mask=mask.astype('uint8')
|
||||||
print(mask.shape)
|
|
||||||
contours, _ = cv2.findContours(mask[:,:,0], cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
contours, _ = cv2.findContours(mask[:,:,0], cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
|
|
||||||
largest_contour = max(contours, key=cv2.contourArea) if contours else None
|
largest_contour = max(contours, key=cv2.contourArea) if contours else None
|
||||||
|
@ -5547,39 +5546,24 @@ class Eynollah_ocr:
|
||||||
|
|
||||||
def break_curved_line_into_small_pieces_and_then_merge(self, img_curved, mask_curved):
|
def break_curved_line_into_small_pieces_and_then_merge(self, img_curved, mask_curved):
|
||||||
peaks_4 = self.return_splitting_point_of_image(img_curved)
|
peaks_4 = self.return_splitting_point_of_image(img_curved)
|
||||||
|
if len(peaks_4)>0:
|
||||||
|
|
||||||
|
|
||||||
img_0 = img_curved[:, :peaks_4[0], :]
|
|
||||||
img_1 = img_curved[:, peaks_4[0]:peaks_4[1], :]
|
|
||||||
img_2 = img_curved[:, peaks_4[1]:peaks_4[2], :]
|
|
||||||
img_3 = img_curved[:, peaks_4[2]:peaks_4[3], :]
|
|
||||||
img_4 = img_curved[:, peaks_4[3]:, :]
|
|
||||||
|
|
||||||
|
|
||||||
mask_0 = mask_curved[:, :peaks_4[0], :]
|
|
||||||
mask_1 = mask_curved[:, peaks_4[0]:peaks_4[1], :]
|
|
||||||
mask_2 = mask_curved[:, peaks_4[1]:peaks_4[2], :]
|
|
||||||
mask_3 = mask_curved[:, peaks_4[2]:peaks_4[3], :]
|
|
||||||
mask_4 = mask_curved[:, peaks_4[3]:, :]
|
|
||||||
|
|
||||||
cv2.imwrite("split0.png", img_0)
|
|
||||||
cv2.imwrite("split1.png", img_1)
|
|
||||||
cv2.imwrite("split2.png", img_2)
|
|
||||||
cv2.imwrite("split3.png", img_3)
|
|
||||||
|
|
||||||
or_ma_0 = self.get_orientation_moments_of_mask(mask_0)
|
|
||||||
or_ma_1 = self.get_orientation_moments_of_mask(mask_1)
|
|
||||||
or_ma_2 = self.get_orientation_moments_of_mask(mask_2)
|
|
||||||
or_ma_3 = self.get_orientation_moments_of_mask(mask_3)
|
|
||||||
or_ma_4 = self.get_orientation_moments_of_mask(mask_4)
|
|
||||||
|
|
||||||
imgs_tot = []
|
imgs_tot = []
|
||||||
imgs_tot.append([img_0, mask_0, or_ma_0] )
|
|
||||||
imgs_tot.append([img_1, mask_1, or_ma_1])
|
for ind in range(len(peaks_4)+1):
|
||||||
imgs_tot.append([img_2, mask_2, or_ma_2])
|
if ind==0:
|
||||||
imgs_tot.append([img_3, mask_3, or_ma_3])
|
img = img_curved[:, :peaks_4[ind], :]
|
||||||
imgs_tot.append([img_4, mask_4, or_ma_4])
|
mask = mask_curved[:, :peaks_4[ind], :]
|
||||||
|
elif ind==len(peaks_4):
|
||||||
|
img = img_curved[:, peaks_4[ind-1]:, :]
|
||||||
|
mask = mask_curved[:, peaks_4[ind-1]:, :]
|
||||||
|
else:
|
||||||
|
img = img_curved[:, peaks_4[ind-1]:peaks_4[ind], :]
|
||||||
|
mask = mask_curved[:, peaks_4[ind-1]:peaks_4[ind], :]
|
||||||
|
|
||||||
|
or_ma = self.get_orientation_moments_of_mask(mask)
|
||||||
|
|
||||||
|
imgs_tot.append([img, mask, or_ma] )
|
||||||
|
|
||||||
|
|
||||||
w_tot_des_list = []
|
w_tot_des_list = []
|
||||||
w_tot_des = 0
|
w_tot_des = 0
|
||||||
|
@ -5622,22 +5606,9 @@ class Eynollah_ocr:
|
||||||
for ind in range(len(w_tot_des_list)):
|
for ind in range(len(w_tot_des_list)):
|
||||||
img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:]
|
img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:]
|
||||||
w_indexer = w_indexer+w_tot_des_list[ind]
|
w_indexer = w_indexer+w_tot_des_list[ind]
|
||||||
|
|
||||||
#cv2.imwrite('final.png', img_final_deskewed)
|
|
||||||
#print(or_ma_0, or_ma_1, or_ma_2, or_ma_3, or_ma_4, 'orients')
|
|
||||||
|
|
||||||
##cv2.imwrite("split4.png", img_curved[:, peaks_4[3]:peaks_4[4], :])
|
|
||||||
##cv2.imwrite("split5.png", img_curved[:, peaks_4[4]:peaks_4[5], :])
|
|
||||||
##cv2.imwrite("split6.png", img_curved[:, peaks_4[5]:peaks_4[6], :])
|
|
||||||
|
|
||||||
##cv2.imwrite("split7.png", img_curved[:, peaks_4[6]:peaks_4[7], :])
|
|
||||||
##cv2.imwrite("split8.png", img_curved[:, peaks_4[7]:peaks_4[8], :])
|
|
||||||
##cv2.imwrite("split9.png", img_curved[:, peaks_4[8]:peaks_4[9], :])
|
|
||||||
|
|
||||||
|
|
||||||
#cv2.imwrite("split4.png", img_4)
|
|
||||||
#sys.exit()
|
|
||||||
return img_final_deskewed
|
return img_final_deskewed
|
||||||
|
else:
|
||||||
|
return img_curved
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
ls_imgs = os.listdir(self.dir_in)
|
ls_imgs = os.listdir(self.dir_in)
|
||||||
|
@ -6144,7 +6115,21 @@ class Eynollah_ocr:
|
||||||
text_by_textregion = []
|
text_by_textregion = []
|
||||||
for ind in unique_cropped_lines_region_indexer:
|
for ind in unique_cropped_lines_region_indexer:
|
||||||
extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind]
|
extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind]
|
||||||
|
if len(extracted_texts_merged_un)>1:
|
||||||
|
text_by_textregion_ind = ""
|
||||||
|
next_glue = ""
|
||||||
|
for indt in range(len(extracted_texts_merged_un)):
|
||||||
|
if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-'):
|
||||||
|
text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1]
|
||||||
|
next_glue = ""
|
||||||
|
else:
|
||||||
|
text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt]
|
||||||
|
next_glue = " "
|
||||||
|
text_by_textregion.append(text_by_textregion_ind)
|
||||||
|
|
||||||
|
else:
|
||||||
text_by_textregion.append(" ".join(extracted_texts_merged_un))
|
text_by_textregion.append(" ".join(extracted_texts_merged_un))
|
||||||
|
#print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion')
|
||||||
|
|
||||||
indexer = 0
|
indexer = 0
|
||||||
indexer_textregion = 0
|
indexer_textregion = 0
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue