Merge text of textlines and handle hyphenated words by joining them correctly

This commit is contained in:
vahidrezanezhad 2025-09-19 23:23:30 +02:00 committed by kba
parent b38331b4ab
commit e97e3ab192

View file

@ -5481,16 +5481,30 @@ class Eynollah_ocr:
image_text.save(out_image_with_text)
#print(len(unique_cropped_lines_region_indexer), 'unique_cropped_lines_region_indexer')
#######text_by_textregion = []
#######for ind in unique_cropped_lines_region_indexer:
#######extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind]
#######text_by_textregion.append(" ".join(extracted_texts_merged_un))
text_by_textregion = []
for ind in unique_cropped_lines_region_indexer:
extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind]
if len(extracted_texts_merged_un)>1:
text_by_textregion_ind = ""
next_glue = ""
for indt in range(len(extracted_texts_merged_un)):
if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-') or extracted_texts_merged_un[indt].endswith('¬'):
text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1]
next_glue = ""
else:
text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt]
next_glue = " "
text_by_textregion.append(text_by_textregion_ind)
else:
text_by_textregion.append(" ".join(extracted_texts_merged_un))
#print(len(text_by_textregion) , indexer_text_region, "text_by_textregion")
#print(time.time() - t0 ,'elapsed time')
indexer = 0
indexer_textregion = 0
@ -5993,7 +6007,7 @@ class Eynollah_ocr:
text_by_textregion_ind = ""
next_glue = ""
for indt in range(len(extracted_texts_merged_un)):
if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-'):
if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-') or extracted_texts_merged_un[indt].endswith('¬'):
text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1]
next_glue = ""
else: