mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-10-06 14:39:55 +02:00
Merge text of textlines and handle hyphenated words by joining them correctly
This commit is contained in:
parent
b38331b4ab
commit
e97e3ab192
1 changed file with 23 additions and 9 deletions
|
@ -5481,16 +5481,30 @@ class Eynollah_ocr:
|
|||
image_text.save(out_image_with_text)
|
||||
|
||||
#print(len(unique_cropped_lines_region_indexer), 'unique_cropped_lines_region_indexer')
|
||||
#######text_by_textregion = []
|
||||
#######for ind in unique_cropped_lines_region_indexer:
|
||||
#######extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind]
|
||||
|
||||
#######text_by_textregion.append(" ".join(extracted_texts_merged_un))
|
||||
|
||||
text_by_textregion = []
|
||||
for ind in unique_cropped_lines_region_indexer:
|
||||
extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind]
|
||||
if len(extracted_texts_merged_un)>1:
|
||||
text_by_textregion_ind = ""
|
||||
next_glue = ""
|
||||
for indt in range(len(extracted_texts_merged_un)):
|
||||
if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-') or extracted_texts_merged_un[indt].endswith('¬'):
|
||||
text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1]
|
||||
next_glue = ""
|
||||
else:
|
||||
text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt]
|
||||
next_glue = " "
|
||||
text_by_textregion.append(text_by_textregion_ind)
|
||||
|
||||
text_by_textregion.append(" ".join(extracted_texts_merged_un))
|
||||
else:
|
||||
text_by_textregion.append(" ".join(extracted_texts_merged_un))
|
||||
|
||||
#print(len(text_by_textregion) , indexer_text_region, "text_by_textregion")
|
||||
|
||||
|
||||
#print(time.time() - t0 ,'elapsed time')
|
||||
|
||||
indexer = 0
|
||||
indexer_textregion = 0
|
||||
|
@ -5993,7 +6007,7 @@ class Eynollah_ocr:
|
|||
text_by_textregion_ind = ""
|
||||
next_glue = ""
|
||||
for indt in range(len(extracted_texts_merged_un)):
|
||||
if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-'):
|
||||
if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-') or extracted_texts_merged_un[indt].endswith('¬'):
|
||||
text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1]
|
||||
next_glue = ""
|
||||
else:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue