From 5c8084a3976b9aef89518ac2029dbe1a76b634c9 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 2 May 2025 00:30:36 +0200 Subject: [PATCH 01/40] displaying detexted text on an image is provided for trocr case --- src/eynollah/eynollah.py | 55 +++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index d47016b..5793d37 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -259,7 +259,7 @@ class Eynollah: self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425" self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314" self.model_region_dir_p_ens_light_only_images_extraction = dir_models + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" - self.model_reading_order_dir = dir_models + "/model_ens_reading_order_machine_based" + self.model_reading_order_dir = dir_models + "/model_mb_ro_aug_2"#"/model_ens_reading_order_machine_based" #"/modelens_12sp_elay_0_3_4__3_6_n" #"/modelens_earlylayout_12spaltige_2_3_5_6_7_8" #"/modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18" @@ -1221,7 +1221,7 @@ class Eynollah: seg_art[seg_art>0] =1 seg_line = label_p_pred[:,:,:,3] - seg_line[seg_line>0.1] =1 + seg_line[seg_line>0.5] =1#seg_line[seg_line>0.1] =1 seg_line[seg_line<1] =0 seg[seg_art==1]=4 @@ -3329,13 +3329,13 @@ class Eynollah: img_poly[text_regions_p[:,:]==6] = 5 - #temp - sep_mask = (img_poly==5)*1 - sep_mask = sep_mask.astype('uint8') - sep_mask = cv2.erode(sep_mask, kernel=KERNEL, iterations=2) - img_poly[img_poly==5] = 0 - img_poly[sep_mask==1] = 5 - # + ###temp + ##sep_mask = (img_poly==5)*1 + ##sep_mask = sep_mask.astype('uint8') + ##sep_mask = cv2.erode(sep_mask, kernel=KERNEL, iterations=2) + ##img_poly[img_poly==5] = 0 + ##img_poly[sep_mask==1] = 5 + ### img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') if contours_only_text_parent_h: @@ -5081,6 +5081,12 @@ class Eynollah_ocr: dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') img = cv2.imread(dir_img) + + if self.draw_texts_on_image: + out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png') + image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white") + draw = ImageDraw.Draw(image_text) + total_bb_coordinates = [] ##file_name = Path(dir_xmls).stem tree1 = ET.parse(dir_xml, parser = ET.XMLParser(encoding="utf-8")) @@ -5111,6 +5117,9 @@ class Eynollah_ocr: textline_coords = np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) x,y,w,h = cv2.boundingRect(textline_coords) + if self.draw_texts_on_image: + total_bb_coordinates.append([x,y,w,h]) + h2w_ratio = h/float(w) img_poly_on_img = np.copy(img) @@ -5161,6 +5170,34 @@ class Eynollah_ocr: #print(extracted_texts_merged, len(extracted_texts_merged)) unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) + + if self.draw_texts_on_image: + + font_path = "NotoSans-Regular.ttf" # Make sure this file exists! 
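For reference: `self.fit_text_single_line`, called a few lines further down in this block, is not included in the diff. A plausible sketch of such a helper — shrink the font size until the rendered string fits the text line's box — might look like this (signature, bounds and step size are assumptions, not the repository's actual implementation):

```python
# Plausible sketch of a fit_text_single_line helper (assumed, not from this diff):
# shrink the font until the rendered text fits the target width.
from PIL import ImageFont

def fit_text_single_line(draw, text, font_path, max_width, max_height, min_size=10):
    size = max(int(max_height), min_size)
    while size > min_size:
        font = ImageFont.truetype(font_path, size)
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        if right - left <= max_width:
            return font
        size -= 2
    return ImageFont.truetype(font_path, min_size)
```

The patch then measures the chosen font with draw.textbbox and centres the string inside the text line's bounding box, as the following lines show.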
+ font = ImageFont.truetype(font_path, 40) + + for indexer_text, bb_ind in enumerate(total_bb_coordinates): + + + x_bb = bb_ind[0] + y_bb = bb_ind[1] + w_bb = bb_ind[2] + h_bb = bb_ind[3] + + font = self.fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) ) + + ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2) + + text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font) + text_width = text_bbox[2] - text_bbox[0] + text_height = text_bbox[3] - text_bbox[1] + + text_x = x_bb + (w_bb - text_width) // 2 # Center horizontally + text_y = y_bb + (h_bb - text_height) // 2 # Center vertically + + # Draw the text + draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font) + image_text.save(out_image_with_text) #print(len(unique_cropped_lines_region_indexer), 'unique_cropped_lines_region_indexer') text_by_textregion = [] From fd375e15d59e9e83dbfcc82c8e36a429883f3dad Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 2 May 2025 01:02:32 +0200 Subject: [PATCH 02/40] adding space between splitted textline predicted text in the case of trocr --- src/eynollah/eynollah.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 5793d37..d148c67 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5164,7 +5164,7 @@ class Eynollah_ocr: extracted_texts = extracted_texts + generated_text_merged - extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] #print(extracted_texts_merged, len(extracted_texts_merged)) From a4defbb04d6c2867e3f80c3cd3aecc7cef6a0464 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 2 May 2025 12:53:33 +0200 Subject: [PATCH 03/40] inference batch size for ocr is passed as an argument --- src/eynollah/cli.py | 8 +++++- src/eynollah/eynollah.py | 53 ++++++++++++++++++++++++++-------------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index c189aca..56d5d7e 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -374,6 +374,11 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ is_flag=True, help="If this parameter is set to True, the prediction will be performed using both RGB and binary images. However, this does not necessarily improve results; it may be beneficial for certain document images.", ) +@click.option( + "--batch_size", + "-bs", + help="number of inference batch size. 
Default b_s for trocr and cnn_rnn models are 2 and 8 respectively", +) @click.option( "--log_level", "-l", @@ -381,7 +386,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="Override log level globally to this", ) -def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, log_level): +def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) @@ -397,6 +402,7 @@ def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, ex do_not_mask_with_textline_contour=do_not_mask_with_textline_contour, draw_texts_on_image=draw_texts_on_image, prediction_with_both_of_rgb_and_bin=prediction_with_both_of_rgb_and_bin, + batch_size=batch_size, ) eynollah_ocr.run() diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index d148c67..62026bf 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4872,6 +4872,7 @@ class Eynollah_ocr: dir_out=None, dir_out_image_text=None, tr_ocr=False, + batch_size=None, export_textline_images_and_text=False, do_not_mask_with_textline_contour=False, draw_texts_on_image=False, @@ -4895,6 +4896,10 @@ class Eynollah_ocr: self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) self.model_ocr.to(self.device) + if not batch_size: + self.b_s = 2 + else: + self.b_s = int(batch_size) else: self.model_ocr_dir = dir_models + "/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" @@ -4903,6 +4908,10 @@ class Eynollah_ocr: self.prediction_model = tf.keras.models.Model( model_ocr.get_layer(name = "image").input, model_ocr.get_layer(name = "dense2").output) + if not batch_size: + self.b_s = 8 + else: + self.b_s = int(batch_size) with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: @@ -4918,6 +4927,7 @@ class Eynollah_ocr: self.num_to_char = StringLookup( vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True ) + def decode_batch_predictions(self, pred, max_len = 128): # input_len is the product of the batch size and the @@ -5073,10 +5083,9 @@ class Eynollah_ocr: ls_imgs = os.listdir(self.dir_in) if self.tr_ocr: - b_s = 2 + tr_ocr_input_height_and_width = 384 for ind_img in ls_imgs: - t0 = time.time() - file_name = ind_img.split('.')[0] + file_name = Path(ind_img).stem dir_img = os.path.join(self.dir_in, ind_img) dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') @@ -5131,15 +5140,15 @@ class Eynollah_ocr: img_crop[mask_poly==0] = 255 if h2w_ratio > 0.1: - cropped_lines.append(img_crop) + cropped_lines.append(resize_image(img_crop, tr_ocr_input_height_and_width, tr_ocr_input_height_and_width) ) cropped_lines_meging_indexing.append(0) else: splited_images, _ = self.return_textlines_split_if_needed(img_crop, None) #print(splited_images) if splited_images: - cropped_lines.append(splited_images[0]) + cropped_lines.append(resize_image(splited_images[0], tr_ocr_input_height_and_width, tr_ocr_input_height_and_width)) cropped_lines_meging_indexing.append(1) - 
cropped_lines.append(splited_images[1]) + cropped_lines.append(resize_image(splited_images[1], tr_ocr_input_height_and_width, tr_ocr_input_height_and_width)) cropped_lines_meging_indexing.append(-1) else: cropped_lines.append(img_crop) @@ -5148,21 +5157,24 @@ class Eynollah_ocr: extracted_texts = [] - n_iterations = math.ceil(len(cropped_lines) / b_s) + n_iterations = math.ceil(len(cropped_lines) / self.b_s) for i in range(n_iterations): if i==(n_iterations-1): - n_start = i*b_s + n_start = i*self.b_s imgs = cropped_lines[n_start:] else: - n_start = i*b_s - n_end = (i+1)*b_s + n_start = i*self.b_s + n_end = (i+1)*self.b_s imgs = cropped_lines[n_start:n_end] pixel_values_merged = self.processor(imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_ocr.generate(pixel_values_merged.to(self.device)) generated_text_merged = self.processor.batch_decode(generated_ids_merged, skip_special_tokens=True) extracted_texts = extracted_texts + generated_text_merged + + del cropped_lines + gc.collect() extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] @@ -5241,14 +5253,12 @@ class Eynollah_ocr: padding_token = 299 image_width = 512#max_len * 4 image_height = 32 - b_s = 8 img_size=(image_width, image_height) for ind_img in ls_imgs: - t0 = time.time() - file_name = ind_img.split('.')[0] + file_name = Path(ind_img).stem dir_img = os.path.join(self.dir_in, ind_img) dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') @@ -5368,11 +5378,11 @@ class Eynollah_ocr: if not self.export_textline_images_and_text: extracted_texts = [] - n_iterations = math.ceil(len(cropped_lines) / b_s) + n_iterations = math.ceil(len(cropped_lines) / self.b_s) for i in range(n_iterations): if i==(n_iterations-1): - n_start = i*b_s + n_start = i*self.b_s imgs = cropped_lines[n_start:] imgs = np.array(imgs) imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3) @@ -5381,14 +5391,14 @@ class Eynollah_ocr: imgs_bin = np.array(imgs_bin) imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3) else: - n_start = i*b_s - n_end = (i+1)*b_s + n_start = i*self.b_s + n_end = (i+1)*self.b_s imgs = cropped_lines[n_start:n_end] - imgs = np.array(imgs).reshape(b_s, image_height, image_width, 3) + imgs = np.array(imgs).reshape(self.b_s, image_height, image_width, 3) if self.prediction_with_both_of_rgb_and_bin: imgs_bin = cropped_lines_bin[n_start:n_end] - imgs_bin = np.array(imgs_bin).reshape(b_s, image_height, image_width, 3) + imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3) preds = self.prediction_model.predict(imgs, verbose=0) @@ -5402,6 +5412,11 @@ class Eynollah_ocr: pred_texts_ib = pred_texts[ib].replace("[UNK]", "") extracted_texts.append(pred_texts_ib) + del cropped_lines + if self.prediction_with_both_of_rgb_and_bin: + del cropped_lines_bin + gc.collect() + extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] From 8c8fa461bba762a07ee4a0e129c391b91be23e18 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 2 May 2025 12:57:26 +0200 
Subject: [PATCH 04/40] machine based model name changed to public one --- src/eynollah/eynollah.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 62026bf..cc1f766 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -259,7 +259,7 @@ class Eynollah: self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425" self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314" self.model_region_dir_p_ens_light_only_images_extraction = dir_models + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" - self.model_reading_order_dir = dir_models + "/model_mb_ro_aug_2"#"/model_ens_reading_order_machine_based" + self.model_reading_order_dir = dir_models + "/model_ens_reading_order_machine_based" #"/modelens_12sp_elay_0_3_4__3_6_n" #"/modelens_earlylayout_12spaltige_2_3_5_6_7_8" #"/modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18" From 5d447abcc4e24cec25e228fb93f95bdd6e549e5a Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sat, 3 May 2025 02:59:16 +0200 Subject: [PATCH 05/40] let to add dataset abbrevation to extracted textline images and text --- src/eynollah/cli.py | 17 +++++++- src/eynollah/eynollah.py | 91 ++++++++++++++++++++++++---------------- 2 files changed, 71 insertions(+), 37 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 56d5d7e..7d08ac8 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -342,7 +342,6 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ "-m", help="directory of models", type=click.Path(exists=True, file_okay=False), - required=True, ) @click.option( "--tr_ocr", @@ -379,6 +378,11 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ "-bs", help="number of inference batch size. 
Default b_s for trocr and cnn_rnn models are 2 and 8 respectively", ) +@click.option( + "--dataset_abbrevation", + "-ds_pref", + help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset", +) @click.option( "--log_level", "-l", @@ -386,10 +390,18 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="Override log level globally to this", ) -def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, log_level): +def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) + assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text -etit can not be set alongside transformer ocr -tr_ocr" + assert not export_textline_images_and_text or not model, "Exporting textline and text -etit can not be set alongside model -m" + assert not export_textline_images_and_text or not batch_size, "Exporting textline and text -etit can not be set alongside batch size -bs" + assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text -etit can not be set alongside directory of bin images -dib" + assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit" + assert not export_textline_images_and_text or not draw_texts_on_image, "Exporting textline and text -etit can not be set alongside draw text on image -dtoi" + assert not export_textline_images_and_text or not prediction_with_both_of_rgb_and_bin, "Exporting textline and text -etit can not be set alongside prediction with both rgb and bin -brb" + eynollah_ocr = Eynollah_ocr( dir_xmls=dir_xmls, dir_out_image_text=dir_out_image_text, @@ -403,6 +415,7 @@ def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, ex draw_texts_on_image=draw_texts_on_image, prediction_with_both_of_rgb_and_bin=prediction_with_both_of_rgb_and_bin, batch_size=batch_size, + pref_of_dataset=dataset_abbrevation, ) eynollah_ocr.run() diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index cc1f766..0b15573 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4877,6 +4877,7 @@ class Eynollah_ocr: do_not_mask_with_textline_contour=False, draw_texts_on_image=False, prediction_with_both_of_rgb_and_bin=False, + pref_of_dataset = None, logger=None, ): self.dir_in = dir_in @@ -4890,43 +4891,45 @@ class Eynollah_ocr: self.draw_texts_on_image = draw_texts_on_image self.dir_out_image_text = dir_out_image_text self.prediction_with_both_of_rgb_and_bin = prediction_with_both_of_rgb_and_bin - if tr_ocr: - self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" - self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) - self.model_ocr.to(self.device) - if not batch_size: - self.b_s = 2 + self.pref_of_dataset = pref_of_dataset + if not 
export_textline_images_and_text: + if tr_ocr: + self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" + self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) + self.model_ocr.to(self.device) + if not batch_size: + self.b_s = 2 + else: + self.b_s = int(batch_size) + else: - self.b_s = int(batch_size) - - else: - self.model_ocr_dir = dir_models + "/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" - model_ocr = load_model(self.model_ocr_dir , compile=False) - - self.prediction_model = tf.keras.models.Model( - model_ocr.get_layer(name = "image").input, - model_ocr.get_layer(name = "dense2").output) - if not batch_size: - self.b_s = 8 - else: - self.b_s = int(batch_size) - + self.model_ocr_dir = dir_models + "/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + model_ocr = load_model(self.model_ocr_dir , compile=False) - with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: - characters = json.load(config_file) + self.prediction_model = tf.keras.models.Model( + model_ocr.get_layer(name = "image").input, + model_ocr.get_layer(name = "dense2").output) + if not batch_size: + self.b_s = 8 + else: + self.b_s = int(batch_size) - - AUTOTUNE = tf.data.AUTOTUNE + + with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: + characters = json.load(config_file) - # Mapping characters to integers. - char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + + AUTOTUNE = tf.data.AUTOTUNE - # Mapping integers back to original characters. - self.num_to_char = StringLookup( - vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True - ) + # Mapping characters to integers. + char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + + # Mapping integers back to original characters. 
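The inverse lookup built just below is what turns CTC output ids back into characters during decoding. As a minimal sketch of that step (the patch's own `decode_batch_predictions` body is not shown in this excerpt; the helper name and cleanup here are assumptions):

```python
# Minimal sketch of greedy CTC decoding with the inverse StringLookup
# (assumed helper, not the repository's decode_batch_predictions).
import tensorflow as tf

def ctc_greedy_to_text(preds, num_to_char, max_len=128):
    # preds: (batch, time, vocab) probabilities from the recognition model
    input_len = tf.fill(tf.shape(preds)[:1], tf.shape(preds)[1])
    decoded = tf.keras.backend.ctc_decode(preds, input_length=input_len, greedy=True)[0][0]
    decoded = decoded[:, :max_len]
    texts = []
    for seq in decoded:
        seq = tf.boolean_mask(seq, seq != -1)  # drop CTC padding ids
        texts.append(tf.strings.reduce_join(num_to_char(seq)).numpy().decode("utf-8"))
    return texts
```

The repository's decoding additionally strips the "[UNK]" placeholder from the resulting strings, as seen later in the batched prediction loop.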
+ self.num_to_char = StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True + ) def decode_batch_predictions(self, pred, max_len = 128): @@ -5365,10 +5368,28 @@ class Eynollah_ocr: if cheild_text.tag.endswith("Unicode"): textline_text = cheild_text.text if textline_text: - with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file: - text_file.write(textline_text) + if self.do_not_mask_with_textline_contour: + if self.pref_of_dataset: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.txt'), 'w') as text_file: + text_file.write(textline_text) - cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop ) + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.png'), img_crop ) + else: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file: + text_file.write(textline_text) + + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop ) + else: + if self.pref_of_dataset: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.txt'), 'w') as text_file: + text_file.write(textline_text) + + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.png'), img_crop ) + else: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.txt'), 'w') as text_file: + text_file.write(textline_text) + + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.png'), img_crop ) indexer_textlines+=1 From 02a679a14500b414fd9e10357febc2e5c0bf9c21 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 12 May 2025 00:10:18 +0200 Subject: [PATCH 06/40] I have tried to address the issues #163 and #161 . The changes have also improved marginal detection and enhanced the isolation of headers. 
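The main mechanism of this patch, summarized as a standalone sketch (the function name and defaults below are illustrative, not the patch's own API): rather than keeping every pixel above the artificial-class threshold, the probability map is binarized, thinned to a one-pixel skeleton with skimage, slightly re-dilated, and only then written back into the label image, which keeps separator/header pixels from bleeding into neighbouring text.

```python
# Illustrative sketch of the skeleton-based post-processing introduced here.
import numpy as np
import cv2
from skimage.morphology import skeletonize

def thin_artificial_class(prob_map, labels, class_id=2, threshold=0.1):
    # prob_map: HxW probabilities of the "artificial" class; labels: HxW argmax map
    seg_art = (prob_map > threshold).astype(np.uint8)
    skeleton = skeletonize(seg_art.astype(bool)).astype(np.uint8)
    skeleton = cv2.dilate(skeleton, np.ones((3, 3), np.uint8), iterations=1)
    labels[labels == class_id] = 0     # drop the thick prediction of that class
    labels[skeleton == 1] = class_id   # keep only the thinned, dilated skeleton
    return labels
```

The new --threshold_art_class_layout and --threshold_art_class_textline options feed the threshold used in this step, falling back to 0.1 when not given.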
--- requirements.txt | 1 + src/eynollah/cli.py | 14 +- src/eynollah/eynollah.py | 294 ++++++++++++++++++++++++++++++++++----- 3 files changed, 275 insertions(+), 34 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9ed0584..aeffd47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ numpy <1.24.0 scikit-learn >= 0.23.2 tensorflow < 2.13 numba <= 0.58.1 +scikit-image loky diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 7d08ac8..99961c9 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -235,6 +235,16 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) "-ncl", help="upper limit of columns in document image", ) +@click.option( + "--threshold_art_class_layout", + "-tharl", + help="threshold of artifical class in the case of layout detection", +) +@click.option( + "--threshold_art_class_textline", + "-thart", + help="threshold of artifical class in the case of textline detection", +) @click.option( "--skip_layout_and_reading_order", "-slro/-noslro", @@ -248,7 +258,7 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) help="Override log level globally to this", ) -def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, num_col_upper, num_col_lower, skip_layout_and_reading_order, ignore_page_extraction, log_level): +def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) @@ -298,6 +308,8 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ num_col_upper=num_col_upper, num_col_lower=num_col_lower, skip_layout_and_reading_order=skip_layout_and_reading_order, + threshold_art_class_textline=threshold_art_class_textline, + threshold_art_class_layout=threshold_art_class_layout, ) if dir_in: eynollah.run(dir_in=dir_in, overwrite=overwrite) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 0b15573..0c7c5d2 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -30,7 +30,7 @@ import numpy as np from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d from numba import cuda - +from skimage.morphology import skeletonize from ocrd import OcrdPage from ocrd_utils import getLogger, tf_disable_interactive_logs @@ -200,6 +200,8 @@ class Eynollah: do_ocr : bool = False, num_col_upper : Optional[int] = None, num_col_lower : Optional[int] = None, + threshold_art_class_layout: Optional[float] = None, + threshold_art_class_textline: Optional[float] = None, skip_layout_and_reading_order : bool = False, logger : Optional[Logger] = None, ): @@ -237,6 +239,17 @@ class Eynollah: self.num_col_lower = int(num_col_lower) else: self.num_col_lower = num_col_lower + + if threshold_art_class_layout: + self.threshold_art_class_layout = 
float(threshold_art_class_layout) + else: + self.threshold_art_class_layout = 0.1 + + if threshold_art_class_textline: + self.threshold_art_class_textline = float(threshold_art_class_textline) + else: + self.threshold_art_class_textline = 0.1 + self.logger = logger if logger else getLogger('eynollah') # for parallelization of CPU-intensive tasks: self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) @@ -784,7 +797,7 @@ class Eynollah: self, patches, img, model, n_batch_inference=1, marginal_of_patch_percent=0.1, thresholding_for_some_classes_in_light_version=False, - thresholding_for_artificial_class_in_light_version=False): + thresholding_for_artificial_class_in_light_version=False, threshold_art_class_textline=0.1): self.logger.debug("enter do_prediction") img_height_model = model.layers[-1].output_shape[1] @@ -802,10 +815,13 @@ class Eynollah: if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[0,:,:,2] - seg_art[seg_art<0.2] = 0 + seg_art[seg_art0] =1 + + skeleton_art = skeletonize(seg_art) + skeleton_art = skeleton_art*1 - seg[seg_art==1]=2 + seg[skeleton_art==1]=2 seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) prediction_true = resize_image(seg_color, img_h_page, img_w_page).astype(np.uint8) return prediction_true @@ -896,14 +912,17 @@ class Eynollah: if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[:,:,:,2] - seg_art[seg_art<0.2] = 0 + seg_art[seg_art0] =1 - seg[seg_art==1]=2 + ##seg[seg_art==1]=2 indexer_inside_batch = 0 for i_batch, j_batch in zip(list_i_s, list_j_s): seg_in = seg[indexer_inside_batch] + + if thresholding_for_artificial_class_in_light_version: + seg_in_art = seg_art[indexer_inside_batch] index_y_u_in = list_y_u[indexer_inside_batch] index_y_d_in = list_y_d[indexer_inside_batch] @@ -917,54 +936,107 @@ class Eynollah: seg_in[0:-margin or None, 0:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - 0] = \ seg_in[margin:, margin:, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:, + margin:] + elif i_batch == 0 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + 0:index_x_u_in - margin] = \ seg_in[margin:, 0:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ seg_in[0:-margin or None, margin:, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[0:-margin or None, + margin:] + elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + 0:index_x_u_in - margin] = \ seg_in[margin:-margin or None, 0:-margin or None, np.newaxis] + if 
thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ seg_in[margin:-margin or None, margin:, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:-margin or None, + margin:] + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ seg_in[0:-margin or None, margin:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + margin:-margin or None] + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - margin] = \ seg_in[margin:, margin:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + margin:-margin or None] + else: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ seg_in[margin:-margin or None, margin:-margin or None, np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + margin:-margin or None] indexer_inside_batch += 1 @@ -979,6 +1051,19 @@ class Eynollah: img_patch[:] = 0 prediction_true = prediction_true.astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + prediction_true[:,:,0][prediction_true[:,:,0]==2] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=2 #del model gc.collect() return prediction_true @@ -1117,7 +1202,7 @@ class Eynollah: self, patches, img, model, n_batch_inference=1, marginal_of_patch_percent=0.1, thresholding_for_some_classes_in_light_version=False, - thresholding_for_artificial_class_in_light_version=False): + thresholding_for_artificial_class_in_light_version=False, threshold_art_class_textline=0.1, threshold_art_class_layout=0.1): self.logger.debug("enter do_prediction_new_concept") img_height_model = model.layers[-1].output_shape[1] @@ -1132,19 +1217,28 @@ class Eynollah: label_p_pred = model.predict(img[np.newaxis], verbose=0) seg = np.argmax(label_p_pred, axis=3)[0] - if thresholding_for_artificial_class_in_light_version: - #seg_text = label_p_pred[0,:,:,1] - #seg_text[seg_text<0.2] =0 - #seg_text[seg_text>0] =1 - #seg[seg_text==1]=1 - - seg_art = label_p_pred[0,:,:,4] - seg_art[seg_art<0.2] =0 - seg_art[seg_art>0] =1 - seg[seg_art==1]=4 - seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) prediction_true = resize_image(seg_color, 
img_h_page, img_w_page).astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + seg_art = label_p_pred[0,:,:,4] + seg_art[seg_art0] =1 + #seg[seg_art==1]=4 + seg_art = resize_image(seg_art, img_h_page, img_w_page).astype(np.uint8) + + prediction_true[:,:,0][prediction_true[:,:,0]==4] = 0 + + skeleton_art = skeletonize(seg_art) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1] = 4 + return prediction_true , resize_image(label_p_pred[0, :, :, 1] , img_h_page, img_w_page) if img.shape[0] < img_height_model: @@ -1217,26 +1311,29 @@ class Eynollah: if thresholding_for_some_classes_in_light_version: seg_art = label_p_pred[:,:,:,4] - seg_art[seg_art<0.2] =0 + seg_art[seg_art0] =1 seg_line = label_p_pred[:,:,:,3] seg_line[seg_line>0.5] =1#seg_line[seg_line>0.1] =1 seg_line[seg_line<1] =0 - seg[seg_art==1]=4 + ##seg[seg_art==1]=4 seg[(seg_line==1) & (seg==0)]=3 if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[:,:,:,2] - seg_art[seg_art<0.2] = 0 + seg_art[seg_art0] =1 - seg[seg_art==1]=2 + ##seg[seg_art==1]=2 indexer_inside_batch = 0 for i_batch, j_batch in zip(list_i_s, list_j_s): seg_in = seg[indexer_inside_batch] + + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + seg_in_art = seg_art[indexer_inside_batch] index_y_u_in = list_y_u[indexer_inside_batch] index_y_d_in = list_y_d[indexer_inside_batch] @@ -1255,6 +1352,12 @@ class Eynollah: label_p_pred[0, 0:-margin or None, 0:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - 0] = \ @@ -1266,6 +1369,12 @@ class Eynollah: label_p_pred[0, margin:, margin:, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:, + margin:] + elif i_batch == 0 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + 0:index_x_u_in - margin] = \ @@ -1277,6 +1386,13 @@ class Eynollah: label_p_pred[0, margin:, 0:-margin or None, 1] + + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + 0:-margin or None] + elif i_batch == nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ @@ -1288,6 +1404,12 @@ class Eynollah: label_p_pred[0, 0:-margin or None, margin:, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[0:-margin or None, + margin:] + elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + 
0:index_x_u_in - margin] = \ @@ -1299,6 +1421,11 @@ class Eynollah: label_p_pred[0, margin:-margin or None, 0:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + 0:-margin or None] elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - 0] = \ @@ -1310,6 +1437,11 @@ class Eynollah: label_p_pred[0, margin:-margin or None, margin:, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:-margin or None, + margin:] elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: prediction_true[index_y_d_in + 0:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ @@ -1321,6 +1453,11 @@ class Eynollah: label_p_pred[0, 0:-margin or None, margin:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + margin:-margin or None] elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: prediction_true[index_y_d_in + margin:index_y_u_in - 0, index_x_d_in + margin:index_x_u_in - margin] = \ @@ -1332,6 +1469,11 @@ class Eynollah: label_p_pred[0, margin:, margin:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + margin:-margin or None] else: prediction_true[index_y_d_in + margin:index_y_u_in - margin, index_x_d_in + margin:index_x_u_in - margin] = \ @@ -1343,6 +1485,11 @@ class Eynollah: label_p_pred[0, margin:-margin or None, margin:-margin or None, 1] + if thresholding_for_artificial_class_in_light_version or thresholding_for_some_classes_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + margin:-margin or None] indexer_inside_batch += 1 list_i_s = [] @@ -1356,6 +1503,32 @@ class Eynollah: img_patch[:] = 0 prediction_true = prediction_true.astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + prediction_true[:,:,0][prediction_true[:,:,0]==2] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=2 + + if thresholding_for_some_classes_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + prediction_true[:,:,0][prediction_true[:,:,0]==4] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=4 gc.collect() return prediction_true, confidence_matrix @@ -1608,7 +1781,7 @@ class 
Eynollah: prediction_textline = self.do_prediction( use_patches, img, self.model_textline, marginal_of_patch_percent=0.15, n_batch_inference=3, - thresholding_for_artificial_class_in_light_version=self.textline_light) + thresholding_for_artificial_class_in_light_version=self.textline_light, threshold_art_class_textline=self.threshold_art_class_textline) #if not self.textline_light: #if num_col_classifier==1: #prediction_textline_nopatch = self.do_prediction(False, img, self.model_textline) @@ -1622,7 +1795,55 @@ class Eynollah: textline_mask_tot_ea_art = textline_mask_tot_ea_art.astype('uint8') #textline_mask_tot_ea_art = cv2.dilate(textline_mask_tot_ea_art, KERNEL, iterations=1) prediction_textline[:,:][textline_mask_tot_ea_art[:,:]==1]=2 + """ + else: + textline_mask_tot_ea_art = textline_mask_tot_ea_art.astype('uint8') + hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (8, 1)) + + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) + ##cv2.imwrite('textline_mask_tot_ea_art.png', textline_mask_tot_ea_art) + textline_mask_tot_ea_art = cv2.dilate(textline_mask_tot_ea_art, hor_kernel, iterations=1) + + ###cv2.imwrite('dil_textline_mask_tot_ea_art.png', dil_textline_mask_tot_ea_art) + + textline_mask_tot_ea_art = textline_mask_tot_ea_art.astype('uint8') + + #print(np.shape(dil_textline_mask_tot_ea_art), np.unique(dil_textline_mask_tot_ea_art), 'dil_textline_mask_tot_ea_art') + tsk = time.time() + skeleton_art_textline = skeletonize(textline_mask_tot_ea_art[:,:,0]) + + skeleton_art_textline = skeleton_art_textline*1 + + skeleton_art_textline = skeleton_art_textline.astype('uint8') + + skeleton_art_textline = cv2.dilate(skeleton_art_textline, kernel, iterations=1) + + #print(np.unique(skeleton_art_textline), np.shape(skeleton_art_textline)) + + #print(skeleton_art_textline, np.unique(skeleton_art_textline)) + + #cv2.imwrite('skeleton_art_textline.png', skeleton_art_textline) + + prediction_textline[:,:,0][skeleton_art_textline[:,:]==1]=2 + + #cv2.imwrite('prediction_textline1.png', prediction_textline[:,:,0]) + + ##hor_kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (4, 1)) + ##ver_kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 3)) + ##textline_mask_tot_ea_main = (prediction_textline[:,:]==1)*1 + ##textline_mask_tot_ea_main = textline_mask_tot_ea_main.astype('uint8') + + ##dil_textline_mask_tot_ea_main = cv2.erode(textline_mask_tot_ea_main, ver_kernel2, iterations=1) + + ##dil_textline_mask_tot_ea_main = cv2.dilate(textline_mask_tot_ea_main, hor_kernel2, iterations=1) + + ##dil_textline_mask_tot_ea_main = cv2.dilate(textline_mask_tot_ea_main, ver_kernel2, iterations=1) + + ##prediction_textline[:,:][dil_textline_mask_tot_ea_main[:,:]==1]=1 + + """ + textline_mask_tot_ea_lines = (prediction_textline[:,:]==1)*1 textline_mask_tot_ea_lines = textline_mask_tot_ea_lines.astype('uint8') if not self.textline_light: @@ -1631,10 +1852,15 @@ class Eynollah: prediction_textline[:,:][textline_mask_tot_ea_lines[:,:]==1]=1 if not self.textline_light: prediction_textline[:,:][old_art[:,:]==1]=2 + + #cv2.imwrite('prediction_textline2.png', prediction_textline[:,:,0]) prediction_textline_longshot = self.do_prediction(False, img, self.model_textline) prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w) - + + + #cv2.imwrite('prediction_textline.png', prediction_textline[:,:,0]) + #sys.exit() self.logger.debug('exit textline_contours') return ((prediction_textline[:, :, 0]==1).astype(np.uint8), (prediction_textline_longshot_true_size[:, :, 
0]==1).astype(np.uint8)) @@ -1840,7 +2066,7 @@ class Eynollah: textline_mask_tot_ea = resize_image(textline_mask_tot_ea,img_height_h, img_width_h ) #print(self.image_org.shape) - #cv2.imwrite('out_13.png', self.image_page_org_size) + #cv2.imwrite('textline.png', textline_mask_tot_ea) #plt.imshwo(self.image_page_org_size) #plt.show() @@ -1852,13 +2078,13 @@ class Eynollah: img_resized.shape[1], img_resized.shape[0], num_col_classifier) prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( True, img_resized, self.model_region_1_2, n_batch_inference=1, - thresholding_for_some_classes_in_light_version=True) + thresholding_for_some_classes_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) else: prediction_regions_org = np.zeros((self.image_org.shape[0], self.image_org.shape[1], 3)) confidence_matrix = np.zeros((self.image_org.shape[0], self.image_org.shape[1])) prediction_regions_page, confidence_matrix_page = self.do_prediction_new_concept( False, self.image_page_org_size, self.model_region_1_2, n_batch_inference=1, - thresholding_for_artificial_class_in_light_version=True) + thresholding_for_artificial_class_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) ys = slice(*self.page_coord[0:2]) xs = slice(*self.page_coord[2:4]) prediction_regions_org[ys, xs] = prediction_regions_page @@ -1871,7 +2097,7 @@ class Eynollah: img_resized.shape[1], img_resized.shape[0], new_h, num_col_classifier) prediction_regions_org, confidence_matrix = self.do_prediction_new_concept( True, img_resized, self.model_region_1_2, n_batch_inference=2, - thresholding_for_some_classes_in_light_version=True) + thresholding_for_some_classes_in_light_version=True, threshold_art_class_layout=self.threshold_art_class_layout) ###prediction_regions_org = self.do_prediction(True, img_bin, self.model_region, n_batch_inference=3, thresholding_for_some_classes_in_light_version=True) #print("inside 3 ", time.time()-t_in) #plt.imshow(prediction_regions_org[:,:,0]) @@ -3811,7 +4037,7 @@ class Eynollah: if dilation_m1<6: dilation_m1 = 6 #print(dilation_m1, 'dilation_m1') - dilation_m1 = 6 + dilation_m1 = 4#6 dilation_m2 = int(dilation_m1/2.) 
+1 for i in range(len(x_differential)): @@ -4322,6 +4548,8 @@ class Eynollah: cnt_clean_rot_raw, hir_on_cnt_clean_rot = return_contours_of_image(textline_mask_tot_ea) all_found_textline_polygons = filter_contours_area_of_image( textline_mask_tot_ea, cnt_clean_rot_raw, hir_on_cnt_clean_rot, max_area=1, min_area=0.00001) + + all_found_textline_polygons = all_found_textline_polygons[::-1] all_found_textline_polygons=[ all_found_textline_polygons ] @@ -4329,8 +4557,8 @@ class Eynollah: all_found_textline_polygons) all_found_textline_polygons = self.filter_contours_inside_a_bigger_one( all_found_textline_polygons, None, textline_mask_tot_ea, type_contour="textline") - - + + order_text_new = [0] slopes =[0] id_of_texts_tot =['region_0001'] @@ -4343,7 +4571,7 @@ class Eynollah: polygons_lines_xml = [] contours_tables = [] ocr_all_textlines = None - conf_contours_textregions =None + conf_contours_textregions =[0] pcgts = self.writer.build_pagexml_no_full_layout( cont_page, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, @@ -4905,7 +5133,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + self.model_ocr_dir = dir_models + "/model_ens_ocrcnn_125_225"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( From 07f5b52fa704f0d74c9ce8a14234499b958a6849 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 13 May 2025 14:40:57 +0200 Subject: [PATCH 07/40] The initial attempt at reading heavily deskewed or vertically aligned lines. --- src/eynollah/eynollah.py | 91 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 0c7c5d2..9f2ca50 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -22,7 +22,6 @@ from multiprocessing import cpu_count import gc import copy import json - from loky import ProcessPoolExecutor import xml.etree.ElementTree as ET import cv2 @@ -77,7 +76,8 @@ from .utils.contour import ( from .utils.rotate import ( rotate_image, rotation_not_90_func, - rotation_not_90_func_full_layout + rotation_not_90_func_full_layout, + rotation_image_new ) from .utils.separate_lines import ( textline_contours_postprocessing, @@ -5310,6 +5310,75 @@ class Eynollah_ocr: img_fin = img_fin / 255. 
return img_fin + def get_deskewed_contour_and_bb_and_image(self, contour, image, deskew_angle): + (h_in, w_in) = image.shape[:2] + center = (w_in // 2, h_in // 2) + + rotation_matrix = cv2.getRotationMatrix2D(center, deskew_angle, 1.0) + + cos_angle = abs(rotation_matrix[0, 0]) + sin_angle = abs(rotation_matrix[0, 1]) + new_w = int((h_in * sin_angle) + (w_in * cos_angle)) + new_h = int((h_in * cos_angle) + (w_in * sin_angle)) + + rotation_matrix[0, 2] += (new_w / 2) - center[0] + rotation_matrix[1, 2] += (new_h / 2) - center[1] + + deskewed_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h)) + + contour_points = np.array(contour, dtype=np.float32) + transformed_points = cv2.transform(np.array([contour_points]), rotation_matrix)[0] + + x, y, w, h = cv2.boundingRect(np.array(transformed_points, dtype=np.int32)) + cropped_textline = deskewed_image[y:y+h, x:x+w] + + return cropped_textline + + def rotate_image_with_padding(self, image, angle): + # Get image dimensions + (h, w) = image.shape[:2] + + # Calculate the center of the image + center = (w // 2, h // 2) + + # Get the rotation matrix + rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0) + + # Compute the new bounding dimensions + cos = abs(rotation_matrix[0, 0]) + sin = abs(rotation_matrix[0, 1]) + new_w = int((h * sin) + (w * cos)) + new_h = int((h * cos) + (w * sin)) + + # Adjust the rotation matrix to account for translation + rotation_matrix[0, 2] += (new_w / 2) - center[0] + rotation_matrix[1, 2] += (new_h / 2) - center[1] + + # Perform the rotation + rotated_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h), borderValue=(0, 0, 0)) + + return rotated_image + + def get_orientation_moments(self, contour): + moments = cv2.moments(contour) + if moments["mu20"] - moments["mu02"] == 0: # Avoid division by zero + return 90 if moments["mu11"] > 0 else -90 + else: + angle = 0.5 * np.arctan2(2 * moments["mu11"], moments["mu20"] - moments["mu02"]) + return np.degrees(angle) # Convert radians to degrees + + def get_contours_and_bounding_boxes(self, mask): + # Find contours in the binary mask + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + largest_contour = max(contours, key=cv2.contourArea) if contours else None + + # Get the bounding rectangle for the contour + x, y, w, h = cv2.boundingRect(largest_contour) + #bounding_boxes.append((x, y, w, h)) + + return x, y, w, h + def run(self): ls_imgs = os.listdir(self.dir_in) @@ -5533,6 +5602,10 @@ class Eynollah_ocr: x,y,w,h = cv2.boundingRect(textline_coords) + angle_radians = math.atan2(h, w) + # Convert to degrees + angle_degrees = math.degrees(angle_radians) + if self.draw_texts_on_image: total_bb_coordinates.append([x,y,w,h]) @@ -5549,7 +5622,21 @@ class Eynollah_ocr: mask_poly = mask_poly[y:y+h, x:x+w, :] img_crop = img_poly_on_img[y:y+h, x:x+w, :] if not self.do_not_mask_with_textline_contour: + if angle_degrees > 15: + better_des_slope = self.get_orientation_moments(textline_coords) + + img_crop = self.rotate_image_with_padding(img_crop, -abs(better_des_slope) ) + mask_poly = self.rotate_image_with_padding(mask_poly, -abs(better_des_slope) ) + mask_poly = mask_poly.astype('uint8') + + #new bounding box + x_n, y_n, w_n, h_n = self.get_contours_and_bounding_boxes(mask_poly[:,:,0]) + + mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_crop[mask_poly==0] = 255 + if self.prediction_with_both_of_rgb_and_bin: img_crop_bin[mask_poly==0] = 255 From 
1ccd3fb7cf54d16cfa5969434aa33d059f252797 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 13 May 2025 15:53:05 +0200 Subject: [PATCH 08/40] Accurately writing text line contours into xml file when the deskewing exceeds 45 degrees and the text line is in light mode --- src/eynollah/writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 92e353f..8cd1c8e 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -119,7 +119,7 @@ class EynollahXmlWriter(): points_co += ',' points_co += str(textline_y_coord) - if (self.curved_line or self.textline_light) and np.abs(slopes[region_idx]) <= 45: + if self.textline_light or (self.curved_line and np.abs(slopes[region_idx]) <= 45): if len(contour_textline) == 2: points_co += str(int((contour_textline[0] + page_coord[2]) / self.scale_x)) points_co += ',' @@ -128,7 +128,7 @@ class EynollahXmlWriter(): points_co += str(int((contour_textline[0][0] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((contour_textline[0][1] + page_coord[0])/self.scale_y)) - elif (self.curved_line or self.textline_light) and np.abs(slopes[region_idx]) > 45: + elif self.curved_line and np.abs(slopes[region_idx]) > 45: if len(contour_textline)==2: points_co += str(int((contour_textline[0] + region_bboxes[2] + page_coord[2])/self.scale_x)) points_co += ',' From a9cdd56e9a2a30f89020487fe2567df9d5426fa0 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 14 May 2025 18:34:58 +0200 Subject: [PATCH 09/40] enhance ocr for vertical textlines --- src/eynollah/eynollah.py | 79 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 9f2ca50..5a73ef3 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5133,7 +5133,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_ens_ocrcnn_125_225"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + self.model_ocr_dir = dir_models + "/model_step_425000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5585,6 +5585,7 @@ class Eynollah_ocr: region_tags=np.unique([x for x in alltags if x.endswith('TextRegion')]) cropped_lines = [] + cropped_lines_ver_index = [] cropped_lines_region_indexer = [] cropped_lines_meging_indexing = [] @@ -5644,6 +5645,11 @@ class Eynollah_ocr: if w_scaled < 1.5*image_width: img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) + if angle_degrees > 15: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + cropped_lines_meging_indexing.append(0) if self.prediction_with_both_of_rgb_and_bin: img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) @@ -5657,11 +5663,22 @@ class Eynollah_ocr: img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(1) + + if angle_degrees > 15: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + img_fin = 
self.preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(-1) + if angle_degrees > 15: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + if self.prediction_with_both_of_rgb_and_bin: img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[0], image_height, image_width) cropped_lines_bin.append(img_fin) @@ -5673,6 +5690,11 @@ class Eynollah_ocr: cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(0) + if angle_degrees > 15: + cropped_lines_ver_index.append(1) + else: + cropped_lines_ver_index.append(0) + if self.prediction_with_both_of_rgb_and_bin: img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) cropped_lines_bin.append(img_fin) @@ -5722,6 +5744,19 @@ class Eynollah_ocr: imgs = cropped_lines[n_start:] imgs = np.array(imgs) imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3) + + ver_imgs = np.array( cropped_lines_ver_index[n_start:] ) + indices_ver = np.where(ver_imgs == 1)[0] + + #print(indices_ver, 'indices_ver') + if len(indices_ver)>0: + imgs_ver_flipped = imgs[indices_ver, : ,: ,:] + imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + + else: + imgs_ver_flipped = None + if self.prediction_with_both_of_rgb_and_bin: imgs_bin = cropped_lines_bin[n_start:] imgs_bin = np.array(imgs_bin) @@ -5732,12 +5767,54 @@ class Eynollah_ocr: imgs = cropped_lines[n_start:n_end] imgs = np.array(imgs).reshape(self.b_s, image_height, image_width, 3) + ver_imgs = np.array( cropped_lines_ver_index[n_start:n_end] ) + indices_ver = np.where(ver_imgs == 1)[0] + #print(indices_ver, 'indices_ver') + + if len(indices_ver)>0: + imgs_ver_flipped = imgs[indices_ver, : ,: ,:] + imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + else: + imgs_ver_flipped = None + + if self.prediction_with_both_of_rgb_and_bin: imgs_bin = cropped_lines_bin[n_start:n_end] imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3) preds = self.prediction_model.predict(imgs, verbose=0) + + if len(indices_ver)>0: + #cv2.imwrite('flipped.png', (imgs_ver_flipped[0, :,:,:]*255).astype('uint8')) + #cv2.imwrite('original.png', (imgs[0, :,:,:]*255).astype('uint8')) + #sys.exit() + #print(imgs_ver_flipped.shape, 'imgs_ver_flipped.shape') + preds_flipped = self.prediction_model.predict(imgs_ver_flipped, verbose=0) + preds_max_fliped = np.max(preds_flipped, axis=2 ) + preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) + pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=256 + masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) + masked_means_flipped[np.isnan(masked_means_flipped)] = 0 + #print(masked_means_flipped, 'masked_means_flipped') + + preds_max = np.max(preds, axis=2 ) + preds_max_args = np.argmax(preds, axis=2 ) + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=256 + + masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) + masked_means[np.isnan(masked_means)] = 0 + + masked_means_ver = masked_means[indices_ver] + #print(masked_means_ver, 'pred_max_not_unk') + + indices_where_flipped_conf_value_is_higher = np.where(masked_means_flipped > masked_means_ver)[0] + + #print(indices_where_flipped_conf_value_is_higher, 
'indices_where_flipped_conf_value_is_higher') + if len(indices_where_flipped_conf_value_is_higher)>0: + indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher] + preds[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] if self.prediction_with_both_of_rgb_and_bin: preds_bin = self.prediction_model.predict(imgs_bin, verbose=0) preds = (preds + preds_bin) / 2. From adee1dc55cb67ad20fa0d6eb4a8ebc9edfa6d64a Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 15 May 2025 00:45:22 +0200 Subject: [PATCH 10/40] enhancement for vertical textlines --- src/eynollah/eynollah.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 5a73ef3..2e54687 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5626,8 +5626,8 @@ class Eynollah_ocr: if angle_degrees > 15: better_des_slope = self.get_orientation_moments(textline_coords) - img_crop = self.rotate_image_with_padding(img_crop, -abs(better_des_slope) ) - mask_poly = self.rotate_image_with_padding(mask_poly, -abs(better_des_slope) ) + img_crop = self.rotate_image_with_padding(img_crop, better_des_slope ) + mask_poly = self.rotate_image_with_padding(mask_poly, better_des_slope ) mask_poly = mask_poly.astype('uint8') #new bounding box From 0819730355eba1b5b8e566048809b7c92610ff4d Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 15 May 2025 15:33:50 +0200 Subject: [PATCH 11/40] marginals detection enhanced for light version --- src/eynollah/utils/marginals.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/eynollah/utils/marginals.py b/src/eynollah/utils/marginals.py index a29e50d..c0c4892 100644 --- a/src/eynollah/utils/marginals.py +++ b/src/eynollah/utils/marginals.py @@ -26,8 +26,10 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve text_with_lines=resize_image(text_with_lines,int(text_with_lines.shape[0]*1.8),text_with_lines.shape[1]) text_with_lines=cv2.erode(text_with_lines,kernel,iterations=7) text_with_lines=resize_image(text_with_lines,text_with_lines_eroded.shape[0],text_with_lines_eroded.shape[1]) - - + + if light_version: + text_with_lines=rotate_image(text_with_lines,-slope_deskew) + text_with_lines_y=text_with_lines.sum(axis=0) text_with_lines_y_eroded=text_with_lines_eroded.sum(axis=0) From 7a34bbb49333e78808d5eb0a2eaca406a35fa948 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sun, 18 May 2025 02:48:05 +0200 Subject: [PATCH 12/40] enhancing marginal detection for light version --- src/eynollah/eynollah.py | 7 +++---- src/eynollah/utils/marginals.py | 13 ++++++++----- src/eynollah/utils/separate_lines.py | 4 ++-- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 2e54687..08a781c 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -272,7 +272,7 @@ class Eynollah: self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425" self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314" self.model_region_dir_p_ens_light_only_images_extraction = dir_models + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" - self.model_reading_order_dir = dir_models + "/model_ens_reading_order_machine_based" + self.model_reading_order_dir = dir_models + "/model_step_2500000_mb_ro"#"/model_ens_reading_order_machine_based" #"/modelens_12sp_elay_0_3_4__3_6_n" 
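# A minimal illustration of the masking pattern that the hunks further down retune
# (for instance the seg_line cutoff changes from 0.5 to 0.3 below). The names
# channel_to_mask and pred are illustrative only; pred stands for one softmax channel
# of the segmentation output, with values in [0, 1].
import numpy as np

def channel_to_mask(pred, threshold):
    # everything above the cutoff becomes foreground (1), the rest background (0)
    mask = np.zeros_like(pred, dtype=np.uint8)
    mask[pred > threshold] = 1
    return mask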
#"/modelens_earlylayout_12spaltige_2_3_5_6_7_8" #"/modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18" @@ -1315,7 +1315,7 @@ class Eynollah: seg_art[seg_art>0] =1 seg_line = label_p_pred[:,:,:,3] - seg_line[seg_line>0.5] =1#seg_line[seg_line>0.1] =1 + seg_line[seg_line>0.3] =1#seg_line[seg_line>0.5] =1#seg_line[seg_line>0.1] =1 seg_line[seg_line<1] =0 ##seg[seg_art==1]=4 @@ -3667,7 +3667,6 @@ class Eynollah: peaks_real, _ = find_peaks(sum_smoothed, height=0) if len(peaks_real)>70: - print(len(peaks_real), 'len(peaks_real)') peaks_real = peaks_real[(peaks_realwidth1)] @@ -5133,7 +5132,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_step_425000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + self.model_ocr_dir = dir_models + "/model_step_600000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( diff --git a/src/eynollah/utils/marginals.py b/src/eynollah/utils/marginals.py index c0c4892..ac8dc1d 100644 --- a/src/eynollah/utils/marginals.py +++ b/src/eynollah/utils/marginals.py @@ -10,7 +10,6 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve mask_marginals=np.zeros((text_with_lines.shape[0],text_with_lines.shape[1])) mask_marginals=mask_marginals.astype(np.uint8) - text_with_lines=text_with_lines.astype(np.uint8) ##text_with_lines=cv2.erode(text_with_lines,self.kernel,iterations=3) @@ -26,9 +25,11 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve text_with_lines=resize_image(text_with_lines,int(text_with_lines.shape[0]*1.8),text_with_lines.shape[1]) text_with_lines=cv2.erode(text_with_lines,kernel,iterations=7) text_with_lines=resize_image(text_with_lines,text_with_lines_eroded.shape[0],text_with_lines_eroded.shape[1]) - + + if light_version: - text_with_lines=rotate_image(text_with_lines,-slope_deskew) + kernel_hor = np.ones((1, 5), dtype=np.uint8) + text_with_lines = cv2.erode(text_with_lines,kernel_hor,iterations=6) text_with_lines_y=text_with_lines.sum(axis=0) text_with_lines_y_eroded=text_with_lines_eroded.sum(axis=0) @@ -42,8 +43,10 @@ def get_marginals(text_with_lines, text_regions, num_col, slope_deskew, light_ve elif thickness_along_y_percent>=30 and thickness_along_y_percent<50: min_textline_thickness=20 else: - min_textline_thickness=40 - + if light_version: + min_textline_thickness=45 + else: + min_textline_thickness=40 if thickness_along_y_percent>=14: diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 0322579..6289d4d 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1466,7 +1466,7 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, main_page=False, logger=None, plotter=None, map=map): if main_page and plotter: plotter.save_plot_of_textline_density(img_patch_org) - + img_int=np.zeros((img_patch_org.shape[0],img_patch_org.shape[1])) img_int[:,:]=img_patch_org[:,:]#img_patch_org[:,:,0] @@ -1487,7 +1487,7 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, angles = np.linspace(angle - 22.5, angle + 22.5, n_tot_angles) angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) elif main_page: - angles = np.linspace(-12, 12, 
n_tot_angles)#np.array([0 , 45 , 90 , -45]) + angles = np.array (list(np.linspace(-12, -7, int(n_tot_angles/4))) + list(np.linspace(-6, 6, n_tot_angles- 2* int(n_tot_angles/4))) + list(np.linspace(7, 12, int(n_tot_angles/4))))#np.linspace(-12, 12, n_tot_angles)#np.array([0 , 45 , 90 , -45]) angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) early_slope_edge=11 From 848156dd9d2bcb834f33591a1377a9451e1d919f Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 20 May 2025 16:51:08 +0200 Subject: [PATCH 13/40] mb reading order now can be done faster. Text regions are clustered using dilation, and mb reading order needs to be implemented for fewer regions --- src/eynollah/eynollah.py | 181 +++++++++++++++++++++++++++++---- src/eynollah/utils/__init__.py | 2 +- 2 files changed, 163 insertions(+), 20 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 08a781c..eb5c860 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -32,6 +32,7 @@ from numba import cuda from skimage.morphology import skeletonize from ocrd import OcrdPage from ocrd_utils import getLogger, tf_disable_interactive_logs +import statistics try: import torch @@ -797,7 +798,7 @@ class Eynollah: self, patches, img, model, n_batch_inference=1, marginal_of_patch_percent=0.1, thresholding_for_some_classes_in_light_version=False, - thresholding_for_artificial_class_in_light_version=False, threshold_art_class_textline=0.1): + thresholding_for_artificial_class_in_light_version=False, thresholding_for_fl_light_version=False, threshold_art_class_textline=0.1): self.logger.debug("enter do_prediction") img_height_model = model.layers[-1].output_shape[1] @@ -822,6 +823,15 @@ class Eynollah: skeleton_art = skeleton_art*1 seg[skeleton_art==1]=2 + + if thresholding_for_fl_light_version: + seg_header = label_p_pred[0,:,:,2] + + seg_header[seg_header<0.2] = 0 + seg_header[seg_header>0] =1 + + seg[seg_header==1]=2 + seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) prediction_true = resize_image(seg_color, img_h_page, img_w_page).astype(np.uint8) return prediction_true @@ -1613,10 +1623,11 @@ class Eynollah: model_region = self.model_region_fl if patches else self.model_region_fl_np if self.light_version: - pass + thresholding_for_fl_light_version = True elif not patches: img = otsu_copy_binary(img).astype(np.uint8) prediction_regions = None + thresholding_for_fl_light_version = False elif cols: img = otsu_copy_binary(img).astype(np.uint8) if cols == 1: @@ -1632,7 +1643,7 @@ class Eynollah: else: img = resize_image(img, int(img_height_h * 2500 / float(img_width_h)), 2500).astype(np.uint8) - prediction_regions = self.do_prediction(patches, img, model_region, marginal_of_patch_percent=0.1, n_batch_inference=3) + prediction_regions = self.do_prediction(patches, img, model_region, marginal_of_patch_percent=0.1, n_batch_inference=3, thresholding_for_fl_light_version=thresholding_for_fl_light_version) prediction_regions = resize_image(prediction_regions, img_height_h, img_width_h) self.logger.debug("exit extract_text_regions") return prediction_regions, prediction_regions @@ -3544,9 +3555,87 @@ class Eynollah: return model def do_order_of_regions_with_model(self, contours_only_text_parent, contours_only_text_parent_h, text_regions_p): + #cv2.imwrite('textregions.png', text_regions_p*50) + min_cont_size_to_be_dilated = 10 + if len(contours_only_text_parent)>min_cont_size_to_be_dilated: + ver_kernel = np.ones((5, 1), dtype=np.uint8) + + cx_conts, 
cy_conts, x_min_conts, x_max_conts, y_min_conts, y_max_conts, _ = find_new_features_of_contours(contours_only_text_parent) + args_cont_located = np.array(range(len(contours_only_text_parent))) + + diff_y_conts = np.abs(y_max_conts[:]-y_min_conts) + diff_x_conts = np.abs(x_max_conts[:]-x_min_conts) + + mean_x = statistics.mean(diff_x_conts) + median_x = statistics.median(diff_x_conts) + + + diff_x_ratio= diff_x_conts/mean_x + + args_cont_located_excluded = args_cont_located[diff_x_ratio>=1.3] + args_cont_located_included = args_cont_located[diff_x_ratio<1.3] + + contours_only_text_parent_excluded = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]>=1.3]#contours_only_text_parent[diff_x_ratio>=1.3] + contours_only_text_parent_included = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]<1.3]#contours_only_text_parent[diff_x_ratio<1.3] + + + cx_conts_excluded = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]>=1.3]#cx_conts[diff_x_ratio>=1.3] + cx_conts_included = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]<1.3]#cx_conts[diff_x_ratio<1.3] + + cy_conts_excluded = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]>=1.3]#cy_conts[diff_x_ratio>=1.3] + cy_conts_included = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]<1.3]#cy_conts[diff_x_ratio<1.3] + + #print(diff_x_ratio, 'ratio') + text_regions_p = text_regions_p.astype('uint8') + + if len(contours_only_text_parent_excluded)>0: + textregion_par = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1])).astype('uint8') + textregion_par = cv2.fillPoly(textregion_par, pts=contours_only_text_parent_included, color=(1,1)) + else: + textregion_par = (text_regions_p[:,:]==1)*1 + textregion_par = textregion_par.astype('uint8') + + + text_regions_p_textregions_dilated = cv2.dilate(textregion_par , ver_kernel, iterations=8) + text_regions_p_textregions_dilated[text_regions_p[:,:]>1] = 0 + + #cv2.imwrite('textregions_dilated.png', text_regions_p_textregions_dilated*255) + + + contours_only_dilated, hir_on_text_dilated = return_contours_of_image(text_regions_p_textregions_dilated) + contours_only_dilated = return_parent_contours(contours_only_dilated, hir_on_text_dilated) + + indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located = self.return_indexes_of_contours_loctaed_inside_another_list_of_contours(contours_only_dilated, contours_only_text_parent_included, cx_conts_included, cy_conts_included, args_cont_located_included) + + + if len(args_cont_located_excluded)>0: + for ind in args_cont_located_excluded: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + #print(len( np.unique(flattened_array)), 'indexes_of_located_cont uniques') + + missing_textregions = list( set(np.array(range(len(contours_only_text_parent))) ) - set(np.unique(flattened_array)) ) + #print(missing_textregions, 'missing_textregions') + + for ind in missing_textregions: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + + if contours_only_text_parent_h: + for vi in 
range(len(contours_only_text_parent_h)): + indexes_of_located_cont.append(int(vi+len(contours_only_text_parent))) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + y_len = text_regions_p.shape[0] x_len = text_regions_p.shape[1] - img_poly = np.zeros((y_len,x_len), dtype='uint8') img_poly[text_regions_p[:,:]==1] = 1 @@ -3554,25 +3643,24 @@ class Eynollah: img_poly[text_regions_p[:,:]==3] = 4 img_poly[text_regions_p[:,:]==6] = 5 - - ###temp - ##sep_mask = (img_poly==5)*1 - ##sep_mask = sep_mask.astype('uint8') - ##sep_mask = cv2.erode(sep_mask, kernel=KERNEL, iterations=2) - ##img_poly[img_poly==5] = 0 - ##img_poly[sep_mask==1] = 5 - ### - img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') if contours_only_text_parent_h: _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours( contours_only_text_parent_h) for j in range(len(cy_main)): img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12, - int(x_min_main[j]):int(x_max_main[j])] = 1 - co_text_all = contours_only_text_parent + contours_only_text_parent_h + int(x_min_main[j]):int(x_max_main[j])] = 1 + co_text_all_org = contours_only_text_parent + contours_only_text_parent_h + if len(contours_only_text_parent)>min_cont_size_to_be_dilated: + co_text_all = contours_only_dilated + contours_only_text_parent_h + else: + co_text_all = contours_only_text_parent + contours_only_text_parent_h else: - co_text_all = contours_only_text_parent + co_text_all_org = contours_only_text_parent + if len(contours_only_text_parent)>min_cont_size_to_be_dilated: + co_text_all = contours_only_dilated + else: + co_text_all = contours_only_text_parent if not len(co_text_all): return [], [] @@ -3651,8 +3739,26 @@ class Eynollah: break ordered = [i[0] for i in ordered] - region_ids = ['region_%04d' % i for i in range(len(co_text_all))] - return ordered, region_ids + + if len(contours_only_text_parent)>min_cont_size_to_be_dilated: + org_contours_indexes = [] + for ind in range(len(ordered)): + region_with_curr_order = ordered[ind] + if region_with_curr_order < len(contours_only_dilated): + if np.isscalar(indexes_of_located_cont[region_with_curr_order]): + org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + else: + arg_sort_located_cont = np.argsort(center_y_coordinates_of_located[region_with_curr_order]) + org_contours_indexes = org_contours_indexes + list(np.array(indexes_of_located_cont[region_with_curr_order])[arg_sort_located_cont]) ##org_contours_indexes + list ( + else: + org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + + region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] + return org_contours_indexes, region_ids + else: + region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] + return ordered, region_ids + def return_start_and_end_of_common_text_of_textline_ocr(self, textline_image, ind_tot): width = np.shape(textline_image)[1] @@ -4293,6 +4399,29 @@ class Eynollah: contours[ind_u_a_trs].pop(ittrd) return contours + + def return_indexes_of_contours_loctaed_inside_another_list_of_contours(self, contours, contours_loc, cx_main_loc, cy_main_loc, indexes_loc): + indexes_of_located_cont = [] + center_x_coordinates_of_located = [] + center_y_coordinates_of_located = [] + #M_main_tot = [cv2.moments(contours_loc[j]) + #for j in range(len(contours_loc))] + 
#cx_main_loc = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + #cy_main_loc = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + + for ij in range(len(contours)): + results = [cv2.pointPolygonTest(contours[ij], (cx_main_loc[ind], cy_main_loc[ind]), False) + for ind in range(len(cy_main_loc)) ] + results = np.array(results) + indexes_in = np.where((results == 0) | (results == 1)) + indexes = indexes_loc[indexes_in]# [(results == 0) | (results == 1)]#np.where((results == 0) | (results == 1)) + + indexes_of_located_cont.append(indexes) + center_x_coordinates_of_located.append(np.array(cx_main_loc)[indexes_in] ) + center_y_coordinates_of_located.append(np.array(cy_main_loc)[indexes_in] ) + + return indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located + def filter_contours_without_textline_inside( self, contours,text_con_org, contours_textline, contours_only_text_parent_d_ordered, conf_contours_textregions): @@ -4986,8 +5115,10 @@ class Eynollah: if self.full_layout: if self.reading_order_machine_based: + tror = time.time() order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( contours_only_text_parent, contours_only_text_parent_h, text_regions_p) + print('time spend for mb ro', time.time()-tror) else: if np.abs(slope_deskew) < SLOPE_THRESHOLD: order_text_new, id_of_texts_tot = self.do_order_of_regions( @@ -5619,8 +5750,15 @@ class Eynollah_ocr: mask_poly = np.zeros(img.shape) mask_poly = cv2.fillPoly(mask_poly, pts=[textline_coords], color=(1, 1, 1)) + mask_poly = mask_poly[y:y+h, x:x+w, :] img_crop = img_poly_on_img[y:y+h, x:x+w, :] + + if angle_degrees<=15: + if mask_poly[:,:,0].sum() /float(w*h) < 0.6 and w_scaled > 520: + cv2.imwrite(file_name+'_desk.png', img_crop) + + print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') if not self.do_not_mask_with_textline_contour: if angle_degrees > 15: better_des_slope = self.get_orientation_moments(textline_coords) @@ -5634,6 +5772,11 @@ class Eynollah_ocr: mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] + + if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.6 and w_scaled > 520: + cv2.imwrite(file_name+'_desk.png', img_crop) + + print(file_name,w_n*h_n , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w_n*h_n) , 'ikiiiiii') img_crop[mask_poly==0] = 255 @@ -5641,7 +5784,7 @@ class Eynollah_ocr: img_crop_bin[mask_poly==0] = 255 if not self.export_textline_images_and_text: - if w_scaled < 1.5*image_width: + if w_scaled < 640:#1.5*image_width: img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) if angle_degrees > 15: diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index c5962f8..7fa4a7b 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -992,7 +992,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light( (regions_model_full[:,:,0]==2)).sum() pixels_main = all_pixels - pixels_header - if (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ): + if ( (pixels_header/float(pixels_main)>=0.6) and ( (length_con[ii]/float(height_con[ii]) )>=1.3 ) and ( (length_con[ii]/float(height_con[ii]) )<=3 )) or ( (pixels_header/float(pixels_main)>=0.3) and ( (length_con[ii]/float(height_con[ii]) )>=3 ) ): 
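# The condition above is the retuned header heuristic: pixels_header/pixels_main is the
# ratio of header-class to main-text pixels inside the region, and length_con/height_con
# is roughly its width-to-height ratio. A standalone restatement of the same rule
# (is_header and its argument names are illustrative, not identifiers from the codebase):
def is_header(header_pixel_ratio, aspect_ratio):
    if aspect_ratio >= 3:
        return header_pixel_ratio >= 0.3
    if aspect_ratio >= 1.3:
        return header_pixel_ratio >= 0.6
    return False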
regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=2 contours_only_text_parent_head.append(con) if contours_only_text_parent_d_ordered is not None: From c0835665a9d6a6f16dc42ee287aaf5da064927bd Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 20 May 2025 19:01:52 +0200 Subject: [PATCH 14/40] ocr for curved lines --- src/eynollah/eynollah.py | 157 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 146 insertions(+), 11 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index eb5c860..912bc31 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5263,7 +5263,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_step_600000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + self.model_ocr_dir = dir_models + "/model_step_750000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5464,7 +5464,7 @@ class Eynollah_ocr: return cropped_textline - def rotate_image_with_padding(self, image, angle): + def rotate_image_with_padding(self, image, angle, border_value=(0,0,0)): # Get image dimensions (h, w) = image.shape[:2] @@ -5485,7 +5485,7 @@ class Eynollah_ocr: rotation_matrix[1, 2] += (new_h / 2) - center[1] # Perform the rotation - rotated_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h), borderValue=(0, 0, 0)) + rotated_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h), borderValue=border_value) return rotated_image @@ -5496,6 +5496,21 @@ class Eynollah_ocr: else: angle = 0.5 * np.arctan2(2 * moments["mu11"], moments["mu20"] - moments["mu02"]) return np.degrees(angle) # Convert radians to degrees + + + def get_orientation_moments_of_mask(self, mask): + mask=mask.astype('uint8') + print(mask.shape) + contours, _ = cv2.findContours(mask[:,:,0], cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + largest_contour = max(contours, key=cv2.contourArea) if contours else None + + moments = cv2.moments(largest_contour) + if moments["mu20"] - moments["mu02"] == 0: # Avoid division by zero + return 90 if moments["mu11"] > 0 else -90 + else: + angle = 0.5 * np.arctan2(2 * moments["mu11"], moments["mu20"] - moments["mu02"]) + return np.degrees(angle) # Convert radians to degrees def get_contours_and_bounding_boxes(self, mask): # Find contours in the binary mask @@ -5508,6 +5523,121 @@ class Eynollah_ocr: #bounding_boxes.append((x, y, w, h)) return x, y, w, h + + def return_splitting_point_of_image(self, image_to_spliited): + width = np.shape(image_to_spliited)[1] + height = np.shape(image_to_spliited)[0] + common_window = int(0.03*width) + + width1 = int ( common_window) + width2 = int ( width - common_window ) + + img_sum = np.sum(image_to_spliited[:,:,0], axis=0) + sum_smoothed = gaussian_filter1d(img_sum, 3) + + peaks_real, _ = find_peaks(sum_smoothed, height=0) + + peaks_real = peaks_real[(peaks_realwidth1)] + + arg_sort = np.argsort(sum_smoothed[peaks_real]) + arg_sort4 =arg_sort[::-1][:4] + peaks_sort_4 = peaks_real[arg_sort][::-1][:4] + + return np.sort(peaks_sort_4) + + def break_curved_line_into_small_pieces_and_then_merge(self, img_curved, mask_curved): + peaks_4 = self.return_splitting_point_of_image(img_curved) + + + + img_0 = img_curved[:, :peaks_4[0], :] + img_1 = img_curved[:, peaks_4[0]:peaks_4[1], 
:] + img_2 = img_curved[:, peaks_4[1]:peaks_4[2], :] + img_3 = img_curved[:, peaks_4[2]:peaks_4[3], :] + img_4 = img_curved[:, peaks_4[3]:, :] + + + mask_0 = mask_curved[:, :peaks_4[0], :] + mask_1 = mask_curved[:, peaks_4[0]:peaks_4[1], :] + mask_2 = mask_curved[:, peaks_4[1]:peaks_4[2], :] + mask_3 = mask_curved[:, peaks_4[2]:peaks_4[3], :] + mask_4 = mask_curved[:, peaks_4[3]:, :] + + cv2.imwrite("split0.png", img_0) + cv2.imwrite("split1.png", img_1) + cv2.imwrite("split2.png", img_2) + cv2.imwrite("split3.png", img_3) + + or_ma_0 = self.get_orientation_moments_of_mask(mask_0) + or_ma_1 = self.get_orientation_moments_of_mask(mask_1) + or_ma_2 = self.get_orientation_moments_of_mask(mask_2) + or_ma_3 = self.get_orientation_moments_of_mask(mask_3) + or_ma_4 = self.get_orientation_moments_of_mask(mask_4) + + imgs_tot = [] + imgs_tot.append([img_0, mask_0, or_ma_0] ) + imgs_tot.append([img_1, mask_1, or_ma_1]) + imgs_tot.append([img_2, mask_2, or_ma_2]) + imgs_tot.append([img_3, mask_3, or_ma_3]) + imgs_tot.append([img_4, mask_4, or_ma_4]) + + w_tot_des_list = [] + w_tot_des = 0 + imgs_deskewed_list = [] + for ind in range(len(imgs_tot)): + img_in = imgs_tot[ind][0] + mask_in = imgs_tot[ind][1] + ori_in = imgs_tot[ind][2] + + if abs(ori_in)<45: + img_in_des = self.rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) ) + mask_in_des = self.rotate_image_with_padding(mask_in, ori_in) + mask_in_des = mask_in_des.astype('uint8') + + #new bounding box + x_n, y_n, w_n, h_n = self.get_contours_and_bounding_boxes(mask_in_des[:,:,0]) + + mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + img_in_des = resize_image(img_in_des, 32, w_relative) + + + else: + img_in_des = np.copy(img_in) + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + img_in_des = resize_image(img_in_des, 32, w_relative) + + w_tot_des+=img_in_des.shape[1] + w_tot_des_list.append(img_in_des.shape[1]) + imgs_deskewed_list.append(img_in_des) + + + + + img_final_deskewed = np.zeros((32, w_tot_des, 3))+255 + + w_indexer = 0 + for ind in range(len(w_tot_des_list)): + img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:] + w_indexer = w_indexer+w_tot_des_list[ind] + + #cv2.imwrite('final.png', img_final_deskewed) + #print(or_ma_0, or_ma_1, or_ma_2, or_ma_3, or_ma_4, 'orients') + + ##cv2.imwrite("split4.png", img_curved[:, peaks_4[3]:peaks_4[4], :]) + ##cv2.imwrite("split5.png", img_curved[:, peaks_4[4]:peaks_4[5], :]) + ##cv2.imwrite("split6.png", img_curved[:, peaks_4[5]:peaks_4[6], :]) + + ##cv2.imwrite("split7.png", img_curved[:, peaks_4[6]:peaks_4[7], :]) + ##cv2.imwrite("split8.png", img_curved[:, peaks_4[7]:peaks_4[8], :]) + ##cv2.imwrite("split9.png", img_curved[:, peaks_4[8]:peaks_4[9], :]) + + + #cv2.imwrite("split4.png", img_4) + #sys.exit() + return img_final_deskewed def run(self): ls_imgs = os.listdir(self.dir_in) @@ -5754,11 +5884,9 @@ class Eynollah_ocr: mask_poly = mask_poly[y:y+h, x:x+w, :] img_crop = img_poly_on_img[y:y+h, x:x+w, :] - if angle_degrees<=15: - if mask_poly[:,:,0].sum() /float(w*h) < 0.6 and w_scaled > 520: - cv2.imwrite(file_name+'_desk.png', img_crop) + - print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') + #print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') if not 
self.do_not_mask_with_textline_contour: if angle_degrees > 15: better_des_slope = self.get_orientation_moments(textline_coords) @@ -5773,12 +5901,19 @@ class Eynollah_ocr: mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] - if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.6 and w_scaled > 520: - cv2.imwrite(file_name+'_desk.png', img_crop) + img_crop[mask_poly==0] = 255 - print(file_name,w_n*h_n , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w_n*h_n) , 'ikiiiiii') + if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 100: + img_crop = self.break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) - img_crop[mask_poly==0] = 255 + #print(file_name,w_n*h_n , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w_n*h_n) , 'ikiiiiii') + else: + img_crop[mask_poly==0] = 255 + if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 100: + img_crop = self.break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + + + if self.prediction_with_both_of_rgb_and_bin: img_crop_bin[mask_poly==0] = 255 From f94fc9973bc370e9b780c8114520aec91c62e78b Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 21 May 2025 14:39:31 +0200 Subject: [PATCH 15/40] Implement hyphenated textline merging in OCR engine and a bug fixed for curved textline OCR --- src/eynollah/eynollah.py | 157 ++++++++++++++++++--------------------- 1 file changed, 71 insertions(+), 86 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 912bc31..6771db0 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5500,7 +5500,6 @@ class Eynollah_ocr: def get_orientation_moments_of_mask(self, mask): mask=mask.astype('uint8') - print(mask.shape) contours, _ = cv2.findContours(mask[:,:,0], cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) largest_contour = max(contours, key=cv2.contourArea) if contours else None @@ -5547,97 +5546,69 @@ class Eynollah_ocr: def break_curved_line_into_small_pieces_and_then_merge(self, img_curved, mask_curved): peaks_4 = self.return_splitting_point_of_image(img_curved) - - - - img_0 = img_curved[:, :peaks_4[0], :] - img_1 = img_curved[:, peaks_4[0]:peaks_4[1], :] - img_2 = img_curved[:, peaks_4[1]:peaks_4[2], :] - img_3 = img_curved[:, peaks_4[2]:peaks_4[3], :] - img_4 = img_curved[:, peaks_4[3]:, :] - - - mask_0 = mask_curved[:, :peaks_4[0], :] - mask_1 = mask_curved[:, peaks_4[0]:peaks_4[1], :] - mask_2 = mask_curved[:, peaks_4[1]:peaks_4[2], :] - mask_3 = mask_curved[:, peaks_4[2]:peaks_4[3], :] - mask_4 = mask_curved[:, peaks_4[3]:, :] - - cv2.imwrite("split0.png", img_0) - cv2.imwrite("split1.png", img_1) - cv2.imwrite("split2.png", img_2) - cv2.imwrite("split3.png", img_3) - - or_ma_0 = self.get_orientation_moments_of_mask(mask_0) - or_ma_1 = self.get_orientation_moments_of_mask(mask_1) - or_ma_2 = self.get_orientation_moments_of_mask(mask_2) - or_ma_3 = self.get_orientation_moments_of_mask(mask_3) - or_ma_4 = self.get_orientation_moments_of_mask(mask_4) - - imgs_tot = [] - imgs_tot.append([img_0, mask_0, or_ma_0] ) - imgs_tot.append([img_1, mask_1, or_ma_1]) - imgs_tot.append([img_2, mask_2, or_ma_2]) - imgs_tot.append([img_3, mask_3, or_ma_3]) - imgs_tot.append([img_4, mask_4, or_ma_4]) - - w_tot_des_list = [] - w_tot_des = 0 - imgs_deskewed_list = [] - for ind in range(len(imgs_tot)): - img_in = imgs_tot[ind][0] - mask_in = imgs_tot[ind][1] - ori_in = imgs_tot[ind][2] + if len(peaks_4)>0: + imgs_tot = [] - if abs(ori_in)<45: - img_in_des = 
self.rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) ) - mask_in_des = self.rotate_image_with_padding(mask_in, ori_in) - mask_in_des = mask_in_des.astype('uint8') + for ind in range(len(peaks_4)+1): + if ind==0: + img = img_curved[:, :peaks_4[ind], :] + mask = mask_curved[:, :peaks_4[ind], :] + elif ind==len(peaks_4): + img = img_curved[:, peaks_4[ind-1]:, :] + mask = mask_curved[:, peaks_4[ind-1]:, :] + else: + img = img_curved[:, peaks_4[ind-1]:peaks_4[ind], :] + mask = mask_curved[:, peaks_4[ind-1]:peaks_4[ind], :] + + or_ma = self.get_orientation_moments_of_mask(mask) + + imgs_tot.append([img, mask, or_ma] ) + + + w_tot_des_list = [] + w_tot_des = 0 + imgs_deskewed_list = [] + for ind in range(len(imgs_tot)): + img_in = imgs_tot[ind][0] + mask_in = imgs_tot[ind][1] + ori_in = imgs_tot[ind][2] - #new bounding box - x_n, y_n, w_n, h_n = self.get_contours_and_bounding_boxes(mask_in_des[:,:,0]) + if abs(ori_in)<45: + img_in_des = self.rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) ) + mask_in_des = self.rotate_image_with_padding(mask_in, ori_in) + mask_in_des = mask_in_des.astype('uint8') + + #new bounding box + x_n, y_n, w_n, h_n = self.get_contours_and_bounding_boxes(mask_in_des[:,:,0]) + + mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + img_in_des = resize_image(img_in_des, 32, w_relative) + + + else: + img_in_des = np.copy(img_in) + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + img_in_des = resize_image(img_in_des, 32, w_relative) + + w_tot_des+=img_in_des.shape[1] + w_tot_des_list.append(img_in_des.shape[1]) + imgs_deskewed_list.append(img_in_des) - mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] - img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] - w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) - img_in_des = resize_image(img_in_des, 32, w_relative) - else: - img_in_des = np.copy(img_in) - w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) - img_in_des = resize_image(img_in_des, 32, w_relative) - - w_tot_des+=img_in_des.shape[1] - w_tot_des_list.append(img_in_des.shape[1]) - imgs_deskewed_list.append(img_in_des) + img_final_deskewed = np.zeros((32, w_tot_des, 3))+255 - - - - img_final_deskewed = np.zeros((32, w_tot_des, 3))+255 - - w_indexer = 0 - for ind in range(len(w_tot_des_list)): - img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:] - w_indexer = w_indexer+w_tot_des_list[ind] - - #cv2.imwrite('final.png', img_final_deskewed) - #print(or_ma_0, or_ma_1, or_ma_2, or_ma_3, or_ma_4, 'orients') - - ##cv2.imwrite("split4.png", img_curved[:, peaks_4[3]:peaks_4[4], :]) - ##cv2.imwrite("split5.png", img_curved[:, peaks_4[4]:peaks_4[5], :]) - ##cv2.imwrite("split6.png", img_curved[:, peaks_4[5]:peaks_4[6], :]) - - ##cv2.imwrite("split7.png", img_curved[:, peaks_4[6]:peaks_4[7], :]) - ##cv2.imwrite("split8.png", img_curved[:, peaks_4[7]:peaks_4[8], :]) - ##cv2.imwrite("split9.png", img_curved[:, peaks_4[8]:peaks_4[9], :]) - - - #cv2.imwrite("split4.png", img_4) - #sys.exit() - return img_final_deskewed + w_indexer = 0 + for ind in range(len(w_tot_des_list)): + img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:] + w_indexer = w_indexer+w_tot_des_list[ind] + return img_final_deskewed + else: + return img_curved def run(self): ls_imgs = 
os.listdir(self.dir_in) @@ -6144,7 +6115,21 @@ class Eynollah_ocr: text_by_textregion = [] for ind in unique_cropped_lines_region_indexer: extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] - text_by_textregion.append("".join(extracted_texts_merged_un)) + if len(extracted_texts_merged_un)>1: + text_by_textregion_ind = "" + next_glue = "" + for indt in range(len(extracted_texts_merged_un)): + if extracted_texts_merged_un[indt].endswith('⸗') or extracted_texts_merged_un[indt].endswith('-'): + text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt][:-1] + next_glue = "" + else: + text_by_textregion_ind = text_by_textregion_ind + next_glue + extracted_texts_merged_un[indt] + next_glue = " " + text_by_textregion.append(text_by_textregion_ind) + + else: + text_by_textregion.append(" ".join(extracted_texts_merged_un)) + #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion') indexer = 0 indexer_textregion = 0 From a0647eff9391fbe398c1de9154068f3841ad22a7 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 21 May 2025 17:42:44 +0200 Subject: [PATCH 16/40] enhancing curved lines OCR --- src/eynollah/eynollah.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6771db0..b510218 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5532,14 +5532,12 @@ class Eynollah_ocr: width2 = int ( width - common_window ) img_sum = np.sum(image_to_spliited[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) + sum_smoothed = gaussian_filter1d(img_sum, 1) peaks_real, _ = find_peaks(sum_smoothed, height=0) - peaks_real = peaks_real[(peaks_realwidth1)] - + arg_sort = np.argsort(sum_smoothed[peaks_real]) - arg_sort4 =arg_sort[::-1][:4] peaks_sort_4 = peaks_real[arg_sort][::-1][:4] return np.sort(peaks_sort_4) @@ -5585,12 +5583,16 @@ class Eynollah_ocr: img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] img_in_des = resize_image(img_in_des, 32, w_relative) else: img_in_des = np.copy(img_in) w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] img_in_des = resize_image(img_in_des, 32, w_relative) w_tot_des+=img_in_des.shape[1] From d4f6e10251f23ff01b15cc7736067c5af30b1278 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 23 May 2025 15:55:03 +0200 Subject: [PATCH 17/40] commit 21ec4fb is picked + rnn ocr at the same time with segmentation + enhancement of mb reading order --- src/eynollah/cli.py | 15 +- src/eynollah/eynollah.py | 775 +++++++++++--------------------- src/eynollah/utils/utils_ocr.py | 435 ++++++++++++++++++ src/eynollah/writer.py | 30 +- 4 files changed, 729 insertions(+), 526 deletions(-) create mode 100644 src/eynollah/utils/utils_ocr.py diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 99961c9..cd56833 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -225,6 +225,17 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) is_flag=True, help="if this parameter set to true, this tool will try to do ocr", ) +@click.option( + "--transformer_ocr", + "-tr/-notr", + is_flag=True, + help="if this parameter set to true, this tool will apply transformer ocr", +) +@click.option( + 
"--batch_size_ocr", + "-bs_ocr", + help="number of inference batch size of ocr model. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively", +) @click.option( "--num_col_upper", "-ncu", @@ -258,7 +269,7 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) help="Override log level globally to this", ) -def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level): +def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, transformer_ocr, batch_size_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) @@ -305,6 +316,8 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ ignore_page_extraction=ignore_page_extraction, reading_order_machine_based=reading_order_machine_based, do_ocr=do_ocr, + transformer_ocr=transformer_ocr, + batch_size_ocr=batch_size_ocr, num_col_upper=num_col_upper, num_col_lower=num_col_lower, skip_layout_and_reading_order=skip_layout_and_reading_order, diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index b510218..2564150 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -80,6 +80,13 @@ from .utils.rotate import ( rotation_not_90_func_full_layout, rotation_image_new ) +from .utils.utils_ocr import ( + return_textline_contour_with_added_box_coordinate, + preprocess_and_resize_image_for_ocrcnn_model, + return_textlines_split_if_needed, + decode_batch_predictions, + return_rnn_cnn_ocr_of_given_textlines +) from .utils.separate_lines import ( textline_contours_postprocessing, separate_lines_new2, @@ -199,6 +206,8 @@ class Eynollah: ignore_page_extraction : bool = False, reading_order_machine_based : bool = False, do_ocr : bool = False, + transformer_ocr: bool = False, + batch_size_ocr: Optional[int] = None, num_col_upper : Optional[int] = None, num_col_lower : Optional[int] = None, threshold_art_class_layout: Optional[float] = None, @@ -232,6 +241,7 @@ class Eynollah: self.ignore_page_extraction = ignore_page_extraction self.skip_layout_and_reading_order = skip_layout_and_reading_order self.ocr = do_ocr + self.tr = transformer_ocr if num_col_upper: self.num_col_upper = int(num_col_upper) else: @@ -273,7 +283,7 @@ class Eynollah: self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425" self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314" self.model_region_dir_p_ens_light_only_images_extraction = dir_models + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" - self.model_reading_order_dir = dir_models + "/model_step_2500000_mb_ro"#"/model_ens_reading_order_machine_based" + self.model_reading_order_dir = dir_models + 
"/model_step_4800000_mb_ro"#"/model_ens_reading_order_machine_based" #"/modelens_12sp_elay_0_3_4__3_6_n" #"/modelens_earlylayout_12spaltige_2_3_5_6_7_8" #"/modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18" @@ -300,8 +310,10 @@ class Eynollah: else: #"/eynollah-textline_20210425" self.model_textline_dir = dir_models + "/modelens_textline_0_1__2_4_16092024" - if self.ocr: + if self.ocr and self.tr: self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" + elif self.ocr and not self.tr: + self.model_ocr_dir = dir_models + "/model_step_750000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" if self.tables: if self.light_version: self.model_table_dir = dir_models + "/modelens_table_0t4_201124" @@ -341,11 +353,37 @@ class Eynollah: self.model_region_fl = self.our_load_model(self.model_region_dir_fully) if self.reading_order_machine_based: self.model_reading_order = self.our_load_model(self.model_reading_order_dir) - if self.ocr: + if self.ocr and self.tr: self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #("microsoft/trocr-base-printed")#("microsoft/trocr-base-handwritten") self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") + elif self.ocr and not self.tr: + model_ocr = load_model(self.model_ocr_dir , compile=False) + + self.prediction_model = tf.keras.models.Model( + model_ocr.get_layer(name = "image").input, + model_ocr.get_layer(name = "dense2").output) + if not batch_size_ocr: + self.b_s_ocr = 8 + else: + self.b_s_ocr = int(batch_size_ocr) + + + with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: + characters = json.load(config_file) + + + AUTOTUNE = tf.data.AUTOTUNE + + # Mapping characters to integers. + char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + + # Mapping integers back to original characters. 
+ self.num_to_char = StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True + ) + if self.tables: self.model_table = self.our_load_model(self.model_table_dir) @@ -1325,11 +1363,11 @@ class Eynollah: seg_art[seg_art>0] =1 seg_line = label_p_pred[:,:,:,3] - seg_line[seg_line>0.3] =1#seg_line[seg_line>0.5] =1#seg_line[seg_line>0.1] =1 + seg_line[seg_line>0.4] =1#seg_line[seg_line>0.5] =1#seg_line[seg_line>0.1] =1 seg_line[seg_line<1] =0 ##seg[seg_art==1]=4 - seg[(seg_line==1) & (seg==0)]=3 + #seg[(seg_line==1) & (seg==0)]=3 if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[:,:,:,2] @@ -2060,7 +2098,7 @@ class Eynollah: ###img_bin = np.copy(prediction_bin) ###else: ###img_bin = np.copy(img_resized) - if self.ocr and not self.input_binary: + if (self.ocr and self.tr) and not self.input_binary: prediction_bin = self.do_prediction(True, img_resized, self.model_bin, n_batch_inference=5) prediction_bin = 255 * (prediction_bin[:,:,0] == 0) prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2) @@ -3485,8 +3523,10 @@ class Eynollah: # 6 is the separators lable in old full layout model # 4 is the drop capital class in old full layout model # in the new full layout drop capital is 3 and separators are 5 - - text_regions_p[:,:][regions_fully[:,:,0]==5]=6 + + # the separators in full layout will not be written on layout + if not self.reading_order_machine_based: + text_regions_p[:,:][regions_fully[:,:,0]==5]=6 ###regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 3] = 4 #text_regions_p[:,:][regions_fully[:,:,0]==6]=6 @@ -3555,11 +3595,37 @@ class Eynollah: return model def do_order_of_regions_with_model(self, contours_only_text_parent, contours_only_text_parent_h, text_regions_p): - #cv2.imwrite('textregions.png', text_regions_p*50) + + height1 =672#448 + width1 = 448#224 + + height2 =672#448 + width2= 448#224 + + height3 =672#448 + width3 = 448#224 + + inference_bs = 3 + + cv2.imwrite('textregions.png', text_regions_p*50) + cv2.imwrite('sep.png', (text_regions_p[:,:]==6)*255) + + ver_kernel = np.ones((5, 1), dtype=np.uint8) + hor_kernel = np.ones((1, 5), dtype=np.uint8) + + + + #separators = (text_regions_p[:,:]==6)*1 + #text_regions_p[text_regions_p[:,:]==6] = 0 + #separators = separators.astype('uint8') + + #separators = cv2.erode(separators , hor_kernel, iterations=1) + #text_regions_p[separators[:,:]==1] = 6 + + #cv2.imwrite('sep_new.png', (text_regions_p[:,:]==6)*255) + min_cont_size_to_be_dilated = 10 if len(contours_only_text_parent)>min_cont_size_to_be_dilated: - ver_kernel = np.ones((5, 1), dtype=np.uint8) - cx_conts, cy_conts, x_min_conts, x_max_conts, y_min_conts, y_max_conts, _ = find_new_features_of_contours(contours_only_text_parent) args_cont_located = np.array(range(len(contours_only_text_parent))) @@ -3595,12 +3661,13 @@ class Eynollah: textregion_par = (text_regions_p[:,:]==1)*1 textregion_par = textregion_par.astype('uint8') - - text_regions_p_textregions_dilated = cv2.dilate(textregion_par , ver_kernel, iterations=8) + text_regions_p_textregions_dilated = cv2.erode(textregion_par , hor_kernel, iterations=2) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=4) + text_regions_p_textregions_dilated = cv2.erode(text_regions_p_textregions_dilated , hor_kernel, iterations=1) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=5) text_regions_p_textregions_dilated[text_regions_p[:,:]>1] = 0 - 
#cv2.imwrite('textregions_dilated.png', text_regions_p_textregions_dilated*255) - + cv2.imwrite('text_regions_p_textregions_dilated.png', text_regions_p_textregions_dilated*255) contours_only_dilated, hir_on_text_dilated = return_contours_of_image(text_regions_p_textregions_dilated) contours_only_dilated = return_parent_contours(contours_only_dilated, hir_on_text_dilated) @@ -3664,7 +3731,8 @@ class Eynollah: if not len(co_text_all): return [], [] - + print(len(co_text_all), "co_text_all") + print(len(co_text_all_org), "co_text_all_org") labels_con = np.zeros((int(y_len /6.), int(x_len/6.), len(co_text_all)), dtype=bool) co_text_all = [(i/6).astype(int) for i in co_text_all] for i in range(len(co_text_all)): @@ -3675,21 +3743,13 @@ class Eynollah: cv2.fillPoly(img, pts=[co_text_all[i]], color=(1,)) labels_con[:,:,i] = img - height1 =672#448 - width1 = 448#224 - - height2 =672#448 - width2= 448#224 - - height3 =672#448 - width3 = 448#224 labels_con = resize_image(labels_con.astype(np.uint8), height1, width1).astype(bool) img_header_and_sep = resize_image(img_header_and_sep, height1, width1) img_poly = resize_image(img_poly, height3, width3) - inference_bs = 3 + input_1 = np.zeros((inference_bs, height1, width1, 3)) ordered = [list(range(len(co_text_all)))] index_update = 0 @@ -3760,217 +3820,213 @@ class Eynollah: return ordered, region_ids - def return_start_and_end_of_common_text_of_textline_ocr(self, textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.2*width) + ####def return_start_and_end_of_common_text_of_textline_ocr(self, textline_image, ind_tot): + ####width = np.shape(textline_image)[1] + ####height = np.shape(textline_image)[0] + ####common_window = int(0.2*width) - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) + ####width1 = int ( width/2. - common_window ) + ####width2 = int ( width/2. 
+ common_window ) - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) + ####img_sum = np.sum(textline_image[:,:,0], axis=0) + ####sum_smoothed = gaussian_filter1d(img_sum, 3) - peaks_real, _ = find_peaks(sum_smoothed, height=0) - if len(peaks_real)>70: + ####peaks_real, _ = find_peaks(sum_smoothed, height=0) + ####if len(peaks_real)>70: - peaks_real = peaks_real[(peaks_realwidth1)] + ####peaks_real = peaks_real[(peaks_realwidth1)] - arg_sort = np.argsort(sum_smoothed[peaks_real]) - arg_sort4 =arg_sort[::-1][:4] - peaks_sort_4 = peaks_real[arg_sort][::-1][:4] - argsort_sorted = np.argsort(peaks_sort_4) + ####arg_sort = np.argsort(sum_smoothed[peaks_real]) + ####arg_sort4 =arg_sort[::-1][:4] + ####peaks_sort_4 = peaks_real[arg_sort][::-1][:4] + ####argsort_sorted = np.argsort(peaks_sort_4) - first_4_sorted = peaks_sort_4[argsort_sorted] - y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] - #print(first_4_sorted,'first_4_sorted') + ####first_4_sorted = peaks_sort_4[argsort_sorted] + ####y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] + #####print(first_4_sorted,'first_4_sorted') - arg_sortnew = np.argsort(y_4_sorted) - peaks_final =np.sort( first_4_sorted[arg_sortnew][2:] ) + ####arg_sortnew = np.argsort(y_4_sorted) + ####peaks_final =np.sort( first_4_sorted[arg_sortnew][2:] ) - #plt.figure(ind_tot) - #plt.imshow(textline_image) - #plt.plot([peaks_final[0], peaks_final[0]], [0, height-1]) - #plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) - #plt.savefig('./'+str(ind_tot)+'.png') + #####plt.figure(ind_tot) + #####plt.imshow(textline_image) + #####plt.plot([peaks_final[0], peaks_final[0]], [0, height-1]) + #####plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) + #####plt.savefig('./'+str(ind_tot)+'.png') - return peaks_final[0], peaks_final[1] - else: - pass + ####return peaks_final[0], peaks_final[1] + ####else: + ####pass - def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(self, textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.06*width) + ##def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(self, textline_image, ind_tot): + ##width = np.shape(textline_image)[1] + ##height = np.shape(textline_image)[0] + ##common_window = int(0.06*width) - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) + ##width1 = int ( width/2. - common_window ) + ##width2 = int ( width/2. 
+ common_window ) - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) + ##img_sum = np.sum(textline_image[:,:,0], axis=0) + ##sum_smoothed = gaussian_filter1d(img_sum, 3) - peaks_real, _ = find_peaks(sum_smoothed, height=0) - if len(peaks_real)>70: - #print(len(peaks_real), 'len(peaks_real)') + ##peaks_real, _ = find_peaks(sum_smoothed, height=0) + ##if len(peaks_real)>70: + ###print(len(peaks_real), 'len(peaks_real)') - peaks_real = peaks_real[(peaks_realwidth1)] + ##peaks_real = peaks_real[(peaks_realwidth1)] - arg_max = np.argmax(sum_smoothed[peaks_real]) - peaks_final = peaks_real[arg_max] + ##arg_max = np.argmax(sum_smoothed[peaks_real]) + ##peaks_final = peaks_real[arg_max] - #plt.figure(ind_tot) - #plt.imshow(textline_image) - #plt.plot([peaks_final, peaks_final], [0, height-1]) - ##plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) - #plt.savefig('./'+str(ind_tot)+'.png') + ###plt.figure(ind_tot) + ###plt.imshow(textline_image) + ###plt.plot([peaks_final, peaks_final], [0, height-1]) + ####plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) + ###plt.savefig('./'+str(ind_tot)+'.png') - return peaks_final - else: - return None + ##return peaks_final + ##else: + ##return None - def return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - self, peaks_real, sum_smoothed, start_split, end_split): + ###def return_start_and_end_of_common_text_of_textline_ocr_new_splitted( + ###self, peaks_real, sum_smoothed, start_split, end_split): - peaks_real = peaks_real[(peaks_realstart_split)] + ###peaks_real = peaks_real[(peaks_realstart_split)] - arg_sort = np.argsort(sum_smoothed[peaks_real]) - arg_sort4 =arg_sort[::-1][:4] - peaks_sort_4 = peaks_real[arg_sort][::-1][:4] - argsort_sorted = np.argsort(peaks_sort_4) + ###arg_sort = np.argsort(sum_smoothed[peaks_real]) + ###arg_sort4 =arg_sort[::-1][:4] + ###peaks_sort_4 = peaks_real[arg_sort][::-1][:4] + ###argsort_sorted = np.argsort(peaks_sort_4) - first_4_sorted = peaks_sort_4[argsort_sorted] - y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] - #print(first_4_sorted,'first_4_sorted') + ###first_4_sorted = peaks_sort_4[argsort_sorted] + ###y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] + ####print(first_4_sorted,'first_4_sorted') - arg_sortnew = np.argsort(y_4_sorted) - peaks_final =np.sort( first_4_sorted[arg_sortnew][3:] ) - return peaks_final[0] + ###arg_sortnew = np.argsort(y_4_sorted) + ###peaks_final =np.sort( first_4_sorted[arg_sortnew][3:] ) + ###return peaks_final[0] - def return_start_and_end_of_common_text_of_textline_ocr_new(self, textline_image, ind_tot): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.15*width) + ###def return_start_and_end_of_common_text_of_textline_ocr_new(self, textline_image, ind_tot): + ###width = np.shape(textline_image)[1] + ###height = np.shape(textline_image)[0] + ###common_window = int(0.15*width) - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) - mid = int(width/2.) + ###width1 = int ( width/2. - common_window ) + ###width2 = int ( width/2. + common_window ) + ###mid = int(width/2.) 
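# A condensed sketch of what these split-point helpers (which this patch comments out)
# computed, assuming a white-background RGB textline crop: smooth the column-wise ink
# projection and take the strongest peak near the middle of the line as the split column.
# The names find_split_column and window_frac are illustrative, not from the codebase.
import numpy as np
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks

def find_split_column(textline_rgb, window_frac=0.15):
    width = textline_rgb.shape[1]
    lo, hi = int(width * (0.5 - window_frac)), int(width * (0.5 + window_frac))
    profile = gaussian_filter1d(np.sum(textline_rgb[:, :, 0], axis=0), 3)
    peaks, _ = find_peaks(profile, height=0)
    peaks = peaks[(peaks > lo) & (peaks < hi)]
    return int(peaks[np.argmax(profile[peaks])]) if len(peaks) else None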
- img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) + ###img_sum = np.sum(textline_image[:,:,0], axis=0) + ###sum_smoothed = gaussian_filter1d(img_sum, 3) - peaks_real, _ = find_peaks(sum_smoothed, height=0) - if len(peaks_real)>70: - peak_start = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - peaks_real, sum_smoothed, width1, mid+2) - peak_end = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - peaks_real, sum_smoothed, mid-2, width2) + ###peaks_real, _ = find_peaks(sum_smoothed, height=0) + ###if len(peaks_real)>70: + ###peak_start = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted( + ###peaks_real, sum_smoothed, width1, mid+2) + ###peak_end = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted( + ###peaks_real, sum_smoothed, mid-2, width2) - #plt.figure(ind_tot) - #plt.imshow(textline_image) - #plt.plot([peak_start, peak_start], [0, height-1]) - #plt.plot([peak_end, peak_end], [0, height-1]) - #plt.savefig('./'+str(ind_tot)+'.png') + ####plt.figure(ind_tot) + ####plt.imshow(textline_image) + ####plt.plot([peak_start, peak_start], [0, height-1]) + ####plt.plot([peak_end, peak_end], [0, height-1]) + ####plt.savefig('./'+str(ind_tot)+'.png') - return peak_start, peak_end - else: - pass + ###return peak_start, peak_end + ###else: + ###pass - def return_ocr_of_textline_without_common_section( - self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot): + ##def return_ocr_of_textline_without_common_section( + ##self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot): - if h2w_ratio > 0.05: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - else: - #width = np.shape(textline_image)[1] - #height = np.shape(textline_image)[0] - #common_window = int(0.3*width) - #width1 = int ( width/2. - common_window ) - #width2 = int ( width/2. + common_window ) + ##if h2w_ratio > 0.05: + ##pixel_values = processor(textline_image, return_tensors="pt").pixel_values + ##generated_ids = model_ocr.generate(pixel_values.to(device)) + ##generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + ##else: + ###width = np.shape(textline_image)[1] + ###height = np.shape(textline_image)[0] + ###common_window = int(0.3*width) + ###width1 = int ( width/2. - common_window ) + ###width2 = int ( width/2. 
+ common_window ) - split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section( - textline_image, ind_tot) - if split_point: - image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) + ##split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section( + ##textline_image, ind_tot) + ##if split_point: + ##image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) + ##image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) - #pixel_values1 = processor(image1, return_tensors="pt").pixel_values - #pixel_values2 = processor(image2, return_tensors="pt").pixel_values + ###pixel_values1 = processor(image1, return_tensors="pt").pixel_values + ###pixel_values2 = processor(image2, return_tensors="pt").pixel_values - pixel_values_merged = processor([image1,image2], return_tensors="pt").pixel_values - generated_ids_merged = model_ocr.generate(pixel_values_merged.to(device)) - generated_text_merged = processor.batch_decode(generated_ids_merged, skip_special_tokens=True) + ##pixel_values_merged = processor([image1,image2], return_tensors="pt").pixel_values + ##generated_ids_merged = model_ocr.generate(pixel_values_merged.to(device)) + ##generated_text_merged = processor.batch_decode(generated_ids_merged, skip_special_tokens=True) - #print(generated_text_merged,'generated_text_merged') + ###print(generated_text_merged,'generated_text_merged') - #generated_ids1 = model_ocr.generate(pixel_values1.to(device)) - #generated_ids2 = model_ocr.generate(pixel_values2.to(device)) + ###generated_ids1 = model_ocr.generate(pixel_values1.to(device)) + ###generated_ids2 = model_ocr.generate(pixel_values2.to(device)) - #generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] - #generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] + ###generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] + ###generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] - #generated_text = generated_text1 + ' ' + generated_text2 - generated_text = generated_text_merged[0] + ' ' + generated_text_merged[1] + ###generated_text = generated_text1 + ' ' + generated_text2 + ##generated_text = generated_text_merged[0] + ' ' + generated_text_merged[1] - #print(generated_text1,'generated_text1') - #print(generated_text2, 'generated_text2') - #print('########################################') - else: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + ###print(generated_text1,'generated_text1') + ###print(generated_text2, 'generated_text2') + ###print('########################################') + ##else: + ##pixel_values = processor(textline_image, return_tensors="pt").pixel_values + ##generated_ids = model_ocr.generate(pixel_values.to(device)) + ##generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - #print(generated_text,'generated_text') - #print('########################################') - return generated_text + ###print(generated_text,'generated_text') + ###print('########################################') + ##return generated_text - def return_ocr_of_textline( - self, textline_image, model_ocr, processor, 
device, width_textline, h2w_ratio,ind_tot): + ###def return_ocr_of_textline( + ###self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot): - if h2w_ratio > 0.05: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - else: - #width = np.shape(textline_image)[1] - #height = np.shape(textline_image)[0] - #common_window = int(0.3*width) - #width1 = int ( width/2. - common_window ) - #width2 = int ( width/2. + common_window ) + ###if h2w_ratio > 0.05: + ###pixel_values = processor(textline_image, return_tensors="pt").pixel_values + ###generated_ids = model_ocr.generate(pixel_values.to(device)) + ###generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + ###else: + ####width = np.shape(textline_image)[1] + ####height = np.shape(textline_image)[0] + ####common_window = int(0.3*width) + ####width1 = int ( width/2. - common_window ) + ####width2 = int ( width/2. + common_window ) - try: - width1, width2 = self.return_start_and_end_of_common_text_of_textline_ocr_new(textline_image, ind_tot) + ###try: + ###width1, width2 = self.return_start_and_end_of_common_text_of_textline_ocr_new(textline_image, ind_tot) - image1 = textline_image[:, :width2,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, width1:,:]#image.crop((width1, 0, width, height)) + ###image1 = textline_image[:, :width2,:]# image.crop((0, 0, width2, height)) + ###image2 = textline_image[:, width1:,:]#image.crop((width1, 0, width, height)) - pixel_values1 = processor(image1, return_tensors="pt").pixel_values - pixel_values2 = processor(image2, return_tensors="pt").pixel_values + ###pixel_values1 = processor(image1, return_tensors="pt").pixel_values + ###pixel_values2 = processor(image2, return_tensors="pt").pixel_values - generated_ids1 = model_ocr.generate(pixel_values1.to(device)) - generated_ids2 = model_ocr.generate(pixel_values2.to(device)) + ###generated_ids1 = model_ocr.generate(pixel_values1.to(device)) + ###generated_ids2 = model_ocr.generate(pixel_values2.to(device)) - generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] - generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] - #print(generated_text1,'generated_text1') - #print(generated_text2, 'generated_text2') - #print('########################################') + ###generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] + ###generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] + ####print(generated_text1,'generated_text1') + ####print(generated_text2, 'generated_text2') + ####print('########################################') - match = sq(None, generated_text1, generated_text2).find_longest_match( - 0, len(generated_text1), 0, len(generated_text2)) - generated_text = generated_text1 + generated_text2[match.b+match.size:] - except: - pixel_values = processor(textline_image, return_tensors="pt").pixel_values - generated_ids = model_ocr.generate(pixel_values.to(device)) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + ###match = sq(None, generated_text1, generated_text2).find_longest_match( + ###0, len(generated_text1), 0, len(generated_text2)) + ###generated_text = generated_text1 + generated_text2[match.b+match.size:] + ###except: + ###pixel_values = 
processor(textline_image, return_tensors="pt").pixel_values + ###generated_ids = model_ocr.generate(pixel_values.to(device)) + ###generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - return generated_text + ###return generated_text - def return_textline_contour_with_added_box_coordinate(self, textline_contour, box_ind): - textline_contour[:,0] = textline_contour[:,0] + box_ind[2] - textline_contour[:,1] = textline_contour[:,1] + box_ind[0] - return textline_contour def return_list_of_contours_with_desired_order(self, ls_cons, sorted_indexes): return [ls_cons[sorted_indexes[index]] for index in range(len(sorted_indexes))] @@ -4625,6 +4681,7 @@ class Eynollah: raise ValueError("run requires either a single image filename or a directory") for img_filename in self.ls_imgs: + print(img_filename, 'img_filename') self.logger.info(img_filename) t0 = time.time() @@ -4698,13 +4755,19 @@ class Eynollah: all_box_coord_marginals = [] polygons_lines_xml = [] contours_tables = [] - ocr_all_textlines = None conf_contours_textregions =[0] + + if self.ocr and not self.tr: + gc.collect() + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, textline_light=True) + else: + ocr_all_textlines = None + pcgts = self.writer.build_pagexml_no_full_layout( cont_page, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions) + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions, self.skip_layout_and_reading_order) return pcgts #print("text region early -1 in %.1fs", time.time() - t0) @@ -5118,7 +5181,7 @@ class Eynollah: tror = time.time() order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( contours_only_text_parent, contours_only_text_parent_h, text_regions_p) - print('time spend for mb ro', time.time()-tror) + print('time spend for mb ro', time.time()-tror) else: if np.abs(slope_deskew) < SLOPE_THRESHOLD: order_text_new, id_of_texts_tot = self.do_order_of_regions( @@ -5160,7 +5223,7 @@ class Eynollah: order_text_new, id_of_texts_tot = self.do_order_of_regions( contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - if self.ocr: + if self.ocr and self.tr: device = cuda.get_current_device() device.reset() gc.collect() @@ -5207,6 +5270,11 @@ class Eynollah: ocr_textline_in_textregion.append(text_ocr) ind_tot = ind_tot +1 ocr_all_textlines.append(ocr_textline_in_textregion) + + elif self.ocr and not self.tr: + gc.collect() + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: ocr_all_textlines = None @@ -5289,329 +5357,6 @@ class Eynollah_ocr: vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True ) - - def decode_batch_predictions(self, pred, max_len = 128): - # input_len is the product of the batch size and the - # number of time steps. - input_len = np.ones(pred.shape[0]) * pred.shape[1] - - # Decode CTC predictions using greedy search. - # decoded is a tuple with 2 elements. 
- decoded = tf.keras.backend.ctc_decode(pred, - input_length = input_len, - beam_width = 100) - # The outputs are in the first element of the tuple. - # Additionally, the first element is actually a list, - # therefore we take the first element of that list as well. - #print(decoded,'decoded') - decoded = decoded[0][0][:, :max_len] - - #print(decoded, decoded.shape,'decoded') - - output = [] - for d in decoded: - # Convert the predicted indices to the corresponding chars. - d = tf.strings.reduce_join(self.num_to_char(d)) - d = d.numpy().decode("utf-8") - output.append(d) - return output - - - def distortion_free_resize(self, image, img_size): - w, h = img_size - image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True) - - # Check tha amount of padding needed to be done. - pad_height = h - tf.shape(image)[0] - pad_width = w - tf.shape(image)[1] - - # Only necessary if you want to do same amount of padding on both sides. - if pad_height % 2 != 0: - height = pad_height // 2 - pad_height_top = height + 1 - pad_height_bottom = height - else: - pad_height_top = pad_height_bottom = pad_height // 2 - - if pad_width % 2 != 0: - width = pad_width // 2 - pad_width_left = width + 1 - pad_width_right = width - else: - pad_width_left = pad_width_right = pad_width // 2 - - image = tf.pad( - image, - paddings=[ - [pad_height_top, pad_height_bottom], - [pad_width_left, pad_width_right], - [0, 0], - ], - ) - - image = tf.transpose(image, (1, 0, 2)) - image = tf.image.flip_left_right(image) - return image - - def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(self, textline_image): - width = np.shape(textline_image)[1] - height = np.shape(textline_image)[0] - common_window = int(0.22*width) - - width1 = int ( width/2. - common_window ) - width2 = int ( width/2. + common_window ) - - img_sum = np.sum(textline_image[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 3) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - - if len(peaks_real)>35: - - #peaks_real = peaks_real[(peaks_realwidth1)] - argsort = np.argsort(sum_smoothed[peaks_real])[::-1] - peaks_real_top_six = peaks_real[argsort[:6]] - midpoint = textline_image.shape[1] / 2. 
- arg_closest = np.argmin(np.abs(peaks_real_top_six - midpoint)) - - #arg_max = np.argmax(sum_smoothed[peaks_real]) - - peaks_final = peaks_real_top_six[arg_closest]#peaks_real[arg_max] - - return peaks_final - else: - return None - - # Function to fit text inside the given area - def fit_text_single_line(self, draw, text, font_path, max_width, max_height): - initial_font_size = 50 - font_size = initial_font_size - while font_size > 10: # Minimum font size - font = ImageFont.truetype(font_path, font_size) - text_bbox = draw.textbbox((0, 0), text, font=font) # Get text bounding box - text_width = text_bbox[2] - text_bbox[0] - text_height = text_bbox[3] - text_bbox[1] - - if text_width <= max_width and text_height <= max_height: - return font # Return the best-fitting font - - font_size -= 2 # Reduce font size and retry - - return ImageFont.truetype(font_path, 10) # Smallest font fallback - - def return_textlines_split_if_needed(self, textline_image, textline_image_bin): - - split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image) - if split_point: - image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) - image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) - if self.prediction_with_both_of_rgb_and_bin: - image1_bin = textline_image_bin[:, :split_point,:]# image.crop((0, 0, width2, height)) - image2_bin = textline_image_bin[:, split_point:,:]#image.crop((width1, 0, width, height)) - return [image1, image2], [image1_bin, image2_bin] - else: - return [image1, image2], None - else: - return None, None - def preprocess_and_resize_image_for_ocrcnn_model(self, img, image_height, image_width): - ratio = image_height /float(img.shape[0]) - w_ratio = int(ratio * img.shape[1]) - - if w_ratio <= image_width: - width_new = w_ratio - else: - width_new = image_width - - if width_new == 0: - width_new = img.shape[1] - - ##if width_new+32 >= image_width: - ##width_new = width_new - 32 - - ###patch_zero = np.zeros((32, 32, 3))#+255 - ###patch_zero[9:19,8:18,:] = 0 - - - img = resize_image(img, image_height, width_new) - img_fin = np.ones((image_height, image_width, 3))*255 - ###img_fin[:,:32,:] = patch_zero[:,:,:] - ###img_fin[:,32:32+width_new,:] = img[:,:,:] - img_fin[:,:width_new,:] = img[:,:,:] - img_fin = img_fin / 255. 
- return img_fin - - def get_deskewed_contour_and_bb_and_image(self, contour, image, deskew_angle): - (h_in, w_in) = image.shape[:2] - center = (w_in // 2, h_in // 2) - - rotation_matrix = cv2.getRotationMatrix2D(center, deskew_angle, 1.0) - - cos_angle = abs(rotation_matrix[0, 0]) - sin_angle = abs(rotation_matrix[0, 1]) - new_w = int((h_in * sin_angle) + (w_in * cos_angle)) - new_h = int((h_in * cos_angle) + (w_in * sin_angle)) - - rotation_matrix[0, 2] += (new_w / 2) - center[0] - rotation_matrix[1, 2] += (new_h / 2) - center[1] - - deskewed_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h)) - - contour_points = np.array(contour, dtype=np.float32) - transformed_points = cv2.transform(np.array([contour_points]), rotation_matrix)[0] - - x, y, w, h = cv2.boundingRect(np.array(transformed_points, dtype=np.int32)) - cropped_textline = deskewed_image[y:y+h, x:x+w] - - return cropped_textline - - def rotate_image_with_padding(self, image, angle, border_value=(0,0,0)): - # Get image dimensions - (h, w) = image.shape[:2] - - # Calculate the center of the image - center = (w // 2, h // 2) - - # Get the rotation matrix - rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0) - - # Compute the new bounding dimensions - cos = abs(rotation_matrix[0, 0]) - sin = abs(rotation_matrix[0, 1]) - new_w = int((h * sin) + (w * cos)) - new_h = int((h * cos) + (w * sin)) - - # Adjust the rotation matrix to account for translation - rotation_matrix[0, 2] += (new_w / 2) - center[0] - rotation_matrix[1, 2] += (new_h / 2) - center[1] - - # Perform the rotation - rotated_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h), borderValue=border_value) - - return rotated_image - - def get_orientation_moments(self, contour): - moments = cv2.moments(contour) - if moments["mu20"] - moments["mu02"] == 0: # Avoid division by zero - return 90 if moments["mu11"] > 0 else -90 - else: - angle = 0.5 * np.arctan2(2 * moments["mu11"], moments["mu20"] - moments["mu02"]) - return np.degrees(angle) # Convert radians to degrees - - - def get_orientation_moments_of_mask(self, mask): - mask=mask.astype('uint8') - contours, _ = cv2.findContours(mask[:,:,0], cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - largest_contour = max(contours, key=cv2.contourArea) if contours else None - - moments = cv2.moments(largest_contour) - if moments["mu20"] - moments["mu02"] == 0: # Avoid division by zero - return 90 if moments["mu11"] > 0 else -90 - else: - angle = 0.5 * np.arctan2(2 * moments["mu11"], moments["mu20"] - moments["mu02"]) - return np.degrees(angle) # Convert radians to degrees - - def get_contours_and_bounding_boxes(self, mask): - # Find contours in the binary mask - contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - largest_contour = max(contours, key=cv2.contourArea) if contours else None - - # Get the bounding rectangle for the contour - x, y, w, h = cv2.boundingRect(largest_contour) - #bounding_boxes.append((x, y, w, h)) - - return x, y, w, h - - def return_splitting_point_of_image(self, image_to_spliited): - width = np.shape(image_to_spliited)[1] - height = np.shape(image_to_spliited)[0] - common_window = int(0.03*width) - - width1 = int ( common_window) - width2 = int ( width - common_window ) - - img_sum = np.sum(image_to_spliited[:,:,0], axis=0) - sum_smoothed = gaussian_filter1d(img_sum, 1) - - peaks_real, _ = find_peaks(sum_smoothed, height=0) - peaks_real = peaks_real[(peaks_realwidth1)] - - arg_sort = np.argsort(sum_smoothed[peaks_real]) - peaks_sort_4 = 
peaks_real[arg_sort][::-1][:4] - - return np.sort(peaks_sort_4) - - def break_curved_line_into_small_pieces_and_then_merge(self, img_curved, mask_curved): - peaks_4 = self.return_splitting_point_of_image(img_curved) - if len(peaks_4)>0: - imgs_tot = [] - - for ind in range(len(peaks_4)+1): - if ind==0: - img = img_curved[:, :peaks_4[ind], :] - mask = mask_curved[:, :peaks_4[ind], :] - elif ind==len(peaks_4): - img = img_curved[:, peaks_4[ind-1]:, :] - mask = mask_curved[:, peaks_4[ind-1]:, :] - else: - img = img_curved[:, peaks_4[ind-1]:peaks_4[ind], :] - mask = mask_curved[:, peaks_4[ind-1]:peaks_4[ind], :] - - or_ma = self.get_orientation_moments_of_mask(mask) - - imgs_tot.append([img, mask, or_ma] ) - - - w_tot_des_list = [] - w_tot_des = 0 - imgs_deskewed_list = [] - for ind in range(len(imgs_tot)): - img_in = imgs_tot[ind][0] - mask_in = imgs_tot[ind][1] - ori_in = imgs_tot[ind][2] - - if abs(ori_in)<45: - img_in_des = self.rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) ) - mask_in_des = self.rotate_image_with_padding(mask_in, ori_in) - mask_in_des = mask_in_des.astype('uint8') - - #new bounding box - x_n, y_n, w_n, h_n = self.get_contours_and_bounding_boxes(mask_in_des[:,:,0]) - - mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] - img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] - - w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) - if w_relative==0: - w_relative = img_in_des.shape[1] - img_in_des = resize_image(img_in_des, 32, w_relative) - - - else: - img_in_des = np.copy(img_in) - w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) - if w_relative==0: - w_relative = img_in_des.shape[1] - img_in_des = resize_image(img_in_des, 32, w_relative) - - w_tot_des+=img_in_des.shape[1] - w_tot_des_list.append(img_in_des.shape[1]) - imgs_deskewed_list.append(img_in_des) - - - - - img_final_deskewed = np.zeros((32, w_tot_des, 3))+255 - - w_indexer = 0 - for ind in range(len(w_tot_des_list)): - img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:] - w_indexer = w_indexer+w_tot_des_list[ind] - return img_final_deskewed - else: - return img_curved - def run(self): ls_imgs = os.listdir(self.dir_in) @@ -6069,7 +5814,7 @@ class Eynollah_ocr: preds_bin = self.prediction_model.predict(imgs_bin, verbose=0) preds = (preds + preds_bin) / 2. - pred_texts = self.decode_batch_predictions(preds) + pred_texts = self.decode_batch_predictions(preds, self.num_to_char) for ib in range(imgs.shape[0]): pred_texts_ib = pred_texts[ib].replace("[UNK]", "") diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py new file mode 100644 index 0000000..44367b6 --- /dev/null +++ b/src/eynollah/utils/utils_ocr.py @@ -0,0 +1,435 @@ +import numpy as np +import cv2 +import tensorflow as tf +from scipy.signal import find_peaks +from scipy.ndimage import gaussian_filter1d +import math +from .resize import resize_image + +def decode_batch_predictions(pred, num_to_char, max_len = 128): + # input_len is the product of the batch size and the + # number of time steps. + input_len = np.ones(pred.shape[0]) * pred.shape[1] + + # Decode CTC predictions using greedy search. + # decoded is a tuple with 2 elements. + decoded = tf.keras.backend.ctc_decode(pred, + input_length = input_len, + beam_width = 100) + # The outputs are in the first element of the tuple. + # Additionally, the first element is actually a list, + # therefore we take the first element of that list as well. 
+ #print(decoded,'decoded') + decoded = decoded[0][0][:, :max_len] + + #print(decoded, decoded.shape,'decoded') + + output = [] + for d in decoded: + # Convert the predicted indices to the corresponding chars. + d = tf.strings.reduce_join(num_to_char(d)) + d = d.numpy().decode("utf-8") + output.append(d) + return output + + +def distortion_free_resize(image, img_size): + w, h = img_size + image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True) + + # Check tha amount of padding needed to be done. + pad_height = h - tf.shape(image)[0] + pad_width = w - tf.shape(image)[1] + + # Only necessary if you want to do same amount of padding on both sides. + if pad_height % 2 != 0: + height = pad_height // 2 + pad_height_top = height + 1 + pad_height_bottom = height + else: + pad_height_top = pad_height_bottom = pad_height // 2 + + if pad_width % 2 != 0: + width = pad_width // 2 + pad_width_left = width + 1 + pad_width_right = width + else: + pad_width_left = pad_width_right = pad_width // 2 + + image = tf.pad( + image, + paddings=[ + [pad_height_top, pad_height_bottom], + [pad_width_left, pad_width_right], + [0, 0], + ], + ) + + image = tf.transpose(image, (1, 0, 2)) + image = tf.image.flip_left_right(image) + return image + +def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image): + width = np.shape(textline_image)[1] + height = np.shape(textline_image)[0] + common_window = int(0.22*width) + + width1 = int ( width/2. - common_window ) + width2 = int ( width/2. + common_window ) + + img_sum = np.sum(textline_image[:,:,0], axis=0) + sum_smoothed = gaussian_filter1d(img_sum, 3) + + peaks_real, _ = find_peaks(sum_smoothed, height=0) + + if len(peaks_real)>35: + + #peaks_real = peaks_real[(peaks_realwidth1)] + argsort = np.argsort(sum_smoothed[peaks_real])[::-1] + peaks_real_top_six = peaks_real[argsort[:6]] + midpoint = textline_image.shape[1] / 2. 
+ arg_closest = np.argmin(np.abs(peaks_real_top_six - midpoint)) + + #arg_max = np.argmax(sum_smoothed[peaks_real]) + + peaks_final = peaks_real_top_six[arg_closest]#peaks_real[arg_max] + + return peaks_final + else: + return None + +# Function to fit text inside the given area +def fit_text_single_line(draw, text, font_path, max_width, max_height): + initial_font_size = 50 + font_size = initial_font_size + while font_size > 10: # Minimum font size + font = ImageFont.truetype(font_path, font_size) + text_bbox = draw.textbbox((0, 0), text, font=font) # Get text bounding box + text_width = text_bbox[2] - text_bbox[0] + text_height = text_bbox[3] - text_bbox[1] + + if text_width <= max_width and text_height <= max_height: + return font # Return the best-fitting font + + font_size -= 2 # Reduce font size and retry + + return ImageFont.truetype(font_path, 10) # Smallest font fallback + +def return_textlines_split_if_needed(textline_image, textline_image_bin, prediction_with_both_of_rgb_and_bin=False): + + split_point = return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image) + if split_point: + image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) + image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) + if prediction_with_both_of_rgb_and_bin: + image1_bin = textline_image_bin[:, :split_point,:]# image.crop((0, 0, width2, height)) + image2_bin = textline_image_bin[:, split_point:,:]#image.crop((width1, 0, width, height)) + return [image1, image2], [image1_bin, image2_bin] + else: + return [image1, image2], None + else: + return None, None +def preprocess_and_resize_image_for_ocrcnn_model(img, image_height, image_width): + ratio = image_height /float(img.shape[0]) + w_ratio = int(ratio * img.shape[1]) + + if w_ratio <= image_width: + width_new = w_ratio + else: + width_new = image_width + + if width_new == 0: + width_new = img.shape[1] + + + img = resize_image(img, image_height, width_new) + img_fin = np.ones((image_height, image_width, 3))*255 + + img_fin[:,:width_new,:] = img[:,:,:] + img_fin = img_fin / 255. 
+ return img_fin + +def get_deskewed_contour_and_bb_and_image(contour, image, deskew_angle): + (h_in, w_in) = image.shape[:2] + center = (w_in // 2, h_in // 2) + + rotation_matrix = cv2.getRotationMatrix2D(center, deskew_angle, 1.0) + + cos_angle = abs(rotation_matrix[0, 0]) + sin_angle = abs(rotation_matrix[0, 1]) + new_w = int((h_in * sin_angle) + (w_in * cos_angle)) + new_h = int((h_in * cos_angle) + (w_in * sin_angle)) + + rotation_matrix[0, 2] += (new_w / 2) - center[0] + rotation_matrix[1, 2] += (new_h / 2) - center[1] + + deskewed_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h)) + + contour_points = np.array(contour, dtype=np.float32) + transformed_points = cv2.transform(np.array([contour_points]), rotation_matrix)[0] + + x, y, w, h = cv2.boundingRect(np.array(transformed_points, dtype=np.int32)) + cropped_textline = deskewed_image[y:y+h, x:x+w] + + return cropped_textline + +def rotate_image_with_padding(image, angle, border_value=(0,0,0)): + # Get image dimensions + (h, w) = image.shape[:2] + + # Calculate the center of the image + center = (w // 2, h // 2) + + # Get the rotation matrix + rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0) + + # Compute the new bounding dimensions + cos = abs(rotation_matrix[0, 0]) + sin = abs(rotation_matrix[0, 1]) + new_w = int((h * sin) + (w * cos)) + new_h = int((h * cos) + (w * sin)) + + # Adjust the rotation matrix to account for translation + rotation_matrix[0, 2] += (new_w / 2) - center[0] + rotation_matrix[1, 2] += (new_h / 2) - center[1] + + # Perform the rotation + rotated_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h), borderValue=border_value) + + return rotated_image + +def get_orientation_moments(contour): + moments = cv2.moments(contour) + if moments["mu20"] - moments["mu02"] == 0: # Avoid division by zero + return 90 if moments["mu11"] > 0 else -90 + else: + angle = 0.5 * np.arctan2(2 * moments["mu11"], moments["mu20"] - moments["mu02"]) + return np.degrees(angle) # Convert radians to degrees + + +def get_orientation_moments_of_mask(mask): + mask=mask.astype('uint8') + contours, _ = cv2.findContours(mask[:,:,0], cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + largest_contour = max(contours, key=cv2.contourArea) if contours else None + + moments = cv2.moments(largest_contour) + if moments["mu20"] - moments["mu02"] == 0: # Avoid division by zero + return 90 if moments["mu11"] > 0 else -90 + else: + angle = 0.5 * np.arctan2(2 * moments["mu11"], moments["mu20"] - moments["mu02"]) + return np.degrees(angle) # Convert radians to degrees + +def get_contours_and_bounding_boxes(mask): + # Find contours in the binary mask + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + largest_contour = max(contours, key=cv2.contourArea) if contours else None + + # Get the bounding rectangle for the contour + x, y, w, h = cv2.boundingRect(largest_contour) + #bounding_boxes.append((x, y, w, h)) + + return x, y, w, h + +def return_splitting_point_of_image(image_to_spliited): + width = np.shape(image_to_spliited)[1] + height = np.shape(image_to_spliited)[0] + common_window = int(0.03*width) + + width1 = int ( common_window) + width2 = int ( width - common_window ) + + img_sum = np.sum(image_to_spliited[:,:,0], axis=0) + sum_smoothed = gaussian_filter1d(img_sum, 1) + + peaks_real, _ = find_peaks(sum_smoothed, height=0) + peaks_real = peaks_real[(peaks_realwidth1)] + + arg_sort = np.argsort(sum_smoothed[peaks_real]) + peaks_sort_4 = peaks_real[arg_sort][::-1][:4] + + return 
np.sort(peaks_sort_4) + +def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved): + peaks_4 = return_splitting_point_of_image(img_curved) + if len(peaks_4)>0: + imgs_tot = [] + + for ind in range(len(peaks_4)+1): + if ind==0: + img = img_curved[:, :peaks_4[ind], :] + mask = mask_curved[:, :peaks_4[ind], :] + elif ind==len(peaks_4): + img = img_curved[:, peaks_4[ind-1]:, :] + mask = mask_curved[:, peaks_4[ind-1]:, :] + else: + img = img_curved[:, peaks_4[ind-1]:peaks_4[ind], :] + mask = mask_curved[:, peaks_4[ind-1]:peaks_4[ind], :] + + or_ma = get_orientation_moments_of_mask(mask) + + imgs_tot.append([img, mask, or_ma] ) + + + w_tot_des_list = [] + w_tot_des = 0 + imgs_deskewed_list = [] + for ind in range(len(imgs_tot)): + img_in = imgs_tot[ind][0] + mask_in = imgs_tot[ind][1] + ori_in = imgs_tot[ind][2] + + if abs(ori_in)<45: + img_in_des = rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) ) + mask_in_des = rotate_image_with_padding(mask_in, ori_in) + mask_in_des = mask_in_des.astype('uint8') + + #new bounding box + x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_in_des[:,:,0]) + + mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] + img_in_des = resize_image(img_in_des, 32, w_relative) + + + else: + img_in_des = np.copy(img_in) + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] + img_in_des = resize_image(img_in_des, 32, w_relative) + + w_tot_des+=img_in_des.shape[1] + w_tot_des_list.append(img_in_des.shape[1]) + imgs_deskewed_list.append(img_in_des) + + + + + img_final_deskewed = np.zeros((32, w_tot_des, 3))+255 + + w_indexer = 0 + for ind in range(len(w_tot_des_list)): + img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:] + w_indexer = w_indexer+w_tot_des_list[ind] + return img_final_deskewed + else: + return img_curved + +def return_textline_contour_with_added_box_coordinate(textline_contour, box_ind): + textline_contour[:,0] = textline_contour[:,0] + box_ind[2] + textline_contour[:,1] = textline_contour[:,1] + box_ind[0] + return textline_contour + + +def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, prediction_model, b_s_ocr, num_to_char, textline_light=False, curved_line=False): + max_len = 512 + padding_token = 299 + image_width = 512#max_len * 4 + image_height = 32 + ind_tot = 0 + #cv2.imwrite('./img_out.png', image_page) + ocr_all_textlines = [] + cropped_lines_region_indexer = [] + cropped_lines_meging_indexing = [] + cropped_lines = [] + indexer_text_region = 0 + + for indexing, ind_poly_first in enumerate(all_found_textline_polygons): + #ocr_textline_in_textregion = [] + for indexing2, ind_poly in enumerate(ind_poly_first): + cropped_lines_region_indexer.append(indexer_text_region) + if not (textline_light or curved_line): + ind_poly = copy.deepcopy(ind_poly) + box_ind = all_box_coord[indexing] + + ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) + #print(ind_poly_copy) + ind_poly[ind_poly<0] = 0 + x, y, w, h = cv2.boundingRect(ind_poly) + + w_scaled = w * image_height/float(h) + + mask_poly = np.zeros(image.shape) + + img_poly_on_img = np.copy(image) + + mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) + + + + mask_poly = mask_poly[y:y+h, 
x:x+w, :] + img_crop = img_poly_on_img[y:y+h, x:x+w, :] + + img_crop[mask_poly==0] = 255 + + if w_scaled < 640:#1.5*image_width: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(0) + else: + splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None) + + if splited_images: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(1) + + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) + + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(-1) + + else: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(0) + + indexer_text_region+=1 + + + extracted_texts = [] + + n_iterations = math.ceil(len(cropped_lines) / b_s_ocr) + + for i in range(n_iterations): + if i==(n_iterations-1): + n_start = i*b_s_ocr + imgs = cropped_lines[n_start:] + imgs = np.array(imgs) + imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3) + + + else: + n_start = i*b_s_ocr + n_end = (i+1)*b_s_ocr + imgs = cropped_lines[n_start:n_end] + imgs = np.array(imgs).reshape(b_s_ocr, image_height, image_width, 3) + + + preds = prediction_model.predict(imgs, verbose=0) + + pred_texts = decode_batch_predictions(preds, num_to_char) + + for ib in range(imgs.shape[0]): + pred_texts_ib = pred_texts[ib].replace("[UNK]", "") + extracted_texts.append(pred_texts_ib) + + extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + + extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] + unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) + + ocr_all_textlines = [] + for ind in unique_cropped_lines_region_indexer: + ocr_textline_in_textregion = [] + extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind] + for it_ind, text_textline in enumerate(extracted_texts_merged_un): + ocr_textline_in_textregion.append(text_textline) + ocr_all_textlines.append(ocr_textline_in_textregion) + return ocr_all_textlines diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 8cd1c8e..cf0551b 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -168,7 +168,7 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines, conf_contours_textregion): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, 
polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines, conf_contours_textregion, skip_layout_reading_order=False): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -184,7 +184,7 @@ class EynollahXmlWriter(): for mm in range(len(found_polygons_text_region)): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord), conf=conf_contours_textregion[mm]), + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord, skip_layout_reading_order), conf=conf_contours_textregion[mm]), ) #textregion.set_conf(conf_contours_textregion[mm]) page.add_TextRegion(textregion) @@ -303,18 +303,28 @@ class EynollahXmlWriter(): return pcgts - def calculate_polygon_coords(self, contour, page_coord): + def calculate_polygon_coords(self, contour, page_coord, skip_layout_reading_order=False): self.logger.debug('enter calculate_polygon_coords') coords = '' for value_bbox in contour: - if len(value_bbox) == 2: - coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) + if skip_layout_reading_order: + if len(value_bbox) == 2: + coords += str(int((value_bbox[0]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[1]) / self.scale_y)) + else: + coords += str(int((value_bbox[0][0]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[0][1]) / self.scale_y)) else: - coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) + if len(value_bbox) == 2: + coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[1] + page_coord[0]) / self.scale_y)) + else: + coords += str(int((value_bbox[0][0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((value_bbox[0][1] + page_coord[0]) / self.scale_y)) coords=coords + ' ' return coords[:-1] From adcf03c7b7c91ef379404fe700175e8943439e31 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 23 May 2025 18:06:53 +0200 Subject: [PATCH 18/40] enhancing ocr --- src/eynollah/eynollah.py | 47 ++++++++++++++++++--------------- src/eynollah/utils/utils_ocr.py | 1 + 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 2564150..1b50713 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -85,7 +85,12 @@ from .utils.utils_ocr import ( preprocess_and_resize_image_for_ocrcnn_model, return_textlines_split_if_needed, decode_batch_predictions, - return_rnn_cnn_ocr_of_given_textlines + return_rnn_cnn_ocr_of_given_textlines, + fit_text_single_line, + break_curved_line_into_small_pieces_and_then_merge, + get_orientation_moments, + rotate_image_with_padding, + get_contours_and_bounding_boxes ) from .utils.separate_lines import ( textline_contours_postprocessing, @@ -5421,7 +5426,7 @@ class Eynollah_ocr: cropped_lines.append(resize_image(img_crop, tr_ocr_input_height_and_width, tr_ocr_input_height_and_width) ) cropped_lines_meging_indexing.append(0) else: - splited_images, _ = self.return_textlines_split_if_needed(img_crop, None) + splited_images, _ = return_textlines_split_if_needed(img_crop, None) #print(splited_images) if splited_images: cropped_lines.append(resize_image(splited_images[0], 
tr_ocr_input_height_and_width, tr_ocr_input_height_and_width)) @@ -5474,7 +5479,7 @@ class Eynollah_ocr: w_bb = bb_ind[2] h_bb = bb_ind[3] - font = self.fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) ) + font = fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) ) ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2) @@ -5607,14 +5612,14 @@ class Eynollah_ocr: #print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') if not self.do_not_mask_with_textline_contour: if angle_degrees > 15: - better_des_slope = self.get_orientation_moments(textline_coords) + better_des_slope = get_orientation_moments(textline_coords) - img_crop = self.rotate_image_with_padding(img_crop, better_des_slope ) - mask_poly = self.rotate_image_with_padding(mask_poly, better_des_slope ) + img_crop = rotate_image_with_padding(img_crop, better_des_slope ) + mask_poly = rotate_image_with_padding(mask_poly, better_des_slope ) mask_poly = mask_poly.astype('uint8') #new bounding box - x_n, y_n, w_n, h_n = self.get_contours_and_bounding_boxes(mask_poly[:,:,0]) + x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0]) mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] @@ -5622,13 +5627,13 @@ class Eynollah_ocr: img_crop[mask_poly==0] = 255 if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 100: - img_crop = self.break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + img_crop = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) #print(file_name,w_n*h_n , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w_n*h_n) , 'ikiiiiii') else: img_crop[mask_poly==0] = 255 if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 100: - img_crop = self.break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + img_crop = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) @@ -5638,7 +5643,7 @@ class Eynollah_ocr: if not self.export_textline_images_and_text: if w_scaled < 640:#1.5*image_width: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) if angle_degrees > 15: cropped_lines_ver_index.append(1) @@ -5647,15 +5652,15 @@ class Eynollah_ocr: cropped_lines_meging_indexing.append(0) if self.prediction_with_both_of_rgb_and_bin: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) cropped_lines_bin.append(img_fin) else: if self.prediction_with_both_of_rgb_and_bin: - splited_images, splited_images_bin = self.return_textlines_split_if_needed(img_crop, img_crop_bin) + splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, img_crop_bin) else: - splited_images, splited_images_bin = self.return_textlines_split_if_needed(img_crop, None) + splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None) if splited_images: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) cropped_lines.append(img_fin) 
cropped_lines_meging_indexing.append(1) @@ -5664,7 +5669,7 @@ class Eynollah_ocr: else: cropped_lines_ver_index.append(0) - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(-1) @@ -5675,13 +5680,13 @@ class Eynollah_ocr: cropped_lines_ver_index.append(0) if self.prediction_with_both_of_rgb_and_bin: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[0], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[0], image_height, image_width) cropped_lines_bin.append(img_fin) - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[1], image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[1], image_height, image_width) cropped_lines_bin.append(img_fin) else: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(0) @@ -5691,7 +5696,7 @@ class Eynollah_ocr: cropped_lines_ver_index.append(0) if self.prediction_with_both_of_rgb_and_bin: - img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width) cropped_lines_bin.append(img_fin) if self.export_textline_images_and_text: @@ -5814,7 +5819,7 @@ class Eynollah_ocr: preds_bin = self.prediction_model.predict(imgs_bin, verbose=0) preds = (preds + preds_bin) / 2. - pred_texts = self.decode_batch_predictions(preds, self.num_to_char) + pred_texts = decode_batch_predictions(preds, self.num_to_char) for ib in range(imgs.shape[0]): pred_texts_ib = pred_texts[ib].replace("[UNK]", "") @@ -5844,7 +5849,7 @@ class Eynollah_ocr: w_bb = bb_ind[2] h_bb = bb_ind[3] - font = self.fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) ) + font = fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) ) ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2) diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 44367b6..339b38a 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -4,6 +4,7 @@ import tensorflow as tf from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d import math +from PIL import Image, ImageDraw, ImageFont from .resize import resize_image def decode_batch_predictions(pred, num_to_char, max_len = 128): From 27c4b0d0e09ff9d7dabe31074f225adedb3ee5d1 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sun, 25 May 2025 01:12:58 +0200 Subject: [PATCH 19/40] Drop capitals are written separately and are not attached to their corresponding text line. The OCR use case also supports single-image input. 
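For reference, the new input handling is mutually exclusive: the OCR CLI takes either a single image (-i) or an input directory (-di), never both, and Eynollah_ocr builds its list of pages from whichever one was given. Below is a minimal sketch of that selection logic, assuming the same names as in the diff (dir_in, image_filename); the helper name collect_ocr_inputs is only for illustration, since in the patch the branching lives inline in Eynollah_ocr.run().

    import os
    from pathlib import Path

    def collect_ocr_inputs(dir_in=None, image_filename=None):
        # Exactly one of the two input modes must be set (mirrors the CLI assert).
        assert bool(image_filename) ^ bool(dir_in), \
            "Either a single image or an input directory must be provided, but not both."
        if dir_in:
            # Directory mode: every file in dir_in is treated as one page image.
            return [(Path(name).stem, os.path.join(dir_in, name)) for name in os.listdir(dir_in)]
        # Single-image mode: a one-element list with the given file.
        return [(Path(image_filename).stem, image_filename)]

Each (file_name, path) pair then feeds the existing per-page loop unchanged; only how the list is assembled differs.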
--- src/eynollah/cli.py | 11 ++++++++-- src/eynollah/eynollah.py | 46 +++++++++++++++++++++++++++++----------- src/eynollah/writer.py | 8 +++---- 3 files changed, 47 insertions(+), 18 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index cd56833..0c18b2c 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -331,6 +331,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ @main.command() +@click.option( + "--image", + "-i", + help="image filename", + type=click.Path(exists=True, dir_okay=False), +) @click.option( "--dir_in", "-di", @@ -415,7 +421,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="Override log level globally to this", ) -def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, log_level): +def ocr(image, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) @@ -426,8 +432,9 @@ def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, ex assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit" assert not export_textline_images_and_text or not draw_texts_on_image, "Exporting textline and text -etit can not be set alongside draw text on image -dtoi" assert not export_textline_images_and_text or not prediction_with_both_of_rgb_and_bin, "Exporting textline and text -etit can not be set alongside prediction with both rgb and bin -brb" - + assert (bool(image) ^ bool(dir_in)), "Either -i (single image) or -di (directory) must be provided, but not both." 
eynollah_ocr = Eynollah_ocr( + image_filename=image, dir_xmls=dir_xmls, dir_out_image_text=dir_out_image_text, dir_in=dir_in, diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 1b50713..aa38274 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5134,10 +5134,10 @@ class Eynollah: pixel_img = 4 polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( - text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, - all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, - kernel=KERNEL, curved_line=self.curved_line, textline_light=self.textline_light) + ##all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline( + ##text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, + ##all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, + ##kernel=KERNEL, curved_line=self.curved_line, textline_light=self.textline_light) if not self.reading_order_machine_based: pixel_seps = 6 @@ -5299,6 +5299,7 @@ class Eynollah_ocr: dir_models, dir_xmls=None, dir_in=None, + image_filename=None, dir_in_bin=None, dir_out=None, dir_out_image_text=None, @@ -5312,6 +5313,7 @@ class Eynollah_ocr: logger=None, ): self.dir_in = dir_in + self.image_filename = image_filename self.dir_in_bin = dir_in_bin self.dir_out = dir_out self.dir_xmls = dir_xmls @@ -5363,13 +5365,20 @@ class Eynollah_ocr: ) def run(self): - ls_imgs = os.listdir(self.dir_in) + if self.dir_in: + ls_imgs = os.listdir(self.dir_in) + else: + ls_imgs = [self.image_filename] if self.tr_ocr: tr_ocr_input_height_and_width = 384 for ind_img in ls_imgs: - file_name = Path(ind_img).stem - dir_img = os.path.join(self.dir_in, ind_img) + if self.dir_in: + file_name = Path(ind_img).stem + dir_img = os.path.join(self.dir_in, ind_img) + else: + file_name = Path(self.image_filename).stem + dir_img = self.image_filename dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') img = cv2.imread(dir_img) @@ -5541,8 +5550,15 @@ class Eynollah_ocr: img_size=(image_width, image_height) for ind_img in ls_imgs: - file_name = Path(ind_img).stem - dir_img = os.path.join(self.dir_in, ind_img) + if self.dir_in: + file_name = Path(ind_img).stem + dir_img = os.path.join(self.dir_in, ind_img) + else: + file_name = Path(self.image_filename).stem + dir_img = self.image_filename + + #file_name = Path(ind_img).stem + #dir_img = os.path.join(self.dir_in, ind_img) dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') img = cv2.imread(dir_img) @@ -5576,6 +5592,7 @@ class Eynollah_ocr: indexer_text_region = 0 indexer_textlines = 0 for nn in root1.iter(region_tags): + type_textregion = nn.attrib['type'] for child_textregion in nn: if child_textregion.tag.endswith("TextLine"): for child_textlines in child_textregion: @@ -5589,7 +5606,9 @@ class Eynollah_ocr: angle_radians = math.atan2(h, w) # Convert to degrees angle_degrees = math.degrees(angle_radians) - + if type_textregion=='drop-capital': + angle_degrees = 0 + if self.draw_texts_on_image: total_bb_coordinates.append([x,y,w,h]) @@ -5632,8 +5651,11 @@ class Eynollah_ocr: #print(file_name,w_n*h_n , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w_n*h_n) , 'ikiiiiii') 
else: img_crop[mask_poly==0] = 255 - if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 100: - img_crop = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + if type_textregion=='drop-capital': + pass + else: + if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 100: + img_crop = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index cf0551b..f07abf6 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -283,14 +283,14 @@ class EynollahXmlWriter(): Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) page.add_TextRegion(marginal) self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) - + for mm in range(len(found_polygons_drop_capitals)): dropcapital = TextRegionType(id=counter.next_region_id, type_='drop-capital', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))) page.add_TextRegion(dropcapital) - ###all_box_coord_drop = None - ###slopes_drop = None - ###self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=None) + all_box_coord_drop = None + slopes_drop = None + self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=None) for mm in range(len(found_polygons_text_region_img)): page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)))) From 097520bfd275f8260eebd698bae42b0c33eafd3c Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sun, 25 May 2025 03:33:54 +0200 Subject: [PATCH 20/40] rnn ocr for all layout textregion types --- src/eynollah/eynollah.py | 41 ++++++++++++++++++++++++++-------------- src/eynollah/writer.py | 31 ++++++++++++++++++++++-------- 2 files changed, 50 insertions(+), 22 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index aa38274..0ee3d14 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4715,11 +4715,10 @@ class Eynollah: if self.extract_only_images: text_regions_p_1, erosion_hurts, polygons_lines_xml, polygons_of_images, image_page, page_coord, cont_page = \ self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) - ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], polygons_of_images, [], [], [], [], [], - cont_page, [], [], ocr_all_textlines, []) + cont_page, [], []) if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) return pcgts @@ -4772,7 +4771,7 @@ class Eynollah: cont_page, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions, self.skip_layout_and_reading_order) + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines=ocr_all_textlines, conf_contours_textregion=conf_contours_textregions, 
skip_layout_reading_order=self.skip_layout_and_reading_order) return pcgts #print("text region early -1 in %.1fs", time.time() - t0) @@ -4822,10 +4821,9 @@ class Eynollah: if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") - ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], [], [], [], [], [], [], - cont_page, [], [], ocr_all_textlines, []) + cont_page, [], []) return pcgts #print("text region early in %.1fs", time.time() - t0) @@ -5004,13 +5002,13 @@ class Eynollah: [], [], page_coord, [], [], [], [], [], [], polygons_of_images, contours_tables, [], polygons_of_marginals, empty_marginals, empty_marginals, [], [], [], - cont_page, polygons_lines_xml, [], [], []) + cont_page, polygons_lines_xml) else: pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], polygons_of_images, polygons_of_marginals, empty_marginals, empty_marginals, [], [], - cont_page, polygons_lines_xml, contours_tables, [], []) + cont_page, polygons_lines_xml, contours_tables) return pcgts @@ -5196,16 +5194,28 @@ class Eynollah: contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) self.logger.info("detection of reading order took %.1fs", time.time() - t_order) - if self.ocr: - ocr_all_textlines = [] + if self.ocr and not self.tr: + gc.collect() + if len(all_found_textline_polygons)>0: + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + if all_found_textline_polygons_marginals and len(all_found_textline_polygons_marginals)>0: + ocr_all_textlines_marginals = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + + if all_found_textline_polygons_h and len(all_found_textline_polygons)>0: + ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_h, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + if polygons_of_drop_capitals and len(polygons_of_drop_capitals)>0: + ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines(image_page, polygons_of_drop_capitals, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines = None + ocr_all_textlines_marginals = None + ocr_all_textlines_h = None + ocr_all_textlines_drop = None pcgts = self.writer.build_pagexml_full_layout( contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, - cont_page, polygons_lines_xml, ocr_all_textlines, conf_contours_textregions, conf_contours_textregions_h) + cont_page, polygons_lines_xml, ocr_all_textlines, ocr_all_textlines_h, ocr_all_textlines_marginals, ocr_all_textlines_drop, conf_contours_textregions, conf_contours_textregions_h) return pcgts contours_only_text_parent_h = None @@ -5278,18 +5288,21 @@ class Eynollah: elif self.ocr and not self.tr: gc.collect() - ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, 
all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - + if len(all_found_textline_polygons)>0: + ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + if all_found_textline_polygons_marginals and len(all_found_textline_polygons_marginals)>0: + ocr_all_textlines_marginals = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines = None - #print(ocr_all_textlines) + ocr_all_textlines_marginals = None self.logger.info("detection of reading order took %.1fs", time.time() - t_order) + pcgts = self.writer.build_pagexml_no_full_layout( txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions) + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, ocr_all_textlines_marginals, conf_contours_textregions) return pcgts diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index f07abf6..085ee6f 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -56,10 +56,12 @@ class EynollahXmlWriter(): points_page_print = points_page_print + ' ' return points_page_print[:-1] - def serialize_lines_in_marginal(self, marginal_region, all_found_textline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter): + def serialize_lines_in_marginal(self, marginal_region, all_found_textline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter, ocr_all_textlines_textregion): for j in range(len(all_found_textline_polygons_marginals[marginal_idx])): coords = CoordsType() textline = TextLineType(id=counter.next_line_id, Coords=coords) + if ocr_all_textlines_textregion: + textline.set_TextEquiv( [ TextEquivType(Unicode=ocr_all_textlines_textregion[j]) ] ) marginal_region.add_TextLine(textline) marginal_region.set_orientation(-slopes_marginals[marginal_idx]) points_co = '' @@ -168,7 +170,7 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines, conf_contours_textregion, skip_layout_reading_order=False): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals=None, conf_contours_textregion=None, skip_layout_reading_order=False): self.logger.debug('enter build_pagexml_no_full_layout') # create the file 
structure @@ -198,7 +200,12 @@ class EynollahXmlWriter(): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) page.add_TextRegion(marginal) - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) + if ocr_all_textlines_marginals: + ocr_textlines = ocr_all_textlines_marginals[mm] + else: + ocr_textlines = None + + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter, ocr_textlines) for mm in range(len(found_polygons_text_region_img)): img_region = ImageRegionType(id=counter.next_region_id, Coords=CoordsType()) @@ -242,7 +249,7 @@ class EynollahXmlWriter(): return pcgts - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, ocr_all_textlines, conf_contours_textregion, conf_contours_textregion_h): + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): self.logger.debug('enter build_pagexml_full_layout') # create the file structure @@ -272,8 +279,8 @@ class EynollahXmlWriter(): Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) page.add_TextRegion(textregion) - if ocr_all_textlines: - ocr_textlines = ocr_all_textlines[mm] + if ocr_all_textlines_h: + ocr_textlines = ocr_all_textlines_h[mm] else: ocr_textlines = None self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter, ocr_textlines) @@ -282,7 +289,11 @@ class EynollahXmlWriter(): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) page.add_TextRegion(marginal) - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) + if ocr_all_textlines_marginals: + ocr_textlines = ocr_all_textlines_marginals[mm] + else: + ocr_textlines = None + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter, ocr_textlines) for mm in range(len(found_polygons_drop_capitals)): dropcapital = TextRegionType(id=counter.next_region_id, type_='drop-capital', @@ -290,7 +301,11 @@ class EynollahXmlWriter(): 
page.add_TextRegion(dropcapital) all_box_coord_drop = None slopes_drop = None - self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=None) + if ocr_all_textlines_drop: + ocr_textlines = ocr_all_textlines_drop[mm] + else: + ocr_textlines = None + self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=ocr_textlines) for mm in range(len(found_polygons_text_region_img)): page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord)))) From 0f154c605a870c14556d0d0df539f19511735410 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sun, 25 May 2025 21:44:36 +0200 Subject: [PATCH 21/40] strings alignment function is added + new changes needed for prediction with both bin and rgb inputs is implemented --- requirements.txt | 1 + src/eynollah/eynollah.py | 78 +++++++++++++++++++++++++++------ src/eynollah/utils/utils_ocr.py | 47 +++++++++++++++++--- 3 files changed, 107 insertions(+), 19 deletions(-) diff --git a/requirements.txt b/requirements.txt index aeffd47..4bc0c6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ tensorflow < 2.13 numba <= 0.58.1 scikit-image loky +biopython diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 0ee3d14..1f79995 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5647,6 +5647,10 @@ class Eynollah_ocr: better_des_slope = get_orientation_moments(textline_coords) img_crop = rotate_image_with_padding(img_crop, better_des_slope ) + + if self.prediction_with_both_of_rgb_and_bin: + img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope ) + mask_poly = rotate_image_with_padding(mask_poly, better_des_slope ) mask_poly = mask_poly.astype('uint8') @@ -5655,26 +5659,35 @@ class Eynollah_ocr: mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] - + img_crop[mask_poly==0] = 255 + if self.prediction_with_both_of_rgb_and_bin: + img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_crop_bin[mask_poly==0] = 255 + if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 100: - img_crop = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) - - #print(file_name,w_n*h_n , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w_n*h_n) , 'ikiiiiii') + if self.prediction_with_both_of_rgb_and_bin: + img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) + else: + img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + + else: img_crop[mask_poly==0] = 255 + if self.prediction_with_both_of_rgb_and_bin: + img_crop_bin[mask_poly==0] = 255 if type_textregion=='drop-capital': pass else: if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 100: - img_crop = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + if self.prediction_with_both_of_rgb_and_bin: + img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) + else: + img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) - - - if self.prediction_with_both_of_rgb_and_bin: - img_crop_bin[mask_poly==0] = 255 + if not self.export_textline_images_and_text: 
if w_scaled < 640:#1.5*image_width: @@ -5796,6 +5809,14 @@ class Eynollah_ocr: imgs_bin = cropped_lines_bin[n_start:] imgs_bin = np.array(imgs_bin) imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3) + + if len(indices_ver)>0: + imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:] + imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + + else: + imgs_bin_ver_flipped = None else: n_start = i*self.b_s n_end = (i+1)*self.b_s @@ -5817,22 +5838,25 @@ class Eynollah_ocr: if self.prediction_with_both_of_rgb_and_bin: imgs_bin = cropped_lines_bin[n_start:n_end] imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3) + + + if len(indices_ver)>0: + imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:] + imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:] + #print(imgs_ver_flipped, 'imgs_ver_flipped') + else: + imgs_bin_ver_flipped = None preds = self.prediction_model.predict(imgs, verbose=0) if len(indices_ver)>0: - #cv2.imwrite('flipped.png', (imgs_ver_flipped[0, :,:,:]*255).astype('uint8')) - #cv2.imwrite('original.png', (imgs[0, :,:,:]*255).astype('uint8')) - #sys.exit() - #print(imgs_ver_flipped.shape, 'imgs_ver_flipped.shape') preds_flipped = self.prediction_model.predict(imgs_ver_flipped, verbose=0) preds_max_fliped = np.max(preds_flipped, axis=2 ) preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=256 masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) masked_means_flipped[np.isnan(masked_means_flipped)] = 0 - #print(masked_means_flipped, 'masked_means_flipped') preds_max = np.max(preds, axis=2 ) preds_max_args = np.argmax(preds, axis=2 ) @@ -5852,6 +5876,32 @@ class Eynollah_ocr: preds[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] if self.prediction_with_both_of_rgb_and_bin: preds_bin = self.prediction_model.predict(imgs_bin, verbose=0) + + if len(indices_ver)>0: + preds_flipped = self.prediction_model.predict(imgs_bin_ver_flipped, verbose=0) + preds_max_fliped = np.max(preds_flipped, axis=2 ) + preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) + pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=256 + masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) + masked_means_flipped[np.isnan(masked_means_flipped)] = 0 + + preds_max = np.max(preds, axis=2 ) + preds_max_args = np.argmax(preds, axis=2 ) + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=256 + + masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) + masked_means[np.isnan(masked_means)] = 0 + + masked_means_ver = masked_means[indices_ver] + #print(masked_means_ver, 'pred_max_not_unk') + + indices_where_flipped_conf_value_is_higher = np.where(masked_means_flipped > masked_means_ver)[0] + + #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher') + if len(indices_where_flipped_conf_value_is_higher)>0: + indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher] + preds_bin[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] + preds = (preds + preds_bin) / 2. 
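# --- editor's sketch, not part of the patch ---------------------------------------
# The block above re-runs textlines detected as vertical through a 180-degree-rotated
# copy, keeps whichever variant scores higher, and finally averages the RGB and
# binarized softmax outputs (preds = (preds + preds_bin) / 2.). The score used is the
# mean of the per-timestep best probability over timesteps whose argmax is not the
# blank class (index 256 in this model). As a standalone helper, that heuristic is
# roughly:
import numpy as np

def masked_mean_confidence(preds, blank_index=256):
    """Mean best-class probability per line, ignoring timesteps predicted as blank.

    preds is a CTC softmax output of shape (batch, time, vocab); lines whose every
    timestep is blank get confidence 0, mirroring the NaN-to-zero handling above.
    """
    best_prob = np.max(preds, axis=2)      # (batch, time)
    best_idx = np.argmax(preds, axis=2)    # (batch, time)
    not_blank = best_idx != blank_index    # drop blank/UNK timesteps from the average
    n_kept = np.sum(not_blank, axis=1)
    conf = np.sum(best_prob * not_blank, axis=1) / np.maximum(n_kept, 1)
    conf[n_kept == 0] = 0.0
    return conf
# -----------------------------------------------------------------------------------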
pred_texts = decode_batch_predictions(preds, self.num_to_char) diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 339b38a..524e7ce 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -5,6 +5,7 @@ from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d import math from PIL import Image, ImageDraw, ImageFont +from Bio import pairwise2 from .resize import resize_image def decode_batch_predictions(pred, num_to_char, max_len = 128): @@ -252,7 +253,7 @@ def return_splitting_point_of_image(image_to_spliited): return np.sort(peaks_sort_4) -def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved): +def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, img_bin_curved=None): peaks_4 = return_splitting_point_of_image(img_curved) if len(peaks_4)>0: imgs_tot = [] @@ -260,29 +261,44 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved): for ind in range(len(peaks_4)+1): if ind==0: img = img_curved[:, :peaks_4[ind], :] + if img_bin_curved: + img_bin = img_curved_bin[:, :peaks_4[ind], :] mask = mask_curved[:, :peaks_4[ind], :] elif ind==len(peaks_4): img = img_curved[:, peaks_4[ind-1]:, :] + if img_bin_curved: + img_bin = img_curved_bin[:, peaks_4[ind-1]:, :] mask = mask_curved[:, peaks_4[ind-1]:, :] else: img = img_curved[:, peaks_4[ind-1]:peaks_4[ind], :] + if img_bin_curved: + img_bin = img_curved_bin[:, peaks_4[ind-1]:peaks_4[ind], :] mask = mask_curved[:, peaks_4[ind-1]:peaks_4[ind], :] or_ma = get_orientation_moments_of_mask(mask) - - imgs_tot.append([img, mask, or_ma] ) + + if img_bin_curved: + imgs_tot.append([img, mask, or_ma, img_bin] ) + else: + imgs_tot.append([img, mask, or_ma] ) w_tot_des_list = [] w_tot_des = 0 imgs_deskewed_list = [] + imgs_bin_deskewed_list = [] + for ind in range(len(imgs_tot)): img_in = imgs_tot[ind][0] mask_in = imgs_tot[ind][1] ori_in = imgs_tot[ind][2] + if img_bin_curved: + img_bin_in = imgs_tot[ind][3] if abs(ori_in)<45: img_in_des = rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) ) + if img_bin_curved: + img_bin_in_des = rotate_image_with_padding(img_bin_in, ori_in, border_value=(255,255,255) ) mask_in_des = rotate_image_with_padding(mask_in, ori_in) mask_in_des = mask_in_des.astype('uint8') @@ -291,36 +307,52 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved): mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + if img_bin_curved: + img_bin_in_des = img_bin_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) if w_relative==0: w_relative = img_in_des.shape[1] img_in_des = resize_image(img_in_des, 32, w_relative) + if img_bin_curved: + img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) else: img_in_des = np.copy(img_in) + if img_bin_curved: + img_bin_in_des = np.copy(img_bin_in) w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) if w_relative==0: w_relative = img_in_des.shape[1] img_in_des = resize_image(img_in_des, 32, w_relative) + if img_bin_curved: + img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) w_tot_des+=img_in_des.shape[1] w_tot_des_list.append(img_in_des.shape[1]) imgs_deskewed_list.append(img_in_des) + if img_bin_curved: + imgs_bin_deskewed_list.append(img_bin_in_des) img_final_deskewed = np.zeros((32, w_tot_des, 3))+255 + if img_bin_curved: + img_bin_final_deskewed = np.zeros((32, 
w_tot_des, 3))+255 + else: + img_bin_final_deskewed = None w_indexer = 0 for ind in range(len(w_tot_des_list)): img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:] + if img_bin_curved: + img_bin_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_bin_deskewed_list[ind][:,:,:] w_indexer = w_indexer+w_tot_des_list[ind] - return img_final_deskewed + return img_final_deskewed, img_bin_final_deskewed else: - return img_curved + return img_curved, img_bin_curved def return_textline_contour_with_added_box_coordinate(textline_contour, box_ind): textline_contour[:,0] = textline_contour[:,0] + box_ind[2] @@ -434,3 +466,8 @@ def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, pr ocr_textline_in_textregion.append(text_textline) ocr_all_textlines.append(ocr_textline_in_textregion) return ocr_all_textlines + +def biopython_align(str1, str2): + alignments = pairwise2.align.globalms(str1, str2, 2, -1, -2, -2) + best_alignment = alignments[0] # Get the best alignment + return best_alignment.seqA, best_alignment.seqB From b93fc112bf8c414186e64de6cc092b1839239128 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 27 May 2025 23:45:22 +0200 Subject: [PATCH 22/40] updating ocr --- src/eynollah/cli.py | 10 ++++-- src/eynollah/eynollah.py | 24 ++++++++++++-- src/eynollah/utils/utils_ocr.py | 55 +++++++++++++++++---------------- 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 0c18b2c..2d0d6f9 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -337,6 +337,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="image filename", type=click.Path(exists=True, dir_okay=False), ) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) @click.option( "--dir_in", "-di", @@ -421,7 +427,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="Override log level globally to this", ) -def ocr(image, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, log_level): +def ocr(image, overwrite, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) @@ -449,7 +455,7 @@ def ocr(image, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ batch_size=batch_size, pref_of_dataset=dataset_abbrevation, ) - eynollah_ocr.run() + eynollah_ocr.run(overwrite=overwrite) if __name__ == "__main__": main() diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 1f79995..efa1dde 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5338,6 +5338,8 @@ class Eynollah_ocr: self.dir_out_image_text = dir_out_image_text self.prediction_with_both_of_rgb_and_bin = prediction_with_both_of_rgb_and_bin self.pref_of_dataset = pref_of_dataset + self.logger = logger if logger else getLogger('eynollah') + if not export_textline_images_and_text: if tr_ocr: self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") @@ -5351,7 +5353,7 @@ class Eynollah_ocr: 
self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_step_750000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + self.model_ocr_dir = dir_models + "/model_step_1075000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5377,7 +5379,7 @@ class Eynollah_ocr: vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True ) - def run(self): + def run(self, overwrite : bool = False): if self.dir_in: ls_imgs = os.listdir(self.dir_in) else: @@ -5394,6 +5396,14 @@ class Eynollah_ocr: dir_img = self.image_filename dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') + + if os.path.exists(out_file_ocr): + if overwrite: + self.logger.warning("will overwrite existing output file '%s'", out_file_ocr) + else: + self.logger.warning("will skip input for existing output file '%s'", out_file_ocr) + continue + img = cv2.imread(dir_img) if self.draw_texts_on_image: @@ -5574,6 +5584,14 @@ class Eynollah_ocr: #dir_img = os.path.join(self.dir_in, ind_img) dir_xml = os.path.join(self.dir_xmls, file_name+'.xml') out_file_ocr = os.path.join(self.dir_out, file_name+'.xml') + + if os.path.exists(out_file_ocr): + if overwrite: + self.logger.warning("will overwrite existing output file '%s'", out_file_ocr) + else: + self.logger.warning("will skip input for existing output file '%s'", out_file_ocr) + continue + img = cv2.imread(dir_img) if self.prediction_with_both_of_rgb_and_bin: cropped_lines_bin = [] @@ -5704,7 +5722,7 @@ class Eynollah_ocr: cropped_lines_bin.append(img_fin) else: if self.prediction_with_both_of_rgb_and_bin: - splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, img_crop_bin) + splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, img_crop_bin, prediction_with_both_of_rgb_and_bin=self.prediction_with_both_of_rgb_and_bin) else: splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None) if splited_images: diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 524e7ce..9ef344a 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -74,32 +74,24 @@ def distortion_free_resize(image, img_size): def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image): width = np.shape(textline_image)[1] height = np.shape(textline_image)[0] - common_window = int(0.22*width) + common_window = int(0.06*width) width1 = int ( width/2. - common_window ) width2 = int ( width/2. + common_window ) - + img_sum = np.sum(textline_image[:,:,0], axis=0) sum_smoothed = gaussian_filter1d(img_sum, 3) - + peaks_real, _ = find_peaks(sum_smoothed, height=0) - - if len(peaks_real)>35: + if len(peaks_real)>70: - #peaks_real = peaks_real[(peaks_realwidth1)] - argsort = np.argsort(sum_smoothed[peaks_real])[::-1] - peaks_real_top_six = peaks_real[argsort[:6]] - midpoint = textline_image.shape[1] / 2. 
- arg_closest = np.argmin(np.abs(peaks_real_top_six - midpoint)) + peaks_real = peaks_real[(peaks_realwidth1)] - #arg_max = np.argmax(sum_smoothed[peaks_real]) - - peaks_final = peaks_real_top_six[arg_closest]#peaks_real[arg_max] - + arg_max = np.argmax(sum_smoothed[peaks_real]) + peaks_final = peaks_real[arg_max] return peaks_final else: return None - # Function to fit text inside the given area def fit_text_single_line(draw, text, font_path, max_width, max_height): initial_font_size = 50 @@ -305,17 +297,28 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, #new bounding box x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_in_des[:,:,0]) - mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] - img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] - if img_bin_curved: - img_bin_in_des = img_bin_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] - - w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) - if w_relative==0: - w_relative = img_in_des.shape[1] - img_in_des = resize_image(img_in_des, 32, w_relative) - if img_bin_curved: - img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) + if w_n==0 or h_n==0: + img_in_des = np.copy(img_in) + if img_bin_curved: + img_bin_in_des = np.copy(img_bin_in) + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] + img_in_des = resize_image(img_in_des, 32, w_relative) + if img_bin_curved: + img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) + else: + mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + if img_bin_curved: + img_bin_in_des = img_bin_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] + + w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) + if w_relative==0: + w_relative = img_in_des.shape[1] + img_in_des = resize_image(img_in_des, 32, w_relative) + if img_bin_curved: + img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) else: From 48285ce3f5f132cfe3df84f91d7957b5da8e14e8 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 28 May 2025 01:17:21 +0200 Subject: [PATCH 23/40] updating ocr --- src/eynollah/eynollah.py | 2 +- src/eynollah/utils/utils_ocr.py | 36 ++++++++++++++++----------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index efa1dde..0a9248e 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5353,7 +5353,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_step_1075000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + self.model_ocr_dir = dir_models + "/model_step_1150000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 9ef344a..aa1efa6 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -253,23 +253,23 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, for ind in range(len(peaks_4)+1): if ind==0: img = img_curved[:, :peaks_4[ind], :] - if img_bin_curved: - img_bin = img_curved_bin[:, :peaks_4[ind], :] + if img_bin_curved is not None: + img_bin = img_bin_curved[:, 
:peaks_4[ind], :] mask = mask_curved[:, :peaks_4[ind], :] elif ind==len(peaks_4): img = img_curved[:, peaks_4[ind-1]:, :] - if img_bin_curved: - img_bin = img_curved_bin[:, peaks_4[ind-1]:, :] + if img_bin_curved is not None: + img_bin = img_bin_curved[:, peaks_4[ind-1]:, :] mask = mask_curved[:, peaks_4[ind-1]:, :] else: img = img_curved[:, peaks_4[ind-1]:peaks_4[ind], :] - if img_bin_curved: - img_bin = img_curved_bin[:, peaks_4[ind-1]:peaks_4[ind], :] + if img_bin_curved is not None: + img_bin = img_bin_curved[:, peaks_4[ind-1]:peaks_4[ind], :] mask = mask_curved[:, peaks_4[ind-1]:peaks_4[ind], :] or_ma = get_orientation_moments_of_mask(mask) - if img_bin_curved: + if img_bin_curved is not None: imgs_tot.append([img, mask, or_ma, img_bin] ) else: imgs_tot.append([img, mask, or_ma] ) @@ -284,12 +284,12 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, img_in = imgs_tot[ind][0] mask_in = imgs_tot[ind][1] ori_in = imgs_tot[ind][2] - if img_bin_curved: + if img_bin_curved is not None: img_bin_in = imgs_tot[ind][3] if abs(ori_in)<45: img_in_des = rotate_image_with_padding(img_in, ori_in, border_value=(255,255,255) ) - if img_bin_curved: + if img_bin_curved is not None: img_bin_in_des = rotate_image_with_padding(img_bin_in, ori_in, border_value=(255,255,255) ) mask_in_des = rotate_image_with_padding(mask_in, ori_in) mask_in_des = mask_in_des.astype('uint8') @@ -299,50 +299,50 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, if w_n==0 or h_n==0: img_in_des = np.copy(img_in) - if img_bin_curved: + if img_bin_curved is not None: img_bin_in_des = np.copy(img_bin_in) w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) if w_relative==0: w_relative = img_in_des.shape[1] img_in_des = resize_image(img_in_des, 32, w_relative) - if img_bin_curved: + if img_bin_curved is not None: img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) else: mask_in_des = mask_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] img_in_des = img_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] - if img_bin_curved: + if img_bin_curved is not None: img_bin_in_des = img_bin_in_des[y_n:y_n+h_n, x_n:x_n+w_n, :] w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) if w_relative==0: w_relative = img_in_des.shape[1] img_in_des = resize_image(img_in_des, 32, w_relative) - if img_bin_curved: + if img_bin_curved is not None: img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) else: img_in_des = np.copy(img_in) - if img_bin_curved: + if img_bin_curved is not None: img_bin_in_des = np.copy(img_bin_in) w_relative = int(32 * img_in_des.shape[1]/float(img_in_des.shape[0]) ) if w_relative==0: w_relative = img_in_des.shape[1] img_in_des = resize_image(img_in_des, 32, w_relative) - if img_bin_curved: + if img_bin_curved is not None: img_bin_in_des = resize_image(img_bin_in_des, 32, w_relative) w_tot_des+=img_in_des.shape[1] w_tot_des_list.append(img_in_des.shape[1]) imgs_deskewed_list.append(img_in_des) - if img_bin_curved: + if img_bin_curved is not None: imgs_bin_deskewed_list.append(img_bin_in_des) img_final_deskewed = np.zeros((32, w_tot_des, 3))+255 - if img_bin_curved: + if img_bin_curved is not None: img_bin_final_deskewed = np.zeros((32, w_tot_des, 3))+255 else: img_bin_final_deskewed = None @@ -350,7 +350,7 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, w_indexer = 0 for ind in range(len(w_tot_des_list)): img_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_deskewed_list[ind][:,:,:] - if 
img_bin_curved: + if img_bin_curved is not None: img_bin_final_deskewed[:,w_indexer:w_indexer+w_tot_des_list[ind],:] = imgs_bin_deskewed_list[ind][:,:,:] w_indexer = w_indexer+w_tot_des_list[ind] return img_final_deskewed, img_bin_final_deskewed From 928a548b70197c22a26721073fde208f6b4f81b5 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sat, 31 May 2025 01:09:14 +0200 Subject: [PATCH 24/40] Parametrize OCR for handling curved lines --- src/eynollah/eynollah.py | 10 +++++----- src/eynollah/utils/utils_ocr.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 0a9248e..6c00329 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5353,7 +5353,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_step_1150000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + self.model_ocr_dir = dir_models + "/model_step_1225000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5642,7 +5642,7 @@ class Eynollah_ocr: if self.draw_texts_on_image: total_bb_coordinates.append([x,y,w,h]) - + w_scaled = w * image_height/float(h) img_poly_on_img = np.copy(img) @@ -5684,7 +5684,7 @@ class Eynollah_ocr: img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :] img_crop_bin[mask_poly==0] = 255 - if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 100: + if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90: if self.prediction_with_both_of_rgb_and_bin: img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) else: @@ -5698,7 +5698,7 @@ class Eynollah_ocr: if type_textregion=='drop-capital': pass else: - if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 100: + if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90: if self.prediction_with_both_of_rgb_and_bin: img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) else: @@ -5708,7 +5708,7 @@ class Eynollah_ocr: if not self.export_textline_images_and_text: - if w_scaled < 640:#1.5*image_width: + if w_scaled < 530:#640:#1.5*image_width: img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) if angle_degrees > 15: diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index aa1efa6..81a8ae1 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -241,7 +241,7 @@ def return_splitting_point_of_image(image_to_spliited): peaks_real = peaks_real[(peaks_realwidth1)] arg_sort = np.argsort(sum_smoothed[peaks_real]) - peaks_sort_4 = peaks_real[arg_sort][::-1][:4] + peaks_sort_4 = peaks_real[arg_sort][::-1][:3] return np.sort(peaks_sort_4) From cc36694dfdab852e27780187f15da1155423bd02 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sun, 1 Jun 2025 15:53:04 +0200 Subject: [PATCH 25/40] image enhancer is integrated --- src/eynollah/cli.py | 69 +++ src/eynollah/eynollah.py | 234 +--------- src/eynollah/image_enhancer.py | 756 +++++++++++++++++++++++++++++++++ 3 files changed, 830 insertions(+), 229 deletions(-) create mode 100644 src/eynollah/image_enhancer.py diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py 
index 2d0d6f9..840bc4b 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -3,6 +3,7 @@ import click from ocrd_utils import initLogging, getLevelName, getLogger from eynollah.eynollah import Eynollah, Eynollah_ocr from eynollah.sbb_binarize import SbbBinarizer +from eynollah.image_enhancer import Enhancer @click.group() def main(): @@ -70,6 +71,74 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) +@main.command() +@click.option( + "--image", + "-i", + help="image filename", + type=click.Path(exists=True, dir_okay=False), +) + +@click.option( + "--out", + "-o", + help="directory to write output xml data", + type=click.Path(exists=True, file_okay=False), + required=True, +) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) +@click.option( + "--dir_in", + "-di", + help="directory of images", + type=click.Path(exists=True, file_okay=False), +) +@click.option( + "--model", + "-m", + help="directory of models", + type=click.Path(exists=True, file_okay=False), + required=True, +) + +@click.option( + "--num_col_upper", + "-ncu", + help="lower limit of columns in document image", +) +@click.option( + "--num_col_lower", + "-ncl", + help="upper limit of columns in document image", +) +@click.option( + "--log_level", + "-l", + type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), + help="Override log level globally to this", +) + +def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_lower, log_level): + initLogging() + if log_level: + getLogger('enhancement').setLevel(getLevelName(log_level)) + assert image or dir_in, "Either a single image -i or a dir_in -di is required" + enhancer_object = Enhancer( + model, + logger=getLogger('enhancement'), + dir_out=out, + num_col_upper=num_col_upper, + num_col_lower=num_col_lower, + ) + if dir_in: + enhancer_object.run(dir_in=dir_in, overwrite=overwrite) + else: + enhancer_object.run(image_filename=image, overwrite=overwrite) @main.command() @click.option( diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6c00329..cf540d3 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -3612,25 +3612,12 @@ class Eynollah: inference_bs = 3 - cv2.imwrite('textregions.png', text_regions_p*50) - cv2.imwrite('sep.png', (text_regions_p[:,:]==6)*255) - ver_kernel = np.ones((5, 1), dtype=np.uint8) hor_kernel = np.ones((1, 5), dtype=np.uint8) - - #separators = (text_regions_p[:,:]==6)*1 - #text_regions_p[text_regions_p[:,:]==6] = 0 - #separators = separators.astype('uint8') - - #separators = cv2.erode(separators , hor_kernel, iterations=1) - #text_regions_p[separators[:,:]==1] = 6 - - #cv2.imwrite('sep_new.png', (text_regions_p[:,:]==6)*255) - min_cont_size_to_be_dilated = 10 - if len(contours_only_text_parent)>min_cont_size_to_be_dilated: + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: cx_conts, cy_conts, x_min_conts, x_max_conts, y_min_conts, y_max_conts, _ = find_new_features_of_contours(contours_only_text_parent) args_cont_located = np.array(range(len(contours_only_text_parent))) @@ -3672,7 +3659,6 @@ class Eynollah: text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=5) text_regions_p_textregions_dilated[text_regions_p[:,:]>1] = 0 - cv2.imwrite('text_regions_p_textregions_dilated.png', text_regions_p_textregions_dilated*255) contours_only_dilated, hir_on_text_dilated = 
return_contours_of_image(text_regions_p_textregions_dilated) contours_only_dilated = return_parent_contours(contours_only_dilated, hir_on_text_dilated) @@ -3723,21 +3709,20 @@ class Eynollah: img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12, int(x_min_main[j]):int(x_max_main[j])] = 1 co_text_all_org = contours_only_text_parent + contours_only_text_parent_h - if len(contours_only_text_parent)>min_cont_size_to_be_dilated: + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: co_text_all = contours_only_dilated + contours_only_text_parent_h else: co_text_all = contours_only_text_parent + contours_only_text_parent_h else: co_text_all_org = contours_only_text_parent - if len(contours_only_text_parent)>min_cont_size_to_be_dilated: + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: co_text_all = contours_only_dilated else: co_text_all = contours_only_text_parent if not len(co_text_all): return [], [] - print(len(co_text_all), "co_text_all") - print(len(co_text_all_org), "co_text_all_org") + labels_con = np.zeros((int(y_len /6.), int(x_len/6.), len(co_text_all)), dtype=bool) co_text_all = [(i/6).astype(int) for i in co_text_all] for i in range(len(co_text_all)): @@ -3805,7 +3790,7 @@ class Eynollah: ordered = [i[0] for i in ordered] - if len(contours_only_text_parent)>min_cont_size_to_be_dilated: + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: org_contours_indexes = [] for ind in range(len(ordered)): region_with_curr_order = ordered[ind] @@ -3823,215 +3808,6 @@ class Eynollah: else: region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] return ordered, region_ids - - - ####def return_start_and_end_of_common_text_of_textline_ocr(self, textline_image, ind_tot): - ####width = np.shape(textline_image)[1] - ####height = np.shape(textline_image)[0] - ####common_window = int(0.2*width) - - ####width1 = int ( width/2. - common_window ) - ####width2 = int ( width/2. + common_window ) - - ####img_sum = np.sum(textline_image[:,:,0], axis=0) - ####sum_smoothed = gaussian_filter1d(img_sum, 3) - - ####peaks_real, _ = find_peaks(sum_smoothed, height=0) - ####if len(peaks_real)>70: - - ####peaks_real = peaks_real[(peaks_realwidth1)] - - ####arg_sort = np.argsort(sum_smoothed[peaks_real]) - ####arg_sort4 =arg_sort[::-1][:4] - ####peaks_sort_4 = peaks_real[arg_sort][::-1][:4] - ####argsort_sorted = np.argsort(peaks_sort_4) - - ####first_4_sorted = peaks_sort_4[argsort_sorted] - ####y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] - #####print(first_4_sorted,'first_4_sorted') - - ####arg_sortnew = np.argsort(y_4_sorted) - ####peaks_final =np.sort( first_4_sorted[arg_sortnew][2:] ) - - #####plt.figure(ind_tot) - #####plt.imshow(textline_image) - #####plt.plot([peaks_final[0], peaks_final[0]], [0, height-1]) - #####plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) - #####plt.savefig('./'+str(ind_tot)+'.png') - - ####return peaks_final[0], peaks_final[1] - ####else: - ####pass - - ##def return_start_and_end_of_common_text_of_textline_ocr_without_common_section(self, textline_image, ind_tot): - ##width = np.shape(textline_image)[1] - ##height = np.shape(textline_image)[0] - ##common_window = int(0.06*width) - - ##width1 = int ( width/2. - common_window ) - ##width2 = int ( width/2. 
+ common_window ) - - ##img_sum = np.sum(textline_image[:,:,0], axis=0) - ##sum_smoothed = gaussian_filter1d(img_sum, 3) - - ##peaks_real, _ = find_peaks(sum_smoothed, height=0) - ##if len(peaks_real)>70: - ###print(len(peaks_real), 'len(peaks_real)') - - ##peaks_real = peaks_real[(peaks_realwidth1)] - - ##arg_max = np.argmax(sum_smoothed[peaks_real]) - ##peaks_final = peaks_real[arg_max] - - ###plt.figure(ind_tot) - ###plt.imshow(textline_image) - ###plt.plot([peaks_final, peaks_final], [0, height-1]) - ####plt.plot([peaks_final[1], peaks_final[1]], [0, height-1]) - ###plt.savefig('./'+str(ind_tot)+'.png') - - ##return peaks_final - ##else: - ##return None - - ###def return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - ###self, peaks_real, sum_smoothed, start_split, end_split): - - ###peaks_real = peaks_real[(peaks_realstart_split)] - - ###arg_sort = np.argsort(sum_smoothed[peaks_real]) - ###arg_sort4 =arg_sort[::-1][:4] - ###peaks_sort_4 = peaks_real[arg_sort][::-1][:4] - ###argsort_sorted = np.argsort(peaks_sort_4) - - ###first_4_sorted = peaks_sort_4[argsort_sorted] - ###y_4_sorted = sum_smoothed[peaks_real][arg_sort4[argsort_sorted]] - ####print(first_4_sorted,'first_4_sorted') - - ###arg_sortnew = np.argsort(y_4_sorted) - ###peaks_final =np.sort( first_4_sorted[arg_sortnew][3:] ) - ###return peaks_final[0] - - ###def return_start_and_end_of_common_text_of_textline_ocr_new(self, textline_image, ind_tot): - ###width = np.shape(textline_image)[1] - ###height = np.shape(textline_image)[0] - ###common_window = int(0.15*width) - - ###width1 = int ( width/2. - common_window ) - ###width2 = int ( width/2. + common_window ) - ###mid = int(width/2.) - - ###img_sum = np.sum(textline_image[:,:,0], axis=0) - ###sum_smoothed = gaussian_filter1d(img_sum, 3) - - ###peaks_real, _ = find_peaks(sum_smoothed, height=0) - ###if len(peaks_real)>70: - ###peak_start = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - ###peaks_real, sum_smoothed, width1, mid+2) - ###peak_end = self.return_start_and_end_of_common_text_of_textline_ocr_new_splitted( - ###peaks_real, sum_smoothed, mid-2, width2) - - ####plt.figure(ind_tot) - ####plt.imshow(textline_image) - ####plt.plot([peak_start, peak_start], [0, height-1]) - ####plt.plot([peak_end, peak_end], [0, height-1]) - ####plt.savefig('./'+str(ind_tot)+'.png') - - ###return peak_start, peak_end - ###else: - ###pass - - ##def return_ocr_of_textline_without_common_section( - ##self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot): - - ##if h2w_ratio > 0.05: - ##pixel_values = processor(textline_image, return_tensors="pt").pixel_values - ##generated_ids = model_ocr.generate(pixel_values.to(device)) - ##generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - ##else: - ###width = np.shape(textline_image)[1] - ###height = np.shape(textline_image)[0] - ###common_window = int(0.3*width) - ###width1 = int ( width/2. - common_window ) - ###width2 = int ( width/2. 
+ common_window ) - - ##split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section( - ##textline_image, ind_tot) - ##if split_point: - ##image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height)) - ##image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height)) - - ###pixel_values1 = processor(image1, return_tensors="pt").pixel_values - ###pixel_values2 = processor(image2, return_tensors="pt").pixel_values - - ##pixel_values_merged = processor([image1,image2], return_tensors="pt").pixel_values - ##generated_ids_merged = model_ocr.generate(pixel_values_merged.to(device)) - ##generated_text_merged = processor.batch_decode(generated_ids_merged, skip_special_tokens=True) - - ###print(generated_text_merged,'generated_text_merged') - - ###generated_ids1 = model_ocr.generate(pixel_values1.to(device)) - ###generated_ids2 = model_ocr.generate(pixel_values2.to(device)) - - ###generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] - ###generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] - - ###generated_text = generated_text1 + ' ' + generated_text2 - ##generated_text = generated_text_merged[0] + ' ' + generated_text_merged[1] - - ###print(generated_text1,'generated_text1') - ###print(generated_text2, 'generated_text2') - ###print('########################################') - ##else: - ##pixel_values = processor(textline_image, return_tensors="pt").pixel_values - ##generated_ids = model_ocr.generate(pixel_values.to(device)) - ##generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - ###print(generated_text,'generated_text') - ###print('########################################') - ##return generated_text - - ###def return_ocr_of_textline( - ###self, textline_image, model_ocr, processor, device, width_textline, h2w_ratio,ind_tot): - - ###if h2w_ratio > 0.05: - ###pixel_values = processor(textline_image, return_tensors="pt").pixel_values - ###generated_ids = model_ocr.generate(pixel_values.to(device)) - ###generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - ###else: - ####width = np.shape(textline_image)[1] - ####height = np.shape(textline_image)[0] - ####common_window = int(0.3*width) - ####width1 = int ( width/2. - common_window ) - ####width2 = int ( width/2. 
+ common_window ) - - ###try: - ###width1, width2 = self.return_start_and_end_of_common_text_of_textline_ocr_new(textline_image, ind_tot) - - ###image1 = textline_image[:, :width2,:]# image.crop((0, 0, width2, height)) - ###image2 = textline_image[:, width1:,:]#image.crop((width1, 0, width, height)) - - ###pixel_values1 = processor(image1, return_tensors="pt").pixel_values - ###pixel_values2 = processor(image2, return_tensors="pt").pixel_values - - ###generated_ids1 = model_ocr.generate(pixel_values1.to(device)) - ###generated_ids2 = model_ocr.generate(pixel_values2.to(device)) - - ###generated_text1 = processor.batch_decode(generated_ids1, skip_special_tokens=True)[0] - ###generated_text2 = processor.batch_decode(generated_ids2, skip_special_tokens=True)[0] - ####print(generated_text1,'generated_text1') - ####print(generated_text2, 'generated_text2') - ####print('########################################') - - ###match = sq(None, generated_text1, generated_text2).find_longest_match( - ###0, len(generated_text1), 0, len(generated_text2)) - ###generated_text = generated_text1 + generated_text2[match.b+match.size:] - ###except: - ###pixel_values = processor(textline_image, return_tensors="pt").pixel_values - ###generated_ids = model_ocr.generate(pixel_values.to(device)) - ###generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - ###return generated_text - def return_list_of_contours_with_desired_order(self, ls_cons, sorted_indexes): return [ls_cons[sorted_indexes[index]] for index in range(len(sorted_indexes))] diff --git a/src/eynollah/image_enhancer.py b/src/eynollah/image_enhancer.py new file mode 100644 index 0000000..71445f7 --- /dev/null +++ b/src/eynollah/image_enhancer.py @@ -0,0 +1,756 @@ +""" +Image enhancer. The output can be written as same scale of input or in new predicted scale. 
+""" + +from logging import Logger +from difflib import SequenceMatcher as sq +from PIL import Image, ImageDraw, ImageFont +import math +import os +import sys +import time +from typing import Optional +import atexit +import warnings +from functools import partial +from pathlib import Path +from multiprocessing import cpu_count +import gc +import copy +from loky import ProcessPoolExecutor +import xml.etree.ElementTree as ET +import cv2 +import numpy as np +from ocrd import OcrdPage +from ocrd_utils import getLogger, tf_disable_interactive_logs +import statistics +from tensorflow.keras.models import load_model +from .utils.resize import resize_image +from .utils import ( + crop_image_inside_box +) + +DPI_THRESHOLD = 298 +KERNEL = np.ones((5, 5), np.uint8) + + +class Enhancer: + def __init__( + self, + dir_models : str, + dir_out : Optional[str] = None, + num_col_upper : Optional[int] = None, + num_col_lower : Optional[int] = None, + logger : Optional[Logger] = None, + ): + self.dir_out = dir_out + self.input_binary = False + self.light_version = False + if num_col_upper: + self.num_col_upper = int(num_col_upper) + else: + self.num_col_upper = num_col_upper + if num_col_lower: + self.num_col_lower = int(num_col_lower) + else: + self.num_col_lower = num_col_lower + + self.logger = logger if logger else getLogger('enhancement') + # for parallelization of CPU-intensive tasks: + self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) + atexit.register(self.executor.shutdown) + self.dir_models = dir_models + self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" + self.model_dir_of_col_classifier = dir_models + "/eynollah-column-classifier_20210425" + self.model_page_dir = dir_models + "/eynollah-page-extraction_20210425" + + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + self.logger.warning("no GPU device available") + + self.model_page = self.our_load_model(self.model_page_dir) + self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier) + self.model_enhancement = self.our_load_model(self.model_dir_of_enhancement) + + def cache_images(self, image_filename=None, image_pil=None, dpi=None): + ret = {} + t_c0 = time.time() + if image_filename: + ret['img'] = cv2.imread(image_filename) + if self.light_version: + self.dpi = 100 + else: + self.dpi = 0#check_dpi(image_filename) + else: + ret['img'] = pil2cv(image_pil) + if self.light_version: + self.dpi = 100 + else: + self.dpi = 0#check_dpi(image_pil) + ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY) + for prefix in ('', '_grayscale'): + ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8) + self._imgs = ret + if dpi is not None: + self.dpi = dpi + + def reset_file_name_dir(self, image_filename): + t_c = time.time() + self.cache_images(image_filename=image_filename) + self.output_filename = os.path.join(self.dir_out, Path(image_filename).stem +'.png') + + def imread(self, grayscale=False, uint8=True): + key = 'img' + if grayscale: + key += '_grayscale' + if uint8: + key += '_uint8' + return self._imgs[key].copy() + + def isNaN(self, num): + return num != num + + @staticmethod + def our_load_model(model_file): + if model_file.endswith('.h5') and Path(model_file[:-3]).exists(): + # prefer SavedModel over HDF5 format if it exists + model_file = model_file[:-3] + try: + model = load_model(model_file, compile=False) + except: + model = load_model(model_file, compile=False, 
custom_objects={ + "PatchEncoder": PatchEncoder, "Patches": Patches}) + return model + + def predict_enhancement(self, img): + self.logger.debug("enter predict_enhancement") + + img_height_model = self.model_enhancement.layers[-1].output_shape[1] + img_width_model = self.model_enhancement.layers[-1].output_shape[2] + if img.shape[0] < img_height_model: + img = cv2.resize(img, (img.shape[1], img_width_model), interpolation=cv2.INTER_NEAREST) + if img.shape[1] < img_width_model: + img = cv2.resize(img, (img_height_model, img.shape[0]), interpolation=cv2.INTER_NEAREST) + margin = int(0.1 * img_width_model) + width_mid = img_width_model - 2 * margin + height_mid = img_height_model - 2 * margin + img = img / 255. + img_h = img.shape[0] + img_w = img.shape[1] + + prediction_true = np.zeros((img_h, img_w, 3)) + nxf = img_w / float(width_mid) + nyf = img_h / float(height_mid) + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + + for i in range(nxf): + for j in range(nyf): + if i == 0: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + else: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + if j == 0: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + else: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + + if index_x_u > img_w: + index_x_u = img_w + index_x_d = img_w - img_width_model + if index_y_u > img_h: + index_y_u = img_h + index_y_d = img_h - img_height_model + + img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :] + label_p_pred = self.model_enhancement.predict(img_patch, verbose=0) + seg = label_p_pred[0, :, :, :] * 255 + + if i == 0 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + 0:index_x_u - margin] = \ + seg[0:-margin or None, + 0:-margin or None] + elif i == nxf - 1 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + margin:index_x_u - 0] = \ + seg[margin:, + margin:] + elif i == 0 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + 0:index_x_u - margin] = \ + seg[margin:, + 0:-margin or None] + elif i == nxf - 1 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + margin:index_x_u - 0] = \ + seg[0:-margin or None, + margin:] + elif i == 0 and j != 0 and j != nyf - 1: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + 0:index_x_u - margin] = \ + seg[margin:-margin or None, + 0:-margin or None] + elif i == nxf - 1 and j != 0 and j != nyf - 1: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + margin:index_x_u - 0] = \ + seg[margin:-margin or None, + margin:] + elif i != 0 and i != nxf - 1 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + margin:index_x_u - margin] = \ + seg[0:-margin or None, + margin:-margin or None] + elif i != 0 and i != nxf - 1 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + margin:index_x_u - margin] = \ + seg[margin:, + margin:-margin or None] + else: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + margin:index_x_u - margin] = \ + seg[margin:-margin or None, + margin:-margin or None] + + prediction_true = prediction_true.astype(int) + return prediction_true + + def calculate_width_height_by_columns(self, img, num_col, width_early, label_p_pred): + self.logger.debug("enter calculate_width_height_by_columns") + if num_col == 1 and width_early < 1100: + 
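# note: the if/elif chain starting on the previous line maps the detected column count
# to a preferred working width (1 -> 2000 px, 2 -> 2400, 3 -> 3000, 4 -> 4000,
# 5 -> 5000, 6 -> 6500) and keeps the original width when it already lies inside the
# acceptable band for that count; the height is scaled proportionally, and the resize
# is skipped when the classifier confidence is below 0.9 and the target width would
# shrink the image, or when the resized height would reach 8000 px or more.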
img_w_new = 2000 + elif num_col == 1 and width_early >= 2500: + img_w_new = 2000 + elif num_col == 1 and width_early >= 1100 and width_early < 2500: + img_w_new = width_early + elif num_col == 2 and width_early < 2000: + img_w_new = 2400 + elif num_col == 2 and width_early >= 3500: + img_w_new = 2400 + elif num_col == 2 and width_early >= 2000 and width_early < 3500: + img_w_new = width_early + elif num_col == 3 and width_early < 2000: + img_w_new = 3000 + elif num_col == 3 and width_early >= 4000: + img_w_new = 3000 + elif num_col == 3 and width_early >= 2000 and width_early < 4000: + img_w_new = width_early + elif num_col == 4 and width_early < 2500: + img_w_new = 4000 + elif num_col == 4 and width_early >= 5000: + img_w_new = 4000 + elif num_col == 4 and width_early >= 2500 and width_early < 5000: + img_w_new = width_early + elif num_col == 5 and width_early < 3700: + img_w_new = 5000 + elif num_col == 5 and width_early >= 7000: + img_w_new = 5000 + elif num_col == 5 and width_early >= 3700 and width_early < 7000: + img_w_new = width_early + elif num_col == 6 and width_early < 4500: + img_w_new = 6500 # 5400 + else: + img_w_new = width_early + img_h_new = img_w_new * img.shape[0] // img.shape[1] + + if label_p_pred[0][int(num_col - 1)] < 0.9 and img_w_new < width_early: + img_new = np.copy(img) + num_column_is_classified = False + #elif label_p_pred[0][int(num_col - 1)] < 0.8 and img_h_new >= 8000: + elif img_h_new >= 8000: + img_new = np.copy(img) + num_column_is_classified = False + else: + img_new = resize_image(img, img_h_new, img_w_new) + num_column_is_classified = True + + return img_new, num_column_is_classified + + def early_page_for_num_of_column_classification(self,img_bin): + self.logger.debug("enter early_page_for_num_of_column_classification") + if self.input_binary: + img = np.copy(img_bin).astype(np.uint8) + else: + img = self.imread() + img = cv2.GaussianBlur(img, (5, 5), 0) + img_page_prediction = self.do_prediction(False, img, self.model_page) + + imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) + thresh = cv2.dilate(thresh, KERNEL, iterations=3) + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + if len(contours)>0: + cnt_size = np.array([cv2.contourArea(contours[j]) + for j in range(len(contours))]) + cnt = contours[np.argmax(cnt_size)] + box = cv2.boundingRect(cnt) + else: + box = [0, 0, img.shape[1], img.shape[0]] + cropped_page, page_coord = crop_image_inside_box(box, img) + + self.logger.debug("exit early_page_for_num_of_column_classification") + return cropped_page, page_coord + + def calculate_width_height_by_columns_1_2(self, img, num_col, width_early, label_p_pred): + self.logger.debug("enter calculate_width_height_by_columns") + if num_col == 1: + img_w_new = 1000 + else: + img_w_new = 1300 + img_h_new = img_w_new * img.shape[0] // img.shape[1] + + if label_p_pred[0][int(num_col - 1)] < 0.9 and img_w_new < width_early: + img_new = np.copy(img) + num_column_is_classified = False + #elif label_p_pred[0][int(num_col - 1)] < 0.8 and img_h_new >= 8000: + elif img_h_new >= 8000: + img_new = np.copy(img) + num_column_is_classified = False + else: + img_new = resize_image(img, img_h_new, img_w_new) + num_column_is_classified = True + + return img_new, num_column_is_classified + + def resize_and_enhance_image_with_column_classifier(self, light_version): + self.logger.debug("enter resize_and_enhance_image_with_column_classifier") + dpi = 0#self.dpi + self.logger.info("Detected 
%s DPI", dpi) + if self.input_binary: + img = self.imread() + prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5) + prediction_bin = 255 * (prediction_bin[:,:,0]==0) + prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8) + img= np.copy(prediction_bin) + img_bin = prediction_bin + else: + img = self.imread() + self.h_org, self.w_org = img.shape[:2] + img_bin = None + + width_early = img.shape[1] + t1 = time.time() + _, page_coord = self.early_page_for_num_of_column_classification(img_bin) + + self.image_page_org_size = img[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :] + self.page_coord = page_coord + + if self.num_col_upper and not self.num_col_lower: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + elif self.num_col_lower and not self.num_col_upper: + num_col = self.num_col_lower + label_p_pred = [np.ones(6)] + elif not self.num_col_upper and not self.num_col_lower: + if self.input_binary: + img_in = np.copy(img) + img_in = img_in / 255.0 + img_in = cv2.resize(img_in, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = img_in.reshape(1, 448, 448, 3) + else: + img_1ch = self.imread(grayscale=True) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = self.model_classifier.predict(img_in, verbose=0) + num_col = np.argmax(label_p_pred[0]) + 1 + elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower): + if self.input_binary: + img_in = np.copy(img) + img_in = img_in / 255.0 + img_in = cv2.resize(img_in, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = img_in.reshape(1, 448, 448, 3) + else: + img_1ch = self.imread(grayscale=True) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = self.model_classifier.predict(img_in, verbose=0) + num_col = np.argmax(label_p_pred[0]) + 1 + + if num_col > self.num_col_upper: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + if num_col < self.num_col_lower: + num_col = self.num_col_lower + label_p_pred = [np.ones(6)] + else: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + + self.logger.info("Found %d columns (%s)", num_col, np.around(label_p_pred, decimals=5)) + + if dpi < DPI_THRESHOLD: + if light_version and num_col in (1,2): + img_new, num_column_is_classified = self.calculate_width_height_by_columns_1_2( + img, num_col, width_early, label_p_pred) + else: + img_new, num_column_is_classified = self.calculate_width_height_by_columns( + img, num_col, width_early, label_p_pred) + if light_version: + image_res = np.copy(img_new) + else: + image_res = self.predict_enhancement(img_new) + is_image_enhanced = True + + else: + num_column_is_classified = True + image_res = np.copy(img) + is_image_enhanced = False + + self.logger.debug("exit 
resize_and_enhance_image_with_column_classifier") + return is_image_enhanced, img, image_res, num_col, num_column_is_classified, img_bin + def do_prediction( + self, patches, img, model, + n_batch_inference=1, marginal_of_patch_percent=0.1, + thresholding_for_some_classes_in_light_version=False, + thresholding_for_artificial_class_in_light_version=False, thresholding_for_fl_light_version=False, threshold_art_class_textline=0.1): + + self.logger.debug("enter do_prediction") + img_height_model = model.layers[-1].output_shape[1] + img_width_model = model.layers[-1].output_shape[2] + + if not patches: + img_h_page = img.shape[0] + img_w_page = img.shape[1] + img = img / float(255.0) + img = resize_image(img, img_height_model, img_width_model) + + label_p_pred = model.predict(img[np.newaxis], verbose=0) + seg = np.argmax(label_p_pred, axis=3)[0] + + if thresholding_for_artificial_class_in_light_version: + seg_art = label_p_pred[0,:,:,2] + + seg_art[seg_art0] =1 + + skeleton_art = skeletonize(seg_art) + skeleton_art = skeleton_art*1 + + seg[skeleton_art==1]=2 + + if thresholding_for_fl_light_version: + seg_header = label_p_pred[0,:,:,2] + + seg_header[seg_header<0.2] = 0 + seg_header[seg_header>0] =1 + + seg[seg_header==1]=2 + + seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) + prediction_true = resize_image(seg_color, img_h_page, img_w_page).astype(np.uint8) + return prediction_true + + if img.shape[0] < img_height_model: + img = resize_image(img, img_height_model, img.shape[1]) + if img.shape[1] < img_width_model: + img = resize_image(img, img.shape[0], img_width_model) + + self.logger.debug("Patch size: %sx%s", img_height_model, img_width_model) + margin = int(marginal_of_patch_percent * img_height_model) + width_mid = img_width_model - 2 * margin + height_mid = img_height_model - 2 * margin + img = img / 255. 
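The do_prediction method being patched here covers the page with model-sized windows whose margins overlap, so the unreliable borders of each per-window prediction can be dropped when the results are stitched back together. A minimal, self-contained sketch of that window arithmetic follows; the patch size and margin are assumptions for the example, not the model's real input shape.

import numpy as np

def tile_coordinates(img_h, img_w, patch_h, patch_w, margin):
    # windows advance by the patch size minus twice the margin, so neighbours overlap
    stride_y = patch_h - 2 * margin
    stride_x = patch_w - 2 * margin
    n_x = int(np.ceil(img_w / stride_x))
    n_y = int(np.ceil(img_h / stride_y))
    for i in range(n_x):
        for j in range(n_y):
            x0 = min(i * stride_x, img_w - patch_w)   # clamp the last column of windows
            y0 = min(j * stride_y, img_h - patch_h)   # clamp the last row of windows
            yield y0, y0 + patch_h, x0, x0 + patch_w

# example: a 1000 x 1400 page, 448 x 448 patches, 10 % margin
windows = list(tile_coordinates(1000, 1400, 448, 448, int(0.1 * 448)))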
+ #img = img.astype(np.float16) + img_h = img.shape[0] + img_w = img.shape[1] + prediction_true = np.zeros((img_h, img_w, 3)) + mask_true = np.zeros((img_h, img_w)) + nxf = img_w / float(width_mid) + nyf = img_h / float(height_mid) + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + + list_i_s = [] + list_j_s = [] + list_x_u = [] + list_x_d = [] + list_y_u = [] + list_y_d = [] + + batch_indexer = 0 + img_patch = np.zeros((n_batch_inference, img_height_model, img_width_model, 3)) + for i in range(nxf): + for j in range(nyf): + if i == 0: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + else: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + if j == 0: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + else: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + if index_x_u > img_w: + index_x_u = img_w + index_x_d = img_w - img_width_model + if index_y_u > img_h: + index_y_u = img_h + index_y_d = img_h - img_height_model + + list_i_s.append(i) + list_j_s.append(j) + list_x_u.append(index_x_u) + list_x_d.append(index_x_d) + list_y_d.append(index_y_d) + list_y_u.append(index_y_u) + + img_patch[batch_indexer,:,:,:] = img[index_y_d:index_y_u, index_x_d:index_x_u, :] + batch_indexer += 1 + + if (batch_indexer == n_batch_inference or + # last batch + i == nxf - 1 and j == nyf - 1): + self.logger.debug("predicting patches on %s", str(img_patch.shape)) + label_p_pred = model.predict(img_patch, verbose=0) + seg = np.argmax(label_p_pred, axis=3) + + if thresholding_for_some_classes_in_light_version: + seg_not_base = label_p_pred[:,:,:,4] + seg_not_base[seg_not_base>0.03] =1 + seg_not_base[seg_not_base<1] =0 + + seg_line = label_p_pred[:,:,:,3] + seg_line[seg_line>0.1] =1 + seg_line[seg_line<1] =0 + + seg_background = label_p_pred[:,:,:,0] + seg_background[seg_background>0.25] =1 + seg_background[seg_background<1] =0 + + seg[seg_not_base==1]=4 + seg[seg_background==1]=0 + seg[(seg_line==1) & (seg==0)]=3 + if thresholding_for_artificial_class_in_light_version: + seg_art = label_p_pred[:,:,:,2] + + seg_art[seg_art0] =1 + + ##seg[seg_art==1]=2 + + indexer_inside_batch = 0 + for i_batch, j_batch in zip(list_i_s, list_j_s): + seg_in = seg[indexer_inside_batch] + + if thresholding_for_artificial_class_in_light_version: + seg_in_art = seg_art[indexer_inside_batch] + + index_y_u_in = list_y_u[indexer_inside_batch] + index_y_d_in = list_y_d[indexer_inside_batch] + + index_x_u_in = list_x_u[indexer_inside_batch] + index_x_d_in = list_x_d[indexer_inside_batch] + + if i_batch == 0 and j_batch == 0: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin] = \ + seg_in[0:-margin or None, + 0:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + 0:-margin or None] + + elif i_batch == nxf - 1 and j_batch == nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0] = \ + seg_in[margin:, + margin:, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:, + margin:] + + elif i_batch == 0 and j_batch == nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + 
index_x_d_in + 0:index_x_u_in - margin] = \ + seg_in[margin:, + 0:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + 0:-margin or None] + + elif i_batch == nxf - 1 and j_batch == 0: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0] = \ + seg_in[0:-margin or None, + margin:, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[0:-margin or None, + margin:] + + elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin] = \ + seg_in[margin:-margin or None, + 0:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + 0:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + 0:-margin or None] + + elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0] = \ + seg_in[margin:-margin or None, + margin:, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - 0, 1] = \ + seg_in_art[margin:-margin or None, + margin:] + + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin] = \ + seg_in[0:-margin or None, + margin:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + 0:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[0:-margin or None, + margin:-margin or None] + + elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin] = \ + seg_in[margin:, + margin:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - 0, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:, + margin:-margin or None] + + else: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin] = \ + seg_in[margin:-margin or None, + margin:-margin or None, + np.newaxis] + if thresholding_for_artificial_class_in_light_version: + prediction_true[index_y_d_in + margin:index_y_u_in - margin, + index_x_d_in + margin:index_x_u_in - margin, 1] = \ + seg_in_art[margin:-margin or None, + margin:-margin or None] + indexer_inside_batch += 1 + + + list_i_s = [] + list_j_s = [] + list_x_u = [] + list_x_d = [] + list_y_u = [] + list_y_d = [] + + batch_indexer = 0 + img_patch[:] = 0 + + prediction_true = prediction_true.astype(np.uint8) + + if thresholding_for_artificial_class_in_light_version: + kernel_min = np.ones((3, 3), np.uint8) + prediction_true[:,:,0][prediction_true[:,:,0]==2] = 0 + + skeleton_art = skeletonize(prediction_true[:,:,1]) + skeleton_art = skeleton_art*1 + + skeleton_art = skeleton_art.astype('uint8') + + skeleton_art = cv2.dilate(skeleton_art, 
kernel_min, iterations=1) + + prediction_true[:,:,0][skeleton_art==1]=2 + #del model + gc.collect() + return prediction_true + + def run_enhancement(self, light_version): + t_in = time.time() + self.logger.info("Resizing and enhancing image...") + is_image_enhanced, img_org, img_res, num_col_classifier, num_column_is_classified, img_bin = \ + self.resize_and_enhance_image_with_column_classifier(light_version) + + self.logger.info("Image was %senhanced.", '' if is_image_enhanced else 'not ') + return img_res, is_image_enhanced, num_col_classifier, num_column_is_classified + + + def run_single(self): + t0 = time.time() + img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(light_version=False) + + return img_res + + + def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False): + """ + Get image and scales, then extract the page of scanned image + """ + self.logger.debug("enter run") + t0_tot = time.time() + + if dir_in: + self.ls_imgs = os.listdir(dir_in) + elif image_filename: + self.ls_imgs = [image_filename] + else: + raise ValueError("run requires either a single image filename or a directory") + + for img_filename in self.ls_imgs: + self.logger.info(img_filename) + t0 = time.time() + + self.reset_file_name_dir(os.path.join(dir_in or "", img_filename)) + #print("text region early -11 in %.1fs", time.time() - t0) + + if os.path.exists(self.output_filename): + if overwrite: + self.logger.warning("will overwrite existing output file '%s'", self.output_filename) + else: + self.logger.warning("will skip input for existing output file '%s'", self.output_filename) + continue + + image_enhanced = self.run_single() + img_enhanced_org_scale = resize_image(image_enhanced, self.h_org, self.w_org) + + cv2.imwrite(self.output_filename, img_enhanced_org_scale) + From d14bd162caa82030a9dee28ec2f063215bd64dce Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sun, 1 Jun 2025 22:10:13 +0200 Subject: [PATCH 26/40] saving enhanced image in org or scaled resolution --- src/eynollah/cli.py | 9 ++++++++- src/eynollah/eynollah.py | 5 ++--- src/eynollah/image_enhancer.py | 7 +++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 840bc4b..9398c47 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -116,6 +116,12 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) "-ncl", help="upper limit of columns in document image", ) +@click.option( + "--save_org_scale/--no_save_org_scale", + "-sos/-nosos", + is_flag=True, + help="if this parameter set to true, this tool will save the enhanced image in org scale.", +) @click.option( "--log_level", "-l", @@ -123,7 +129,7 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) help="Override log level globally to this", ) -def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_lower, log_level): +def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_lower, save_org_scale, log_level): initLogging() if log_level: getLogger('enhancement').setLevel(getLevelName(log_level)) @@ -134,6 +140,7 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low dir_out=out, num_col_upper=num_col_upper, num_col_lower=num_col_lower, + save_org_scale=save_org_scale, ) if dir_in: enhancer_object.run(dir_in=dir_in, overwrite=overwrite) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 
cf540d3..9c834e2 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5434,10 +5434,9 @@ class Eynollah_ocr: img_crop = img_poly_on_img[y:y+h, x:x+w, :] - #print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') if not self.do_not_mask_with_textline_contour: - if angle_degrees > 15: + if angle_degrees > 3: better_des_slope = get_orientation_moments(textline_coords) img_crop = rotate_image_with_padding(img_crop, better_des_slope ) @@ -5484,7 +5483,7 @@ class Eynollah_ocr: if not self.export_textline_images_and_text: - if w_scaled < 530:#640:#1.5*image_width: + if w_scaled < 640:#1.5*image_width: img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) if angle_degrees > 15: diff --git a/src/eynollah/image_enhancer.py b/src/eynollah/image_enhancer.py index 71445f7..c89f532 100644 --- a/src/eynollah/image_enhancer.py +++ b/src/eynollah/image_enhancer.py @@ -41,11 +41,13 @@ class Enhancer: dir_out : Optional[str] = None, num_col_upper : Optional[int] = None, num_col_lower : Optional[int] = None, + save_org_scale : bool = False, logger : Optional[Logger] = None, ): self.dir_out = dir_out self.input_binary = False self.light_version = False + self.save_org_scale = save_org_scale if num_col_upper: self.num_col_upper = int(num_col_upper) else: @@ -750,7 +752,8 @@ class Enhancer: continue image_enhanced = self.run_single() - img_enhanced_org_scale = resize_image(image_enhanced, self.h_org, self.w_org) + if self.save_org_scale: + image_enhanced = resize_image(image_enhanced, self.h_org, self.w_org) - cv2.imwrite(self.output_filename, img_enhanced_org_scale) + cv2.imwrite(self.output_filename, image_enhanced) From 7996afac69f6f7b8508fb24bb66ca3d5cd577c1d Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sun, 1 Jun 2025 22:44:50 +0200 Subject: [PATCH 27/40] image enhancer updated --- src/eynollah/image_enhancer.py | 40 +++++++--------------------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/src/eynollah/image_enhancer.py b/src/eynollah/image_enhancer.py index c89f532..983712d 100644 --- a/src/eynollah/image_enhancer.py +++ b/src/eynollah/image_enhancer.py @@ -225,47 +225,23 @@ class Enhancer: def calculate_width_height_by_columns(self, img, num_col, width_early, label_p_pred): self.logger.debug("enter calculate_width_height_by_columns") - if num_col == 1 and width_early < 1100: + if num_col == 1: img_w_new = 2000 - elif num_col == 1 and width_early >= 2500: - img_w_new = 2000 - elif num_col == 1 and width_early >= 1100 and width_early < 2500: - img_w_new = width_early - elif num_col == 2 and width_early < 2000: + elif num_col == 2: img_w_new = 2400 - elif num_col == 2 and width_early >= 3500: - img_w_new = 2400 - elif num_col == 2 and width_early >= 2000 and width_early < 3500: - img_w_new = width_early - elif num_col == 3 and width_early < 2000: + elif num_col == 3: img_w_new = 3000 - elif num_col == 3 and width_early >= 4000: - img_w_new = 3000 - elif num_col == 3 and width_early >= 2000 and width_early < 4000: - img_w_new = width_early - elif num_col == 4 and width_early < 2500: + elif num_col == 4: img_w_new = 4000 - elif num_col == 4 and width_early >= 5000: - img_w_new = 4000 - elif num_col == 4 and width_early >= 2500 and width_early < 5000: - img_w_new = width_early - elif num_col == 5 and width_early < 3700: + elif num_col == 5: img_w_new = 5000 - elif num_col == 5 and width_early >= 7000: - img_w_new = 5000 - elif num_col == 5 and 
width_early >= 3700 and width_early < 7000: - img_w_new = width_early - elif num_col == 6 and width_early < 4500: - img_w_new = 6500 # 5400 + elif num_col == 6: + img_w_new = 6500 else: img_w_new = width_early img_h_new = img_w_new * img.shape[0] // img.shape[1] - if label_p_pred[0][int(num_col - 1)] < 0.9 and img_w_new < width_early: - img_new = np.copy(img) - num_column_is_classified = False - #elif label_p_pred[0][int(num_col - 1)] < 0.8 and img_h_new >= 8000: - elif img_h_new >= 8000: + if img_h_new >= 8000: img_new = np.copy(img) num_column_is_classified = False else: From 065f1f9a9368def46ac0e4df4888bd29c168dea1 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 2 Jun 2025 18:21:33 +0200 Subject: [PATCH 28/40] Fix: Resolved OCR bug when text region type is undefined --- src/eynollah/eynollah.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 9c834e2..fc60f2e 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5399,7 +5399,10 @@ class Eynollah_ocr: indexer_text_region = 0 indexer_textlines = 0 for nn in root1.iter(region_tags): - type_textregion = nn.attrib['type'] + try: + type_textregion = nn.attrib['type'] + except: + type_textregion = 'paragraph' for child_textregion in nn: if child_textregion.tag.endswith("TextLine"): for child_textlines in child_textregion: @@ -5467,6 +5470,7 @@ class Eynollah_ocr: else: + better_des_slope = 0 img_crop[mask_poly==0] = 255 if self.prediction_with_both_of_rgb_and_bin: img_crop_bin[mask_poly==0] = 255 @@ -5486,7 +5490,7 @@ class Eynollah_ocr: if w_scaled < 640:#1.5*image_width: img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) - if angle_degrees > 15: + if abs(better_des_slope) > 45: cropped_lines_ver_index.append(1) else: cropped_lines_ver_index.append(0) @@ -5505,7 +5509,7 @@ class Eynollah_ocr: cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(1) - if angle_degrees > 15: + if abs(better_des_slope) > 45: cropped_lines_ver_index.append(1) else: cropped_lines_ver_index.append(0) @@ -5515,7 +5519,7 @@ class Eynollah_ocr: cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(-1) - if angle_degrees > 15: + if abs(better_des_slope) > 45: cropped_lines_ver_index.append(1) else: cropped_lines_ver_index.append(0) @@ -5531,7 +5535,7 @@ class Eynollah_ocr: cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(0) - if angle_degrees > 15: + if abs(better_des_slope) > 45: cropped_lines_ver_index.append(1) else: cropped_lines_ver_index.append(0) From 59ea493803e5bf9f8038e8411777d137d91e27b9 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 3 Jul 2025 11:50:47 +0200 Subject: [PATCH 29/40] decorated with confidence value for cnnrnn ocr model --- src/eynollah/eynollah.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index fc60f2e..3b9d898 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5129,7 +5129,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_step_1225000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + self.model_ocr_dir = dir_models + "/model_step_900000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" model_ocr = 
load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5487,7 +5487,7 @@ class Eynollah_ocr: if not self.export_textline_images_and_text: - if w_scaled < 640:#1.5*image_width: + if w_scaled < 750:#1.5*image_width: img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) if abs(better_des_slope) > 45: @@ -5580,6 +5580,7 @@ class Eynollah_ocr: if not self.export_textline_images_and_text: extracted_texts = [] + extracted_conf_value = [] n_iterations = math.ceil(len(cropped_lines) / self.b_s) @@ -5700,12 +5701,19 @@ class Eynollah_ocr: preds_bin[indices_to_be_replaced,:,:] = preds_flipped[indices_where_flipped_conf_value_is_higher, :, :] preds = (preds + preds_bin) / 2. + pred_texts = decode_batch_predictions(preds, self.num_to_char) + + preds_max = np.max(preds, axis=2 ) + preds_max_args = np.argmax(preds, axis=2 ) + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=256 + masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) for ib in range(imgs.shape[0]): pred_texts_ib = pred_texts[ib].replace("[UNK]", "") extracted_texts.append(pred_texts_ib) + extracted_conf_value.append(masked_means[ib]) del cropped_lines if self.prediction_with_both_of_rgb_and_bin: @@ -5713,7 +5721,10 @@ class Eynollah_ocr: gc.collect() extracted_texts_merged = [extracted_texts[ind] if cropped_lines_meging_indexing[ind]==0 else extracted_texts[ind]+" "+extracted_texts[ind+1] if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + + extracted_conf_value_merged = [extracted_conf_value[ind] if cropped_lines_meging_indexing[ind]==0 else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2. 
if cropped_lines_meging_indexing[ind]==1 else None for ind in range(len(cropped_lines_meging_indexing))] + extracted_conf_value_merged = [extracted_conf_value_merged[ind_cfm] for ind_cfm in range(len(extracted_texts_merged)) if extracted_texts_merged[ind_cfm] is not None] extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) @@ -5791,6 +5802,7 @@ class Eynollah_ocr: if not is_textline_text: text_subelement = ET.SubElement(child_textregion, 'TextEquiv') + text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") unicode_textline = ET.SubElement(text_subelement, 'Unicode') unicode_textline.text = extracted_texts_merged[indexer] else: @@ -5798,6 +5810,7 @@ class Eynollah_ocr: if childtest3.tag.endswith("TextEquiv"): for child_uc in childtest3: if child_uc.tag.endswith("Unicode"): + childtest3.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}") child_uc.text = extracted_texts_merged[indexer] indexer = indexer + 1 From e54ebaa23e89d0381157415e33d6324c3dd8aecd Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 3 Jul 2025 15:24:52 +0200 Subject: [PATCH 30/40] ocr: make sure that image height or width is not zero --- src/eynollah/eynollah.py | 4 ---- src/eynollah/utils/utils_ocr.py | 34 +++++++++++++++++++-------------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 3b9d898..1260a96 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5435,7 +5435,6 @@ class Eynollah_ocr: mask_poly = mask_poly[y:y+h, x:x+w, :] img_crop = img_poly_on_img[y:y+h, x:x+w, :] - #print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') if not self.do_not_mask_with_textline_contour: @@ -5482,9 +5481,6 @@ class Eynollah_ocr: img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) else: img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) - - - if not self.export_textline_images_and_text: if w_scaled < 750:#1.5*image_width: diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 81a8ae1..1e9162a 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -124,23 +124,26 @@ def return_textlines_split_if_needed(textline_image, textline_image_bin, predict else: return None, None def preprocess_and_resize_image_for_ocrcnn_model(img, image_height, image_width): - ratio = image_height /float(img.shape[0]) - w_ratio = int(ratio * img.shape[1]) - - if w_ratio <= image_width: - width_new = w_ratio + if img.shape[0]==0 or img.shape[1]==0: + img_fin = np.ones((image_height, image_width, 3)) else: - width_new = image_width + ratio = image_height /float(img.shape[0]) + w_ratio = int(ratio * img.shape[1]) - if width_new == 0: - width_new = img.shape[1] + if w_ratio <= image_width: + width_new = w_ratio + else: + width_new = image_width + + if width_new == 0: + width_new = img.shape[1] + - - img = resize_image(img, image_height, width_new) - img_fin = np.ones((image_height, image_width, 3))*255 + img = resize_image(img, image_height, width_new) + img_fin = np.ones((image_height, image_width, 3))*255 - img_fin[:,:width_new,:] = img[:,:,:] - img_fin = img_fin / 255. + img_fin[:,:width_new,:] = img[:,:,:] + img_fin = img_fin / 255. 
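The preprocessing helper modified here keeps the aspect ratio by resizing each text line crop to the target height and padding the remaining width with white, and the patch adds a guard so that zero-sized crops no longer crash it. A rough stand-alone sketch of the same idea; the 32 x 512 target mirrors the values used elsewhere in this series, while the 3-channel input is an assumption.

import numpy as np
import cv2

def resize_and_pad(img, target_h=32, target_w=512):
    if img.shape[0] == 0 or img.shape[1] == 0:
        # degenerate crop: return a blank canvas instead of raising
        return np.ones((target_h, target_w, 3), dtype=np.float32)
    new_w = min(target_w, max(1, int(target_h * img.shape[1] / img.shape[0])))
    resized = cv2.resize(img, (new_w, target_h))          # dsize is (width, height)
    canvas = np.full((target_h, target_w, 3), 255, dtype=np.float32)
    canvas[:, :new_w, :] = resized
    return canvas / 255.0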
return img_fin def get_deskewed_contour_and_bb_and_image(contour, image, deskew_angle): @@ -188,7 +191,10 @@ def rotate_image_with_padding(image, angle, border_value=(0,0,0)): rotation_matrix[1, 2] += (new_h / 2) - center[1] # Perform the rotation - rotated_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h), borderValue=border_value) + try: + rotated_image = cv2.warpAffine(image, rotation_matrix, (new_w, new_h), borderValue=border_value) + except: + rotated_image = np.copy(image) return rotated_image From e0f4a007e45255fc870f0ca12ad5c2870ea00ef1 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 16 Jul 2025 14:00:12 +0200 Subject: [PATCH 31/40] ocr model renamed - image text font for ocr result is now using Charis-7.000 font (downloaded from here https://software.sil.org/charis/download/) --- src/eynollah/eynollah.py | 148 +++++++++++++++++++++------------------ 1 file changed, 78 insertions(+), 70 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 1260a96..bf11dec 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -318,7 +318,7 @@ class Eynollah: if self.ocr and self.tr: self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" elif self.ocr and not self.tr: - self.model_ocr_dir = dir_models + "/model_step_750000_ocr"#"/model_step_125000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250716" if self.tables: if self.light_version: self.model_table_dir = dir_models + "/modelens_table_0t4_201124" @@ -5129,7 +5129,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_step_900000_ocr"#"/model_step_25000_ocr"#"/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250716" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5276,7 +5276,7 @@ class Eynollah_ocr: if self.draw_texts_on_image: - font_path = "NotoSans-Regular.ttf" # Make sure this file exists! + font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists! 
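When the recognised text is drawn onto the preview image, each string has to fit the width of its text line's bounding box. A plausible way to do that with Pillow is sketched below with assumed sizes; the project's own fitting helper may differ, and the Charis font file referenced in this patch must be present on disk.

from PIL import Image, ImageDraw, ImageFont

def fit_font(draw, text, font_path, max_width, start_size=40, min_size=8):
    size = start_size
    while size > min_size:
        font = ImageFont.truetype(font_path, size)
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        if right - left <= max_width:
            return font
        size -= 2                      # shrink until the rendered width fits the box
    return ImageFont.truetype(font_path, min_size)

canvas = Image.new("RGB", (800, 100), "white")
draw = ImageDraw.Draw(canvas)
font = fit_font(draw, "recognised line", "Charis-7.000/Charis-Regular.ttf", 600)
draw.text((10, 10), "recognised line", fill="black", font=font)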
font = ImageFont.truetype(font_path, 40) for indexer_text, bb_ind in enumerate(total_bb_coordinates): @@ -5340,8 +5340,8 @@ class Eynollah_ocr: tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) #print("Job done in %.1fs", time.time() - t0) else: - max_len = 512 - padding_token = 299 + max_len = 512#280#512 + padding_token = 299#1500#299 image_width = 512#max_len * 4 image_height = 32 @@ -5435,52 +5435,57 @@ class Eynollah_ocr: mask_poly = mask_poly[y:y+h, x:x+w, :] img_crop = img_poly_on_img[y:y+h, x:x+w, :] - - #print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') - if not self.do_not_mask_with_textline_contour: - if angle_degrees > 3: - better_des_slope = get_orientation_moments(textline_coords) - - img_crop = rotate_image_with_padding(img_crop, better_des_slope ) - - if self.prediction_with_both_of_rgb_and_bin: - img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope ) - - mask_poly = rotate_image_with_padding(mask_poly, better_des_slope ) - mask_poly = mask_poly.astype('uint8') - - #new bounding box - x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0]) - - mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] - img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] - + + if self.export_textline_images_and_text: + if not self.do_not_mask_with_textline_contour: img_crop[mask_poly==0] = 255 - - if self.prediction_with_both_of_rgb_and_bin: - img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :] - img_crop_bin[mask_poly==0] = 255 - - if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90: + + else: + #print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') + if not self.do_not_mask_with_textline_contour: + if angle_degrees > 3: + better_des_slope = get_orientation_moments(textline_coords) + + img_crop = rotate_image_with_padding(img_crop, better_des_slope ) + if self.prediction_with_both_of_rgb_and_bin: - img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) - else: - img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) - + img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope ) + + mask_poly = rotate_image_with_padding(mask_poly, better_des_slope ) + mask_poly = mask_poly.astype('uint8') - else: - better_des_slope = 0 - img_crop[mask_poly==0] = 255 - if self.prediction_with_both_of_rgb_and_bin: - img_crop_bin[mask_poly==0] = 255 - if type_textregion=='drop-capital': - pass - else: - if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90: + #new bounding box + x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0]) + + mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] + + img_crop[mask_poly==0] = 255 + + if self.prediction_with_both_of_rgb_and_bin: + img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_crop_bin[mask_poly==0] = 255 + + if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90: if self.prediction_with_both_of_rgb_and_bin: img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) else: img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + + + else: + better_des_slope = 0 + img_crop[mask_poly==0] = 255 + if self.prediction_with_both_of_rgb_and_bin: + img_crop_bin[mask_poly==0] = 255 + if 
type_textregion=='drop-capital': + pass + else: + if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90: + if self.prediction_with_both_of_rgb_and_bin: + img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) + else: + img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) if not self.export_textline_images_and_text: if w_scaled < 750:#1.5*image_width: @@ -5541,35 +5546,38 @@ class Eynollah_ocr: cropped_lines_bin.append(img_fin) if self.export_textline_images_and_text: - if child_textlines.tag.endswith("TextEquiv"): - for cheild_text in child_textlines: - if cheild_text.tag.endswith("Unicode"): - textline_text = cheild_text.text - if textline_text: - if self.do_not_mask_with_textline_contour: - if self.pref_of_dataset: - with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.txt'), 'w') as text_file: - text_file.write(textline_text) + if img_crop.shape[0]==0 or img_crop.shape[1]==0: + pass + else: + if child_textlines.tag.endswith("TextEquiv"): + for cheild_text in child_textlines: + if cheild_text.tag.endswith("Unicode"): + textline_text = cheild_text.text + if textline_text: + if self.do_not_mask_with_textline_contour: + if self.pref_of_dataset: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.txt'), 'w') as text_file: + text_file.write(textline_text) - cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.png'), img_crop ) + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.png'), img_crop ) + else: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file: + text_file.write(textline_text) + + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop ) else: - with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file: - text_file.write(textline_text) + if self.pref_of_dataset: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.txt'), 'w') as text_file: + text_file.write(textline_text) - cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop ) - else: - if self.pref_of_dataset: - with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.txt'), 'w') as text_file: - text_file.write(textline_text) + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.png'), img_crop ) + else: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.txt'), 'w') as text_file: + text_file.write(textline_text) - cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.png'), img_crop ) - else: - with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.txt'), 'w') as text_file: - text_file.write(textline_text) - - cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.png'), img_crop ) - - indexer_textlines+=1 + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.png'), img_crop ) + + indexer_textlines+=1 if not self.export_textline_images_and_text: 
indexer_text_region = indexer_text_region +1 @@ -5727,7 +5735,7 @@ class Eynollah_ocr: if self.draw_texts_on_image: - font_path = "NotoSans-Regular.ttf" # Make sure this file exists! + font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists! font = ImageFont.truetype(font_path, 40) for indexer_text, bb_ind in enumerate(total_bb_coordinates): From 920705c3b1a70ee5f18f6731b92b5a775b6a2fa0 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 21 Jul 2025 10:54:20 +0200 Subject: [PATCH 32/40] update model names --- src/eynollah/eynollah.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index bf11dec..12acff7 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5129,7 +5129,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250716" + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250716"#"/model_ens_ocrcnn_new6"#"/model_ens_ocrcnn_new2"# model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5143,7 +5143,6 @@ class Eynollah_ocr: with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: characters = json.load(config_file) - AUTOTUNE = tf.data.AUTOTUNE @@ -5154,6 +5153,7 @@ class Eynollah_ocr: self.num_to_char = StringLookup( vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True ) + self.end_character = len(characters) + 2 def run(self, overwrite : bool = False): if self.dir_in: @@ -5340,8 +5340,8 @@ class Eynollah_ocr: tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) #print("Job done in %.1fs", time.time() - t0) else: - max_len = 512#280#512 - padding_token = 299#1500#299 + ###max_len = 280#512#280#512 + ###padding_token = 1500#299#1500#299 image_width = 512#max_len * 4 image_height = 32 @@ -5656,13 +5656,13 @@ class Eynollah_ocr: preds_flipped = self.prediction_model.predict(imgs_ver_flipped, verbose=0) preds_max_fliped = np.max(preds_flipped, axis=2 ) preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) - pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=256 + pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) masked_means_flipped[np.isnan(masked_means_flipped)] = 0 preds_max = np.max(preds, axis=2 ) preds_max_args = np.argmax(preds, axis=2 ) - pred_max_not_unk_mask_bool = preds_max_args[:,:]!=256 + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) masked_means[np.isnan(masked_means)] = 0 @@ -5683,13 +5683,13 @@ class Eynollah_ocr: preds_flipped = self.prediction_model.predict(imgs_bin_ver_flipped, verbose=0) preds_max_fliped = np.max(preds_flipped, axis=2 ) preds_max_args_flipped = np.argmax(preds_flipped, axis=2 ) - pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=256 + pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character masked_means_flipped = np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / np.sum(pred_max_not_unk_mask_bool_flipped, axis=1) masked_means_flipped[np.isnan(masked_means_flipped)] = 0 preds_max = np.max(preds, 
axis=2 ) preds_max_args = np.argmax(preds, axis=2 ) - pred_max_not_unk_mask_bool = preds_max_args[:,:]!=256 + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) masked_means[np.isnan(masked_means)] = 0 @@ -5711,7 +5711,7 @@ class Eynollah_ocr: preds_max = np.max(preds, axis=2 ) preds_max_args = np.argmax(preds, axis=2 ) - pred_max_not_unk_mask_bool = preds_max_args[:,:]!=256 + pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character masked_means = np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / np.sum(pred_max_not_unk_mask_bool, axis=1) for ib in range(imgs.shape[0]): From d968a306e4f55ee9be01baf8c88c4abd47cd0ef5 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 21 Jul 2025 14:50:05 +0200 Subject: [PATCH 33/40] should merged text for the whole page be written in xml? --- src/eynollah/eynollah.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 12acff7..bdb8f1a 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5129,7 +5129,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250716"#"/model_ens_ocrcnn_new6"#"/model_ens_ocrcnn_new2"# + self.model_ocr_dir = dir_models + "/model_ens_ocrcnn_new6"#"/model_eynollah_ocr_cnnrnn_20250716"#"/model_ens_ocrcnn_new6"#"/model_ens_ocrcnn_new2"# model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5141,7 +5141,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) - with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: + with open(os.path.join(self.model_ocr_dir, "characters_20250707_all_lang.txt"),"r") as config_file: characters = json.load(config_file) AUTOTUNE = tf.data.AUTOTUNE @@ -5780,9 +5780,24 @@ class Eynollah_ocr: text_by_textregion.append(" ".join(extracted_texts_merged_un)) #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion') + + ###index_tot_regions = [] + ###tot_region_ref = [] + + ###for jj in root1.iter(link+'RegionRefIndexed'): + ###index_tot_regions.append(jj.attrib['index']) + ###tot_region_ref.append(jj.attrib['regionRef']) + + ###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)} + + id_textregions = [] + textregions_by_existing_ids = [] indexer = 0 indexer_textregion = 0 for nn in root1.iter(region_tags): + id_textregion = nn.attrib['id'] + id_textregions.append(id_textregion) + textregions_by_existing_ids.append(text_by_textregion[indexer_textregion]) is_textregion_text = False for childtest in nn: @@ -5829,7 +5844,17 @@ class Eynollah_ocr: else: unicode_textregion.text = text_by_textregion[indexer_textregion] indexer_textregion = indexer_textregion + 1 - + + ###sample_order = [(id_to_order[tid], text) for tid, text in zip(id_textregions, textregions_by_existing_ids) if tid in id_to_order] + + ##ordered_texts_sample = [text for _, text in sorted(sample_order)] + ##tot_page_text = ' '.join(ordered_texts_sample) + + ##for page_element in root1.iter(link+'Page'): + ##text_page = ET.SubElement(page_element, 'TextEquiv') + ##unicode_textpage = ET.SubElement(text_page, 'Unicode') + ##unicode_textpage.text = tot_page_text + ET.register_namespace("",name_space) 
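The OCR result is written back into the PAGE-XML by attaching a TextEquiv element, now carrying a conf attribute, with a Unicode child to each text line and region. A small self-contained sketch of that XML shape with ElementTree, leaving out the namespace handling the surrounding code performs:

import xml.etree.ElementTree as ET

def add_text_equiv(textline_elem, text, conf):
    text_equiv = ET.SubElement(textline_elem, 'TextEquiv')
    text_equiv.set('conf', f"{conf:.2f}")
    unicode_el = ET.SubElement(text_equiv, 'Unicode')
    unicode_el.text = text
    return text_equiv

line = ET.Element('TextLine', {'id': 'l1'})
add_text_equiv(line, "example text", 0.87)
print(ET.tostring(line, encoding='unicode'))
# <TextLine id="l1"><TextEquiv conf="0.87"><Unicode>example text</Unicode></TextEquiv></TextLine>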
tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) #print("Job done in %.1fs", time.time() - t0) From 0803881f3675a38558145fc81e40f9a9802f59fb Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 25 Jul 2025 13:18:38 +0200 Subject: [PATCH 34/40] threshold for textline ocr + new ocr model --- src/eynollah/cli.py | 8 ++- src/eynollah/eynollah.py | 117 +++++++++++++++++++++++---------------- 2 files changed, 76 insertions(+), 49 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 9398c47..a313860 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -496,6 +496,11 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ "-ds_pref", help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset", ) +@click.option( + "--min_conf_value_of_textline_text", + "-min_conf", + help="minimum OCR confidence value. Text lines with a confidence value lower than this threshold will not be included in the output XML file.", +) @click.option( "--log_level", "-l", @@ -503,7 +508,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="Override log level globally to this", ) -def ocr(image, overwrite, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, log_level): +def ocr(image, overwrite, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, min_conf_value_of_textline_text, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) @@ -530,6 +535,7 @@ def ocr(image, overwrite, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, prediction_with_both_of_rgb_and_bin=prediction_with_both_of_rgb_and_bin, batch_size=batch_size, pref_of_dataset=dataset_abbrevation, + min_conf_value_of_textline_text=min_conf_value_of_textline_text, ) eynollah_ocr.run(overwrite=overwrite) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index bdb8f1a..aa1b2e1 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -318,7 +318,7 @@ class Eynollah: if self.ocr and self.tr: self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" elif self.ocr and not self.tr: - self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250716" + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250725" if self.tables: if self.light_version: self.model_table_dir = dir_models + "/modelens_table_0t4_201124" @@ -4974,13 +4974,23 @@ class Eynollah: gc.collect() if len(all_found_textline_polygons)>0: ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines = None + if all_found_textline_polygons_marginals and len(all_found_textline_polygons_marginals)>0: ocr_all_textlines_marginals = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_marginals = None if 
all_found_textline_polygons_h and len(all_found_textline_polygons)>0: ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_h, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_h = None + if polygons_of_drop_capitals and len(polygons_of_drop_capitals)>0: ocr_all_textlines_drop = return_rnn_cnn_ocr_of_given_textlines(image_page, polygons_of_drop_capitals, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_drop = None else: ocr_all_textlines = None ocr_all_textlines_marginals = None @@ -5098,7 +5108,8 @@ class Eynollah_ocr: do_not_mask_with_textline_contour=False, draw_texts_on_image=False, prediction_with_both_of_rgb_and_bin=False, - pref_of_dataset = None, + pref_of_dataset=None, + min_conf_value_of_textline_text : Optional[float]=None, logger=None, ): self.dir_in = dir_in @@ -5117,6 +5128,10 @@ class Eynollah_ocr: self.logger = logger if logger else getLogger('eynollah') if not export_textline_images_and_text: + if min_conf_value_of_textline_text: + self.min_conf_value_of_textline_text = float(min_conf_value_of_textline_text) + else: + self.min_conf_value_of_textline_text = 0.3 if tr_ocr: self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -5129,7 +5144,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_ens_ocrcnn_new6"#"/model_eynollah_ocr_cnnrnn_20250716"#"/model_ens_ocrcnn_new6"#"/model_ens_ocrcnn_new2"# + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250725"#"/model_step_1020000_ocr"#"/model_ens_ocrcnn_new10"#"/model_step_255000_ocr"#"/model_ens_ocrcnn_new9"#"/model_step_900000_ocr"#"/model_eynollah_ocr_cnnrnn_20250716"#"/model_ens_ocrcnn_new6"#"/model_ens_ocrcnn_new2"# model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5139,9 +5154,8 @@ class Eynollah_ocr: self.b_s = 8 else: self.b_s = int(batch_size) - - with open(os.path.join(self.model_ocr_dir, "characters_20250707_all_lang.txt"),"r") as config_file: + with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: characters = json.load(config_file) AUTOTUNE = tf.data.AUTOTUNE @@ -5442,50 +5456,54 @@ class Eynollah_ocr: else: #print(file_name, angle_degrees,w*h , mask_poly[:,:,0].sum(), mask_poly[:,:,0].sum() /float(w*h) , 'didi') - if not self.do_not_mask_with_textline_contour: - if angle_degrees > 3: - better_des_slope = get_orientation_moments(textline_coords) + + if angle_degrees > 3: + better_des_slope = get_orientation_moments(textline_coords) + + img_crop = rotate_image_with_padding(img_crop, better_des_slope ) + + if self.prediction_with_both_of_rgb_and_bin: + img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope ) - img_crop = rotate_image_with_padding(img_crop, better_des_slope ) + mask_poly = rotate_image_with_padding(mask_poly, better_des_slope ) + mask_poly = mask_poly.astype('uint8') + + #new bounding box + x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0]) + + mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] + img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] - if self.prediction_with_both_of_rgb_and_bin: - img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope ) - - mask_poly = rotate_image_with_padding(mask_poly, 
better_des_slope ) - mask_poly = mask_poly.astype('uint8') - - #new bounding box - x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0]) - - mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :] - img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :] - + if not self.do_not_mask_with_textline_contour: img_crop[mask_poly==0] = 255 - - if self.prediction_with_both_of_rgb_and_bin: - img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :] + + if self.prediction_with_both_of_rgb_and_bin: + img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :] + if not self.do_not_mask_with_textline_contour: img_crop_bin[mask_poly==0] = 255 + + if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90: + if self.prediction_with_both_of_rgb_and_bin: + img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) + else: + img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) + - if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90: + else: + better_des_slope = 0 + if not self.do_not_mask_with_textline_contour: + img_crop[mask_poly==0] = 255 + if self.prediction_with_both_of_rgb_and_bin: + if not self.do_not_mask_with_textline_contour: + img_crop_bin[mask_poly==0] = 255 + if type_textregion=='drop-capital': + pass + else: + if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90: if self.prediction_with_both_of_rgb_and_bin: img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) else: img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) - - - else: - better_des_slope = 0 - img_crop[mask_poly==0] = 255 - if self.prediction_with_both_of_rgb_and_bin: - img_crop_bin[mask_poly==0] = 255 - if type_textregion=='drop-capital': - pass - else: - if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90: - if self.prediction_with_both_of_rgb_and_bin: - img_crop, img_crop_bin = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly, img_crop_bin) - else: - img_crop, _ = break_curved_line_into_small_pieces_and_then_merge(img_crop, mask_poly) if not self.export_textline_images_and_text: if w_scaled < 750:#1.5*image_width: @@ -5716,9 +5734,12 @@ class Eynollah_ocr: for ib in range(imgs.shape[0]): pred_texts_ib = pred_texts[ib].replace("[UNK]", "") - extracted_texts.append(pred_texts_ib) - extracted_conf_value.append(masked_means[ib]) - + if masked_means[ib] >= self.min_conf_value_of_textline_text: + extracted_texts.append(pred_texts_ib) + extracted_conf_value.append(masked_means[ib]) + else: + extracted_texts.append("") + extracted_conf_value.append(0) del cropped_lines if self.prediction_with_both_of_rgb_and_bin: del cropped_lines_bin @@ -5790,14 +5811,14 @@ class Eynollah_ocr: ###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)} - id_textregions = [] - textregions_by_existing_ids = [] + #id_textregions = [] + #textregions_by_existing_ids = [] indexer = 0 indexer_textregion = 0 for nn in root1.iter(region_tags): - id_textregion = nn.attrib['id'] - id_textregions.append(id_textregion) - textregions_by_existing_ids.append(text_by_textregion[indexer_textregion]) + #id_textregion = nn.attrib['id'] + #id_textregions.append(id_textregion) + #textregions_by_existing_ids.append(text_by_textregion[indexer_textregion]) is_textregion_text = False for childtest in nn: From a0c19c57bea82af2db65421d46cdd8c740b65455 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 5 Aug 2025 
14:22:22 +0200 Subject: [PATCH 35/40] use the latest ocr model with balanced fraktur-antiqua training dataset --- src/eynollah/cli.py | 4 ++-- src/eynollah/eynollah.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index a313860..5135534 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -325,12 +325,12 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low @click.option( "--threshold_art_class_layout", "-tharl", - help="threshold of artifical class in the case of layout detection", + help="threshold of artifical class in the case of layout detection. The default value is 0.1", ) @click.option( "--threshold_art_class_textline", "-thart", - help="threshold of artifical class in the case of textline detection", + help="threshold of artifical class in the case of textline detection. The default value is 0.1", ) @click.option( "--skip_layout_and_reading_order", diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index aa1b2e1..9e5ba51 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -318,7 +318,7 @@ class Eynollah: if self.ocr and self.tr: self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" elif self.ocr and not self.tr: - self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250725" + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250805" if self.tables: if self.light_version: self.model_table_dir = dir_models + "/modelens_table_0t4_201124" @@ -5144,7 +5144,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250725"#"/model_step_1020000_ocr"#"/model_ens_ocrcnn_new10"#"/model_step_255000_ocr"#"/model_ens_ocrcnn_new9"#"/model_step_900000_ocr"#"/model_eynollah_ocr_cnnrnn_20250716"#"/model_ens_ocrcnn_new6"#"/model_ens_ocrcnn_new2"# + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250805" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( From 5db3e9fa64d39c128bd9bee27c9d0fb73b3459d2 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 8 Aug 2025 11:32:02 +0200 Subject: [PATCH 36/40] deskewing with faster multiprocessing --- src/eynollah/eynollah.py | 9 +-- src/eynollah/utils/separate_lines.py | 103 +++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 4 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 9e5ba51..5299d3e 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -96,6 +96,7 @@ from .utils.separate_lines import ( textline_contours_postprocessing, separate_lines_new2, return_deskew_slop, + return_deskew_slop_old_mp, do_work_of_slopes_new, do_work_of_slopes_new_curved, do_work_of_slopes_new_light, @@ -1936,8 +1937,8 @@ class Eynollah: y_diff_mean = find_contours_mean_y_diff(textline_con_fil) sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) crop_img[crop_img > 0] = 1 - slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, - map=self.executor.map, logger=self.logger, plotter=self.plotter) + slope_corresponding_textregion = return_deskew_slop_old_mp(crop_img, sigma_des, + logger=self.logger, plotter=self.plotter) except Exception as why: self.logger.error(why) slope_corresponding_textregion = MAX_SLOPE @@ -3203,8 +3204,8 @@ class Eynollah: def run_deskew(self, textline_mask_tot_ea): #print(textline_mask_tot_ea.shape, 'textline_mask_tot_ea deskew') - slope_deskew = 
return_deskew_slop(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), 2, 30, True, - map=self.executor.map, logger=self.logger, plotter=self.plotter) + slope_deskew = return_deskew_slop_old_mp(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), 2, 30, True, + logger=self.logger, plotter=self.plotter) slope_first = 0 if self.plotter: diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 6289d4d..ead5cfb 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -5,6 +5,8 @@ import numpy as np import cv2 from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d +from multiprocessing import Process, Queue, cpu_count +from multiprocessing import Pool from .rotate import rotate_image from .resize import resize_image from .contour import ( @@ -1526,6 +1528,107 @@ def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map angle = 0 return angle + +def return_deskew_slop_old_mp(img_patch_org, sigma_des,n_tot_angles=100, + main_page=False, logger=None, plotter=None): + if main_page and plotter: + plotter.save_plot_of_textline_density(img_patch_org) + + img_int=np.zeros((img_patch_org.shape[0],img_patch_org.shape[1])) + img_int[:,:]=img_patch_org[:,:]#img_patch_org[:,:,0] + + max_shape=np.max(img_int.shape) + img_resized=np.zeros((int( max_shape*(1.1) ) , int( max_shape*(1.1) ) )) + + onset_x=int((img_resized.shape[1]-img_int.shape[1])/2.) + onset_y=int((img_resized.shape[0]-img_int.shape[0])/2.) + + img_resized[ onset_y:onset_y+img_int.shape[0] , onset_x:onset_x+img_int.shape[1] ]=img_int[:,:] + + if main_page and img_patch_org.shape[1] > img_patch_org.shape[0]: + angles = np.array([-45, 0, 45, 90,]) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + + angles = np.linspace(angle - 22.5, angle + 22.5, n_tot_angles) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + elif main_page: + angles = np.linspace(-12, 12, n_tot_angles)#np.array([0 , 45 , 90 , -45]) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + + early_slope_edge=11 + if abs(angle) > early_slope_edge: + if angle < 0: + angles = np.linspace(-90, -12, n_tot_angles) + else: + angles = np.linspace(90, 12, n_tot_angles) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + else: + angles = np.linspace(-25, 25, int(0.5 * n_tot_angles) + 10) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + + early_slope_edge=22 + if abs(angle) > early_slope_edge: + if angle < 0: + angles = np.linspace(-90, -25, int(0.5 * n_tot_angles) + 10) + else: + angles = np.linspace(90, 25, int(0.5 * n_tot_angles) + 10) + angle = get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=plotter) + + return angle + +def do_image_rotation_omp(queue_of_all_params,angles_per_process, img_resized, sigma_des): + vars_per_each_subprocess = [] + angles_per_each_subprocess = [] + for mv in range(len(angles_per_process)): + img_rot=rotate_image(img_resized,angles_per_process[mv]) + img_rot[img_rot!=0]=1 + try: + var_spectrum=find_num_col_deskew(img_rot,sigma_des,20.3 ) + except: + var_spectrum=0 + vars_per_each_subprocess.append(var_spectrum) + angles_per_each_subprocess.append(angles_per_process[mv]) + + queue_of_all_params.put([vars_per_each_subprocess, angles_per_each_subprocess]) + +def get_smallest_skew_omp(img_resized, sigma_des, angles, plotter=None): + num_cores = cpu_count() + + 
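    # The skew-angle search is fanned out over all CPU cores: np.linspace splits
    # the candidate angles into num_cores contiguous slices, one worker Process
    # per slice rotates the image at each of its angles and puts its (variances,
    # angles) lists on the shared Queue, and the parent joins the workers and
    # keeps the angle whose find_num_col_deskew variance score is largest.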
queue_of_all_params = Queue() + processes = [] + nh = np.linspace(0, len(angles), num_cores + 1) + + for i in range(num_cores): + angles_per_process = angles[int(nh[i]) : int(nh[i + 1])] + processes.append(Process(target=do_image_rotation_omp, args=(queue_of_all_params, angles_per_process, img_resized, sigma_des))) + + for i in range(num_cores): + processes[i].start() + + var_res=[] + all_angles = [] + for i in range(num_cores): + list_all_par = queue_of_all_params.get(True) + vars_for_subprocess = list_all_par[0] + angles_sub_process = list_all_par[1] + for j in range(len(vars_for_subprocess)): + var_res.append(vars_for_subprocess[j]) + all_angles.append(angles_sub_process[j]) + + for i in range(num_cores): + processes[i].join() + + if plotter: + plotter.save_plot_of_rotation_angle(all_angles, var_res) + + + try: + var_res=np.array(var_res) + ang_int=all_angles[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] + except: + ang_int=0 + return ang_int + def do_work_of_slopes_new( box_text, contour, contour_par, index_r_con, textline_mask_tot_ea, image_page_rotated, slope_deskew, From 20614d1678fa7c586299680f017e5b7d8c12521c Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 12 Aug 2025 12:50:15 +0200 Subject: [PATCH 37/40] avoiding float in range --- src/eynollah/utils/__init__.py | 41 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 7fa4a7b..ca86047 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1801,8 +1801,8 @@ def return_boxes_of_images_by_order_of_reading_new( #print(y_type_2_up,x_starting_up,x_ending_up,'didid') nodes_in = [] for ij in range(len(x_starting_up)): - nodes_in = nodes_in + list(range(x_starting_up[ij], - x_ending_up[ij])) + nodes_in = nodes_in + list(range(int(x_starting_up[ij]), + int(x_ending_up[ij]))) nodes_in = np.unique(nodes_in) #print(nodes_in,'nodes_in') @@ -1825,8 +1825,8 @@ def return_boxes_of_images_by_order_of_reading_new( elif len(y_diff_main_separator_up)==0: nodes_in = [] for ij in range(len(x_starting_up)): - nodes_in = nodes_in + list(range(x_starting_up[ij], - x_ending_up[ij])) + nodes_in = nodes_in + list(range(int(x_starting_up[ij]), + int(x_ending_up[ij]))) nodes_in = np.unique(nodes_in) #print(nodes_in,'nodes_in2') #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') @@ -1866,8 +1866,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_mothers = [] for dj in range(len(x_start_without_mother)): columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_start_without_mother[dj], - x_end_without_mother[dj])) + list(range(int(x_start_without_mother[dj]), + int(x_end_without_mother[dj]))) columns_covered_by_mothers = list(set(columns_covered_by_mothers)) all_columns=np.arange(len(peaks_neg_tot)-1) @@ -1909,8 +1909,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_mothers = [] for dj in range(len(x_start_without_mother)): columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_start_without_mother[dj], - x_end_without_mother[dj])) + list(range(int(x_start_without_mother[dj]), + int(x_end_without_mother[dj]))) columns_covered_by_mothers = list(set(columns_covered_by_mothers)) all_columns=np.arange(len(peaks_neg_tot)-1) @@ -1926,8 +1926,8 @@ def return_boxes_of_images_by_order_of_reading_new( 
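# The int() casts introduced throughout this file exist because these column
# bounds are taken from NumPy arrays and can arrive as floats, while Python's
# range() only accepts integers (a float argument raises a TypeError).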
columns_covered_by_with_child_no_mothers = [] for dj in range(len(x_end_with_child_without_mother)): columns_covered_by_with_child_no_mothers = columns_covered_by_with_child_no_mothers + \ - list(range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) + list(range(int(x_start_with_child_without_mother[dj]), + int(x_end_with_child_without_mother[dj]))) columns_covered_by_with_child_no_mothers = list(set(columns_covered_by_with_child_no_mothers)) all_columns = np.arange(len(peaks_neg_tot)-1) @@ -1970,8 +1970,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_covered_by_mothers = [] for dj in range(len(x_starting_all_between_nm_wc)): columns_covered_by_mothers = columns_covered_by_mothers + \ - list(range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) + list(range(int(x_starting_all_between_nm_wc[dj]), + int(x_ending_all_between_nm_wc[dj]))) columns_covered_by_mothers = list(set(columns_covered_by_mothers)) all_columns=np.arange(i_s_nc, x_end_biggest_column) @@ -1979,8 +1979,8 @@ def return_boxes_of_images_by_order_of_reading_new( should_longest_line_be_extended=0 if (len(x_diff_all_between_nm_wc) > 0 and - set(list(range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])) + + set(list(range(int(x_starting_all_between_nm_wc[biggest]), + int(x_ending_all_between_nm_wc[biggest]))) + list(columns_not_covered)) != set(all_columns)): should_longest_line_be_extended=1 index_lines_so_close_to_top_separator = \ @@ -2012,7 +2012,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered) + 1) ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(i_s_nc, x_end_biggest_column): + for column in range(int(i_s_nc), int(x_end_biggest_column)): ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') @@ -2064,7 +2064,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_end_itself=x_end_copy.pop(il) #print(y_copy,'y_copy2') - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') y_in_cols=[] for yic in range(len(y_copy)): @@ -2095,11 +2095,11 @@ def return_boxes_of_images_by_order_of_reading_new( all_columns = np.arange(len(peaks_neg_tot)-1) columns_covered_by_lines_covered_more_than_2col = [] for dj in range(len(x_starting)): - if set(list(range(x_starting[dj],x_ending[dj]))) == set(all_columns): + if set(list(range(int(x_starting[dj]),int(x_ending[dj]) ))) == set(all_columns): pass else: columns_covered_by_lines_covered_more_than_2col = columns_covered_by_lines_covered_more_than_2col + \ - list(range(x_starting[dj],x_ending[dj])) + list(range(int(x_starting[dj]),int(x_ending[dj]) )) columns_covered_by_lines_covered_more_than_2col = list(set(columns_covered_by_lines_covered_more_than_2col)) columns_not_covered = list(set(all_columns) - set(columns_covered_by_lines_covered_more_than_2col)) @@ -2124,7 +2124,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) ind_args=np.array(range(len(y_type_2))) - #ind_args=np.array(ind_args) + for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] @@ -2155,8 +2155,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_start_itself=x_start_copy.pop(il) 
x_end_itself=x_end_copy.pop(il) - #print(y_copy,'y_copy2') - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') y_in_cols=[] for yic in range(len(y_copy)): From 8dc2fab9faf70c4ed92ab07f5a5b3d763a14d994 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 18 Aug 2025 02:31:13 +0200 Subject: [PATCH 38/40] reading order on given layout --- src/eynollah/cli.py | 48 +- src/eynollah/mb_ro_on_layout.py | 1134 +++++++++++++++++++++++++++++++ 2 files changed, 1158 insertions(+), 24 deletions(-) create mode 100644 src/eynollah/mb_ro_on_layout.py diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 5135534..67fd57e 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -4,6 +4,7 @@ from ocrd_utils import initLogging, getLevelName, getLogger from eynollah.eynollah import Eynollah, Eynollah_ocr from eynollah.sbb_binarize import SbbBinarizer from eynollah.image_enhancer import Enhancer +from eynollah.mb_ro_on_layout import machine_based_reading_order_on_layout @click.group() def main(): @@ -13,38 +14,37 @@ def main(): @click.option( "--dir_xml", "-dx", - help="directory of GT page-xml files", + help="directory of page-xml files", type=click.Path(exists=True, file_okay=False), ) @click.option( - "--dir_out_modal_image", - "-domi", - help="directory where ground truth images would be written", + "--xml_file", + "-xml", + help="xml filename", + type=click.Path(exists=True, dir_okay=False), +) +@click.option( + "--dir_out", + "-do", + help="directory for output images", type=click.Path(exists=True, file_okay=False), ) @click.option( - "--dir_out_classes", - "-docl", - help="directory where ground truth classes would be written", + "--model", + "-m", + help="directory of models", type=click.Path(exists=True, file_okay=False), + required=True, ) -@click.option( - "--input_height", - "-ih", - help="input height", -) -@click.option( - "--input_width", - "-iw", - help="input width", -) -@click.option( - "--min_area_size", - "-min", - help="min area size of regions considered for reading order training.", -) -def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width, min_area_size): - xml_files_ind = os.listdir(dir_xml) + +def machine_based_reading_order(dir_xml, xml_file, dir_out, model): + raedingorder_object = machine_based_reading_order_on_layout(model, dir_out=dir_out, logger=getLogger('enhancement')) + + if dir_xml: + raedingorder_object.run(dir_in=dir_xml) + else: + raedingorder_object.run(xml_filename=xml_file) + @main.command() @click.option('--patches/--no-patches', default=True, help='by enabling this parameter you let the model to see the image in patches.') diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py new file mode 100644 index 0000000..7625a90 --- /dev/null +++ b/src/eynollah/mb_ro_on_layout.py @@ -0,0 +1,1134 @@ +""" +Image enhancer. The output can be written as same scale of input or in new predicted scale. 
+""" + +from logging import Logger +from difflib import SequenceMatcher as sq +from PIL import Image, ImageDraw, ImageFont +import math +import os +import sys +import time +from typing import Optional +import atexit +import warnings +from functools import partial +from pathlib import Path +from multiprocessing import cpu_count +import gc +import copy +from loky import ProcessPoolExecutor +import xml.etree.ElementTree as ET +import cv2 +import numpy as np +from ocrd import OcrdPage +from ocrd_utils import getLogger, tf_disable_interactive_logs +import statistics +from tensorflow.keras.models import load_model +from .utils.resize import resize_image +from .utils import ( + crop_image_inside_box +) + +from .utils.contour import ( + filter_contours_area_of_image, + filter_contours_area_of_image_tables, + find_contours_mean_y_diff, + find_new_features_of_contours, + find_features_of_contours, + get_text_region_boxes_by_given_contours, + get_textregion_contours_in_org_image, + get_textregion_contours_in_org_image_light, + return_contours_of_image, + return_contours_of_interested_region, + return_contours_of_interested_region_by_min_size, + return_contours_of_interested_textline, + return_parent_contours, +) + +DPI_THRESHOLD = 298 +KERNEL = np.ones((5, 5), np.uint8) + + +class machine_based_reading_order_on_layout: + def __init__( + self, + dir_models : str, + dir_out : Optional[str] = None, + logger : Optional[Logger] = None, + ): + self.dir_out = dir_out + + self.logger = logger if logger else getLogger('mbro on layout') + # for parallelization of CPU-intensive tasks: + self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) + atexit.register(self.executor.shutdown) + self.dir_models = dir_models + self.model_reading_order_dir = dir_models + "/model_step_5100000_mb_ro"#"/model_ens_reading_order_machine_based" + + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + self.logger.warning("no GPU device available") + + self.model_reading_order = self.our_load_model(self.model_reading_order_dir) + self.light_version = True + + + def cache_images(self, image_filename=None, image_pil=None, dpi=None): + ret = {} + t_c0 = time.time() + if image_filename: + ret['img'] = cv2.imread(image_filename) + if self.light_version: + self.dpi = 100 + else: + self.dpi = 0#check_dpi(image_filename) + else: + ret['img'] = pil2cv(image_pil) + if self.light_version: + self.dpi = 100 + else: + self.dpi = 0#check_dpi(image_pil) + ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY) + for prefix in ('', '_grayscale'): + ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8) + self._imgs = ret + if dpi is not None: + self.dpi = dpi + + def reset_file_name_dir(self, image_filename): + t_c = time.time() + self.cache_images(image_filename=image_filename) + self.output_filename = os.path.join(self.dir_out, Path(image_filename).stem +'.png') + + def imread(self, grayscale=False, uint8=True): + key = 'img' + if grayscale: + key += '_grayscale' + if uint8: + key += '_uint8' + return self._imgs[key].copy() + + def isNaN(self, num): + return num != num + + @staticmethod + def our_load_model(model_file): + if model_file.endswith('.h5') and Path(model_file[:-3]).exists(): + # prefer SavedModel over HDF5 format if it exists + model_file = model_file[:-3] + try: + model = load_model(model_file, compile=False) + except: + model = load_model(model_file, compile=False, custom_objects={ + "PatchEncoder": PatchEncoder, 
"Patches": Patches}) + return model + + def predict_enhancement(self, img): + self.logger.debug("enter predict_enhancement") + + img_height_model = self.model_enhancement.layers[-1].output_shape[1] + img_width_model = self.model_enhancement.layers[-1].output_shape[2] + if img.shape[0] < img_height_model: + img = cv2.resize(img, (img.shape[1], img_width_model), interpolation=cv2.INTER_NEAREST) + if img.shape[1] < img_width_model: + img = cv2.resize(img, (img_height_model, img.shape[0]), interpolation=cv2.INTER_NEAREST) + margin = int(0.1 * img_width_model) + width_mid = img_width_model - 2 * margin + height_mid = img_height_model - 2 * margin + img = img / 255. + img_h = img.shape[0] + img_w = img.shape[1] + + prediction_true = np.zeros((img_h, img_w, 3)) + nxf = img_w / float(width_mid) + nyf = img_h / float(height_mid) + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + + for i in range(nxf): + for j in range(nyf): + if i == 0: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + else: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + if j == 0: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + else: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + + if index_x_u > img_w: + index_x_u = img_w + index_x_d = img_w - img_width_model + if index_y_u > img_h: + index_y_u = img_h + index_y_d = img_h - img_height_model + + img_patch = img[np.newaxis, index_y_d:index_y_u, index_x_d:index_x_u, :] + label_p_pred = self.model_enhancement.predict(img_patch, verbose=0) + seg = label_p_pred[0, :, :, :] * 255 + + if i == 0 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + 0:index_x_u - margin] = \ + seg[0:-margin or None, + 0:-margin or None] + elif i == nxf - 1 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + margin:index_x_u - 0] = \ + seg[margin:, + margin:] + elif i == 0 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + 0:index_x_u - margin] = \ + seg[margin:, + 0:-margin or None] + elif i == nxf - 1 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + margin:index_x_u - 0] = \ + seg[0:-margin or None, + margin:] + elif i == 0 and j != 0 and j != nyf - 1: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + 0:index_x_u - margin] = \ + seg[margin:-margin or None, + 0:-margin or None] + elif i == nxf - 1 and j != 0 and j != nyf - 1: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + margin:index_x_u - 0] = \ + seg[margin:-margin or None, + margin:] + elif i != 0 and i != nxf - 1 and j == 0: + prediction_true[index_y_d + 0:index_y_u - margin, + index_x_d + margin:index_x_u - margin] = \ + seg[0:-margin or None, + margin:-margin or None] + elif i != 0 and i != nxf - 1 and j == nyf - 1: + prediction_true[index_y_d + margin:index_y_u - 0, + index_x_d + margin:index_x_u - margin] = \ + seg[margin:, + margin:-margin or None] + else: + prediction_true[index_y_d + margin:index_y_u - margin, + index_x_d + margin:index_x_u - margin] = \ + seg[margin:-margin or None, + margin:-margin or None] + + prediction_true = prediction_true.astype(int) + return prediction_true + + def calculate_width_height_by_columns(self, img, num_col, width_early, label_p_pred): + self.logger.debug("enter calculate_width_height_by_columns") + if num_col == 1: + img_w_new = 2000 + elif num_col == 2: + img_w_new = 2400 + elif num_col == 3: 
+ img_w_new = 3000 + elif num_col == 4: + img_w_new = 4000 + elif num_col == 5: + img_w_new = 5000 + elif num_col == 6: + img_w_new = 6500 + else: + img_w_new = width_early + img_h_new = img_w_new * img.shape[0] // img.shape[1] + + if img_h_new >= 8000: + img_new = np.copy(img) + num_column_is_classified = False + else: + img_new = resize_image(img, img_h_new, img_w_new) + num_column_is_classified = True + + return img_new, num_column_is_classified + + def early_page_for_num_of_column_classification(self,img_bin): + self.logger.debug("enter early_page_for_num_of_column_classification") + if self.input_binary: + img = np.copy(img_bin).astype(np.uint8) + else: + img = self.imread() + img = cv2.GaussianBlur(img, (5, 5), 0) + img_page_prediction = self.do_prediction(False, img, self.model_page) + + imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) + thresh = cv2.dilate(thresh, KERNEL, iterations=3) + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + if len(contours)>0: + cnt_size = np.array([cv2.contourArea(contours[j]) + for j in range(len(contours))]) + cnt = contours[np.argmax(cnt_size)] + box = cv2.boundingRect(cnt) + else: + box = [0, 0, img.shape[1], img.shape[0]] + cropped_page, page_coord = crop_image_inside_box(box, img) + + self.logger.debug("exit early_page_for_num_of_column_classification") + return cropped_page, page_coord + + def calculate_width_height_by_columns_1_2(self, img, num_col, width_early, label_p_pred): + self.logger.debug("enter calculate_width_height_by_columns") + if num_col == 1: + img_w_new = 1000 + else: + img_w_new = 1300 + img_h_new = img_w_new * img.shape[0] // img.shape[1] + + if label_p_pred[0][int(num_col - 1)] < 0.9 and img_w_new < width_early: + img_new = np.copy(img) + num_column_is_classified = False + #elif label_p_pred[0][int(num_col - 1)] < 0.8 and img_h_new >= 8000: + elif img_h_new >= 8000: + img_new = np.copy(img) + num_column_is_classified = False + else: + img_new = resize_image(img, img_h_new, img_w_new) + num_column_is_classified = True + + return img_new, num_column_is_classified + + def resize_and_enhance_image_with_column_classifier(self, light_version): + self.logger.debug("enter resize_and_enhance_image_with_column_classifier") + dpi = 0#self.dpi + self.logger.info("Detected %s DPI", dpi) + if self.input_binary: + img = self.imread() + prediction_bin = self.do_prediction(True, img, self.model_bin, n_batch_inference=5) + prediction_bin = 255 * (prediction_bin[:,:,0]==0) + prediction_bin = np.repeat(prediction_bin[:, :, np.newaxis], 3, axis=2).astype(np.uint8) + img= np.copy(prediction_bin) + img_bin = prediction_bin + else: + img = self.imread() + self.h_org, self.w_org = img.shape[:2] + img_bin = None + + width_early = img.shape[1] + t1 = time.time() + _, page_coord = self.early_page_for_num_of_column_classification(img_bin) + + self.image_page_org_size = img[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :] + self.page_coord = page_coord + + if self.num_col_upper and not self.num_col_lower: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + elif self.num_col_lower and not self.num_col_upper: + num_col = self.num_col_lower + label_p_pred = [np.ones(6)] + elif not self.num_col_upper and not self.num_col_lower: + if self.input_binary: + img_in = np.copy(img) + img_in = img_in / 255.0 + img_in = cv2.resize(img_in, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = img_in.reshape(1, 448, 448, 3) + else: + img_1ch = 
self.imread(grayscale=True) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = self.model_classifier.predict(img_in, verbose=0) + num_col = np.argmax(label_p_pred[0]) + 1 + elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower): + if self.input_binary: + img_in = np.copy(img) + img_in = img_in / 255.0 + img_in = cv2.resize(img_in, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = img_in.reshape(1, 448, 448, 3) + else: + img_1ch = self.imread(grayscale=True) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = self.model_classifier.predict(img_in, verbose=0) + num_col = np.argmax(label_p_pred[0]) + 1 + + if num_col > self.num_col_upper: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + if num_col < self.num_col_lower: + num_col = self.num_col_lower + label_p_pred = [np.ones(6)] + else: + num_col = self.num_col_upper + label_p_pred = [np.ones(6)] + + self.logger.info("Found %d columns (%s)", num_col, np.around(label_p_pred, decimals=5)) + + if dpi < DPI_THRESHOLD: + if light_version and num_col in (1,2): + img_new, num_column_is_classified = self.calculate_width_height_by_columns_1_2( + img, num_col, width_early, label_p_pred) + else: + img_new, num_column_is_classified = self.calculate_width_height_by_columns( + img, num_col, width_early, label_p_pred) + if light_version: + image_res = np.copy(img_new) + else: + image_res = self.predict_enhancement(img_new) + is_image_enhanced = True + + else: + num_column_is_classified = True + image_res = np.copy(img) + is_image_enhanced = False + + self.logger.debug("exit resize_and_enhance_image_with_column_classifier") + return is_image_enhanced, img, image_res, num_col, num_column_is_classified, img_bin + def read_xml(self, xml_file): + file_name = Path(xml_file).stem + tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8')) + root1=tree1.getroot() + alltags=[elem.tag for elem in root1.iter()] + link=alltags[0].split('}')[0]+'}' + + index_tot_regions = [] + tot_region_ref = [] + + for jj in root1.iter(link+'Page'): + y_len=int(jj.attrib['imageHeight']) + x_len=int(jj.attrib['imageWidth']) + + for jj in root1.iter(link+'RegionRefIndexed'): + index_tot_regions.append(jj.attrib['index']) + tot_region_ref.append(jj.attrib['regionRef']) + + if (link+'PrintSpace' in alltags) or (link+'Border' in alltags): + co_printspace = [] + if link+'PrintSpace' in alltags: + region_tags_printspace = np.unique([x for x in alltags if x.endswith('PrintSpace')]) + elif link+'Border' in alltags: + region_tags_printspace = np.unique([x for x in alltags if x.endswith('Border')]) + + for tag in region_tags_printspace: + if link+'PrintSpace' in alltags: + tag_endings_printspace = ['}PrintSpace','}printspace'] + elif link+'Border' in alltags: + tag_endings_printspace = ['}Border','}border'] + + if 
tag.endswith(tag_endings_printspace[0]) or tag.endswith(tag_endings_printspace[1]): + for nn in root1.iter(tag): + c_t_in = [] + sumi = 0 + for vv in nn.iter(): + # check the format of coords + if vv.tag == link + 'Coords': + coords = bool(vv.attrib) + if coords: + p_h = vv.attrib['points'].split(' ') + c_t_in.append( + np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) + break + else: + pass + + if vv.tag == link + 'Point': + c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))]) + sumi += 1 + elif vv.tag != link + 'Point' and sumi >= 1: + break + co_printspace.append(np.array(c_t_in)) + img_printspace = np.zeros( (y_len,x_len,3) ) + img_printspace=cv2.fillPoly(img_printspace, pts =co_printspace, color=(1,1,1)) + img_printspace = img_printspace.astype(np.uint8) + + imgray = cv2.cvtColor(img_printspace, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) + cnt = contours[np.argmax(cnt_size)] + x, y, w, h = cv2.boundingRect(cnt) + + bb_coord_printspace = [x, y, w, h] + + else: + bb_coord_printspace = None + + + region_tags=np.unique([x for x in alltags if x.endswith('Region')]) + co_text_paragraph=[] + co_text_drop=[] + co_text_heading=[] + co_text_header=[] + co_text_marginalia=[] + co_text_catch=[] + co_text_page_number=[] + co_text_signature_mark=[] + co_sep=[] + co_img=[] + co_table=[] + co_graphic=[] + co_graphic_text_annotation=[] + co_graphic_decoration=[] + co_noise=[] + + co_text_paragraph_text=[] + co_text_drop_text=[] + co_text_heading_text=[] + co_text_header_text=[] + co_text_marginalia_text=[] + co_text_catch_text=[] + co_text_page_number_text=[] + co_text_signature_mark_text=[] + co_sep_text=[] + co_img_text=[] + co_table_text=[] + co_graphic_text=[] + co_graphic_text_annotation_text=[] + co_graphic_decoration_text=[] + co_noise_text=[] + + id_paragraph = [] + id_header = [] + id_heading = [] + id_marginalia = [] + + for tag in region_tags: + if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): + for nn in root1.iter(tag): + for child2 in nn: + tag2 = child2.tag + if tag2.endswith('}TextEquiv') or tag2.endswith('}TextEquiv'): + for childtext2 in child2: + if childtext2.tag.endswith('}Unicode') or childtext2.tag.endswith('}Unicode'): + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + co_text_drop_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='heading': + co_text_heading_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + co_text_signature_mark_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='header': + co_text_header_text.append(childtext2.text) + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###co_text_catch_text.append(childtext2.text) + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': + ###co_text_page_number_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + co_text_marginalia_text.append(childtext2.text) + else: + co_text_paragraph_text.append(childtext2.text) + c_t_in_drop=[] + c_t_in_paragraph=[] + c_t_in_heading=[] + c_t_in_header=[] + c_t_in_page_number=[] + c_t_in_signature_mark=[] + c_t_in_catch=[] + c_t_in_marginalia=[] + + + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + + 
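                        # Coordinates appear in two PAGE encodings: either a single Coords
                        # element whose 'points' attribute packs all vertices into one string,
                        # or a run of individual Point elements (x/y attributes) that is
                        # collected until the first non-Point tag is reached.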
coords=bool(vv.attrib) + if coords: + #print('birda1') + p_h=vv.attrib['points'].split(' ') + + + + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + + c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='heading': + ##id_heading.append(nn.attrib['id']) + c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + + c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + elif "type" in nn.attrib and nn.attrib['type']=='header': + #id_header.append(nn.attrib['id']) + c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': + + ###c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + #id_marginalia.append(nn.attrib['id']) + + c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + else: + #id_paragraph.append(nn.attrib['id']) + + c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + break + else: + pass + + + if vv.tag==link+'Point': + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + + c_t_in_drop.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='heading': + #id_heading.append(nn.attrib['id']) + c_t_in_heading.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + + c_t_in_signature_mark.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + elif "type" in nn.attrib and nn.attrib['type']=='header': + #id_header.append(nn.attrib['id']) + c_t_in_header.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###c_t_in_catch.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + ###sumi+=1 + + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': + + ###c_t_in_page_number.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + ###sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + #id_marginalia.append(nn.attrib['id']) + + c_t_in_marginalia.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + else: + #id_paragraph.append(nn.attrib['id']) + c_t_in_paragraph.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + + if len(c_t_in_drop)>0: + co_text_drop.append(np.array(c_t_in_drop)) + if len(c_t_in_paragraph)>0: + co_text_paragraph.append(np.array(c_t_in_paragraph)) + id_paragraph.append(nn.attrib['id']) + if len(c_t_in_heading)>0: + co_text_heading.append(np.array(c_t_in_heading)) + id_heading.append(nn.attrib['id']) + + if len(c_t_in_header)>0: + co_text_header.append(np.array(c_t_in_header)) + 
id_header.append(nn.attrib['id']) + if len(c_t_in_page_number)>0: + co_text_page_number.append(np.array(c_t_in_page_number)) + if len(c_t_in_catch)>0: + co_text_catch.append(np.array(c_t_in_catch)) + + if len(c_t_in_signature_mark)>0: + co_text_signature_mark.append(np.array(c_t_in_signature_mark)) + + if len(c_t_in_marginalia)>0: + co_text_marginalia.append(np.array(c_t_in_marginalia)) + id_marginalia.append(nn.attrib['id']) + + + elif tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): + for nn in root1.iter(tag): + c_t_in=[] + c_t_in_text_annotation=[] + c_t_in_decoration=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='decoration': + c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + else: + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + break + else: + pass + + + if vv.tag==link+'Point': + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + c_t_in_text_annotation.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='decoration': + c_t_in_decoration.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + else: + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + if len(c_t_in_text_annotation)>0: + co_graphic_text_annotation.append(np.array(c_t_in_text_annotation)) + if len(c_t_in_decoration)>0: + co_graphic_decoration.append(np.array(c_t_in_decoration)) + if len(c_t_in)>0: + co_graphic.append(np.array(c_t_in)) + + + + elif tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + elif vv.tag!=link+'Point' and sumi>=1: + break + co_img.append(np.array(c_t_in)) + co_img_text.append(' ') + + + elif tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + elif vv.tag!=link+'Point' and sumi>=1: + break + co_sep.append(np.array(c_t_in)) + + + + elif tag.endswith('}TableRegion') or tag.endswith('}tableregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , 
int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + co_table.append(np.array(c_t_in)) + co_table_text.append(' ') + + elif tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + co_noise.append(np.array(c_t_in)) + co_noise_text.append(' ') + + img = np.zeros( (y_len,x_len,3) ) + img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(1,1,1)) + + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(2,2,2)) + img_poly=cv2.fillPoly(img, pts =co_text_header, color=(2,2,2)) + img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(3,3,3)) + img_poly=cv2.fillPoly(img, pts =co_img, color=(4,4,4)) + img_poly=cv2.fillPoly(img, pts =co_sep, color=(5,5,5)) + + return tree1, root1, bb_coord_printspace, file_name, id_paragraph, id_header+id_heading, co_text_paragraph, co_text_header+co_text_heading,\ + tot_region_ref,x_len, y_len,index_tot_regions, img_poly + + def return_indexes_of_contours_loctaed_inside_another_list_of_contours(self, contours, contours_loc, cx_main_loc, cy_main_loc, indexes_loc): + indexes_of_located_cont = [] + center_x_coordinates_of_located = [] + center_y_coordinates_of_located = [] + #M_main_tot = [cv2.moments(contours_loc[j]) + #for j in range(len(contours_loc))] + #cx_main_loc = [(M_main_tot[j]["m10"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + #cy_main_loc = [(M_main_tot[j]["m01"] / (M_main_tot[j]["m00"] + 1e-32)) for j in range(len(M_main_tot))] + + for ij in range(len(contours)): + results = [cv2.pointPolygonTest(contours[ij], (cx_main_loc[ind], cy_main_loc[ind]), False) + for ind in range(len(cy_main_loc)) ] + results = np.array(results) + indexes_in = np.where((results == 0) | (results == 1)) + indexes = indexes_loc[indexes_in]# [(results == 0) | (results == 1)]#np.where((results == 0) | (results == 1)) + + indexes_of_located_cont.append(indexes) + center_x_coordinates_of_located.append(np.array(cx_main_loc)[indexes_in] ) + center_y_coordinates_of_located.append(np.array(cy_main_loc)[indexes_in] ) + + return indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located + + def do_order_of_regions_with_model(self, contours_only_text_parent, contours_only_text_parent_h, text_regions_p): + height1 =672#448 + width1 = 448#224 + + height2 =672#448 + width2= 448#224 + + height3 =672#448 + width3 = 448#224 + + inference_bs = 3 + + ver_kernel = np.ones((5, 1), dtype=np.uint8) + hor_kernel = np.ones((1, 5), dtype=np.uint8) + + + min_cont_size_to_be_dilated = 10 + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + cx_conts, cy_conts, x_min_conts, x_max_conts, y_min_conts, y_max_conts, _ = find_new_features_of_contours(contours_only_text_parent) + args_cont_located = np.array(range(len(contours_only_text_parent))) + + diff_y_conts = np.abs(y_max_conts[:]-y_min_conts) + diff_x_conts = 
np.abs(x_max_conts[:]-x_min_conts) + + mean_x = statistics.mean(diff_x_conts) + median_x = statistics.median(diff_x_conts) + + + diff_x_ratio= diff_x_conts/mean_x + + args_cont_located_excluded = args_cont_located[diff_x_ratio>=1.3] + args_cont_located_included = args_cont_located[diff_x_ratio<1.3] + + contours_only_text_parent_excluded = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]>=1.3]#contours_only_text_parent[diff_x_ratio>=1.3] + contours_only_text_parent_included = [contours_only_text_parent[ind] for ind in range(len(contours_only_text_parent)) if diff_x_ratio[ind]<1.3]#contours_only_text_parent[diff_x_ratio<1.3] + + + cx_conts_excluded = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]>=1.3]#cx_conts[diff_x_ratio>=1.3] + cx_conts_included = [cx_conts[ind] for ind in range(len(cx_conts)) if diff_x_ratio[ind]<1.3]#cx_conts[diff_x_ratio<1.3] + + cy_conts_excluded = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]>=1.3]#cy_conts[diff_x_ratio>=1.3] + cy_conts_included = [cy_conts[ind] for ind in range(len(cy_conts)) if diff_x_ratio[ind]<1.3]#cy_conts[diff_x_ratio<1.3] + + #print(diff_x_ratio, 'ratio') + text_regions_p = text_regions_p.astype('uint8') + + if len(contours_only_text_parent_excluded)>0: + textregion_par = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1])).astype('uint8') + textregion_par = cv2.fillPoly(textregion_par, pts=contours_only_text_parent_included, color=(1,1)) + else: + textregion_par = (text_regions_p[:,:]==1)*1 + textregion_par = textregion_par.astype('uint8') + + text_regions_p_textregions_dilated = cv2.erode(textregion_par , hor_kernel, iterations=2) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=4) + text_regions_p_textregions_dilated = cv2.erode(text_regions_p_textregions_dilated , hor_kernel, iterations=1) + text_regions_p_textregions_dilated = cv2.dilate(text_regions_p_textregions_dilated , ver_kernel, iterations=5) + text_regions_p_textregions_dilated[text_regions_p[:,:]>1] = 0 + + + contours_only_dilated, hir_on_text_dilated = return_contours_of_image(text_regions_p_textregions_dilated) + contours_only_dilated = return_parent_contours(contours_only_dilated, hir_on_text_dilated) + + indexes_of_located_cont, center_x_coordinates_of_located, center_y_coordinates_of_located = self.return_indexes_of_contours_loctaed_inside_another_list_of_contours(contours_only_dilated, contours_only_text_parent_included, cx_conts_included, cy_conts_included, args_cont_located_included) + + + if len(args_cont_located_excluded)>0: + for ind in args_cont_located_excluded: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + #print(len( np.unique(flattened_array)), 'indexes_of_located_cont uniques') + + missing_textregions = list( set(np.array(range(len(contours_only_text_parent))) ) - set(np.unique(flattened_array)) ) + #print(missing_textregions, 'missing_textregions') + + for ind in missing_textregions: + indexes_of_located_cont.append(np.array([ind])) + contours_only_dilated.append(contours_only_text_parent[ind]) + center_y_coordinates_of_located.append(0) + + + if contours_only_text_parent_h: + for vi in 
range(len(contours_only_text_parent_h)): + indexes_of_located_cont.append(int(vi+len(contours_only_text_parent))) + + array_list = [np.array([elem]) if isinstance(elem, int) else elem for elem in indexes_of_located_cont] + flattened_array = np.concatenate([arr.ravel() for arr in array_list]) + + y_len = text_regions_p.shape[0] + x_len = text_regions_p.shape[1] + + img_poly = np.zeros((y_len,x_len), dtype='uint8') + img_poly[text_regions_p[:,:]==1] = 1 + img_poly[text_regions_p[:,:]==2] = 2 + img_poly[text_regions_p[:,:]==3] = 4 + img_poly[text_regions_p[:,:]==6] = 5 + + img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') + if contours_only_text_parent_h: + _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours( + contours_only_text_parent_h) + for j in range(len(cy_main)): + img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12, + int(x_min_main[j]):int(x_max_main[j])] = 1 + co_text_all_org = contours_only_text_parent + contours_only_text_parent_h + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + co_text_all = contours_only_dilated + contours_only_text_parent_h + else: + co_text_all = contours_only_text_parent + contours_only_text_parent_h + else: + co_text_all_org = contours_only_text_parent + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + co_text_all = contours_only_dilated + else: + co_text_all = contours_only_text_parent + + if not len(co_text_all): + return [], [] + + labels_con = np.zeros((int(y_len /6.), int(x_len/6.), len(co_text_all)), dtype=bool) + + co_text_all = [(i/6).astype(int) for i in co_text_all] + for i in range(len(co_text_all)): + img = labels_con[:,:,i].astype(np.uint8) + + #img = cv2.resize(img, (int(img.shape[1]/6), int(img.shape[0]/6)), interpolation=cv2.INTER_NEAREST) + + cv2.fillPoly(img, pts=[co_text_all[i]], color=(1,)) + labels_con[:,:,i] = img + + + labels_con = resize_image(labels_con.astype(np.uint8), height1, width1).astype(bool) + img_header_and_sep = resize_image(img_header_and_sep, height1, width1) + img_poly = resize_image(img_poly, height3, width3) + + + + input_1 = np.zeros((inference_bs, height1, width1, 3)) + ordered = [list(range(len(co_text_all)))] + index_update = 0 + #print(labels_con.shape[2],"number of regions for reading order") + while index_update>=0: + ij_list = ordered.pop(index_update) + i = ij_list.pop(0) + + ante_list = [] + post_list = [] + tot_counter = 0 + batch = [] + for j in ij_list: + img1 = labels_con[:,:,i].astype(float) + img2 = labels_con[:,:,j].astype(float) + img1[img_poly==5] = 2 + img2[img_poly==5] = 2 + img1[img_header_and_sep==1] = 3 + img2[img_header_and_sep==1] = 3 + + input_1[len(batch), :, :, 0] = img1 / 3. + input_1[len(batch), :, :, 2] = img2 / 3. + input_1[len(batch), :, :, 1] = img_poly / 5. 
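                    # Each candidate pair (i, j) becomes one 3-channel input for the binary
                    # reading-order model: channel 0 holds region i's mask and channel 2
                    # region j's mask (pixels where img_poly==5 re-labelled 2, pixels under
                    # img_header_and_sep re-labelled 3), channel 1 holds the full layout map;
                    # all three are scaled into [0, 1]. Pairs are scored in batches of
                    # inference_bs, and a prediction >= 0.5 places j after the pivot i
                    # (post_list), otherwise before it (ante_list); the outer loop repeats
                    # with new pivots until every group is a singleton.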
+ + tot_counter += 1 + batch.append(j) + if tot_counter % inference_bs == 0 or tot_counter == len(ij_list): + y_pr = self.model_reading_order.predict(input_1 , verbose=0) + for jb, j in enumerate(batch): + if y_pr[jb][0]>=0.5: + post_list.append(j) + else: + ante_list.append(j) + batch = [] + + if len(ante_list): + ordered.insert(index_update, ante_list) + index_update += 1 + ordered.insert(index_update, [i]) + if len(post_list): + ordered.insert(index_update + 1, post_list) + + index_update = -1 + for index_next, ij_list in enumerate(ordered): + if len(ij_list) > 1: + index_update = index_next + break + + ordered = [i[0] for i in ordered] + + ##id_all_text = np.array(id_all_text)[index_sort] + + + if len(contours_only_text_parent)>min_cont_size_to_be_dilated and self.light_version: + org_contours_indexes = [] + for ind in range(len(ordered)): + region_with_curr_order = ordered[ind] + if region_with_curr_order < len(contours_only_dilated): + if np.isscalar(indexes_of_located_cont[region_with_curr_order]): + org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + else: + arg_sort_located_cont = np.argsort(center_y_coordinates_of_located[region_with_curr_order]) + org_contours_indexes = org_contours_indexes + list(np.array(indexes_of_located_cont[region_with_curr_order])[arg_sort_located_cont]) ##org_contours_indexes + list ( + else: + org_contours_indexes = org_contours_indexes + [indexes_of_located_cont[region_with_curr_order]] + + region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] + return org_contours_indexes, region_ids + else: + region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] + return ordered, region_ids + + + + + def run(self, xml_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False): + """ + Get image and scales, then extract the page of scanned image + """ + self.logger.debug("enter run") + t0_tot = time.time() + + if dir_in: + self.ls_xmls = os.listdir(dir_in) + elif xml_filename: + self.ls_xmls = [xml_filename] + else: + raise ValueError("run requires either a single image filename or a directory") + + for xml_filename in self.ls_xmls: + self.logger.info(xml_filename) + t0 = time.time() + + if dir_in: + xml_file = os.path.join(dir_in, xml_filename) + else: + xml_file = xml_filename + + tree_xml, root_xml, bb_coord_printspace, file_name, id_paragraph, id_header, co_text_paragraph, co_text_header, tot_region_ref, x_len, y_len, index_tot_regions, img_poly = self.read_xml(xml_file) + + id_all_text = id_paragraph + id_header + + order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model(co_text_paragraph, co_text_header, img_poly[:,:,0]) + + id_all_text = np.array(id_all_text)[order_text_new] + + alltags=[elem.tag for elem in root_xml.iter()] + + + + link=alltags[0].split('}')[0]+'}' + name_space = alltags[0].split('}')[0] + name_space = name_space.split('{')[1] + + page_element = root_xml.find(link+'Page') + + + old_ro = root_xml.find(".//{*}ReadingOrder") + + if old_ro is not None: + page_element.remove(old_ro) + + #print(old_ro, 'old_ro') + ro_subelement = ET.Element('ReadingOrder') + + ro_subelement2 = ET.SubElement(ro_subelement, 'OrderedGroup') + ro_subelement2.set('id', "ro357564684568544579089") + + for index, id_text in enumerate(id_all_text): + new_element_2 = ET.SubElement(ro_subelement2, 'RegionRefIndexed') + new_element_2.set('regionRef', id_all_text[index]) + new_element_2.set('index', str(index)) + + if (link+'PrintSpace' in alltags) or 
(link+'Border' in alltags): + page_element.insert(1, ro_subelement) + else: + page_element.insert(0, ro_subelement) + + alltags=[elem.tag for elem in root_xml.iter()] + + ET.register_namespace("",name_space) + tree_xml.write(os.path.join(self.dir_out, file_name+'.xml'),xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) + + #sys.exit() + From 7dd281267df33c89ec26945559fb2e10bd67f9c1 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 26 Aug 2025 22:38:03 +0200 Subject: [PATCH 39/40] Marginals are divided into left and right, and written from top to bottom. --- src/eynollah/eynollah.py | 138 ++++++++++++++++++++++++-------- src/eynollah/mb_ro_on_layout.py | 18 +++-- src/eynollah/utils/utils_ocr.py | 88 ++++++++++---------- src/eynollah/utils/xml.py | 10 ++- src/eynollah/writer.py | 58 ++++++++++---- 5 files changed, 215 insertions(+), 97 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 5299d3e..30e180d 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -289,7 +289,7 @@ class Eynollah: self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425" self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314" self.model_region_dir_p_ens_light_only_images_extraction = dir_models + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18" - self.model_reading_order_dir = dir_models + "/model_step_4800000_mb_ro"#"/model_ens_reading_order_machine_based" + self.model_reading_order_dir = dir_models + "/model_eynollah_reading_order_20250824"#"/model_mb_ro_aug_ens_11"#"/model_step_3200000_mb_ro"#"/model_ens_reading_order_machine_based"#"/model_mb_ro_aug_ens_8"#"/model_ens_reading_order_machine_based" #"/modelens_12sp_elay_0_3_4__3_6_n" #"/modelens_earlylayout_12spaltige_2_3_5_6_7_8" #"/modelens_early12_sp_2_3_5_6_7_8_9_10_12_14_15_16_18" @@ -725,6 +725,7 @@ class Eynollah: label_p_pred = self.model_classifier.predict(img_in, verbose=0) num_col = np.argmax(label_p_pred[0]) + 1 + elif (self.num_col_upper and self.num_col_lower) and (self.num_col_upper!=self.num_col_lower): if self.input_binary: img_in = np.copy(img) @@ -3090,6 +3091,26 @@ class Eynollah: num_col = num_col + 1 if not num_column_is_classified: num_col_classifier = num_col + 1 + if self.num_col_upper and self.num_col_lower: + if self.num_col_upper == self.num_col_lower: + num_col_classifier = self.num_col_upper + else: + if num_col_classifier < self.num_col_lower: + num_col_classifier = self.num_col_lower + if num_col_classifier > self.num_col_upper: + num_col_classifier = self.num_col_upper + + elif self.num_col_lower and not self.num_col_upper: + if num_col_classifier < self.num_col_lower: + num_col_classifier = self.num_col_lower + + elif self.num_col_upper and not self.num_col_lower: + if num_col_classifier > self.num_col_upper: + num_col_classifier = self.num_col_upper + + else: + pass + except Exception as why: self.logger.error(why) num_col = None @@ -3223,7 +3244,6 @@ class Eynollah: text_regions_p_1[mask_lines[:, :] == 1] = 3 text_regions_p = text_regions_p_1[:, :] text_regions_p = np.array(text_regions_p) - if num_col_classifier in (1, 2): try: regions_without_separators = (text_regions_p[:, :] == 1) * 1 @@ -4447,6 +4467,43 @@ class Eynollah: return (slopes_rem, all_found_textline_polygons_rem, boxes_text_rem, txt_con_org_rem, contours_only_text_parent_rem, index_by_text_par_con_rem_sort) + + def separate_marginals_to_left_and_right_and_order_from_top_to_down(self, polygons_of_marginals, 
all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width): + cx_marg, cy_marg, _, _, _, _, _ = find_new_features_of_contours( + polygons_of_marginals) + + cx_marg = np.array(cx_marg) + cy_marg = np.array(cy_marg) + + poly_marg_left = list( np.array(polygons_of_marginals)[cx_marg < mid_point_of_page_width] ) + poly_marg_right = list( np.array(polygons_of_marginals)[cx_marg >= mid_point_of_page_width] ) + + all_found_textline_polygons_marginals_left = list( np.array(all_found_textline_polygons_marginals)[cx_marg < mid_point_of_page_width] ) + all_found_textline_polygons_marginals_right = list( np.array(all_found_textline_polygons_marginals)[cx_marg >= mid_point_of_page_width] ) + + all_box_coord_marginals_left = list( np.array(all_box_coord_marginals)[cx_marg < mid_point_of_page_width] ) + all_box_coord_marginals_right = list( np.array(all_box_coord_marginals)[cx_marg >= mid_point_of_page_width] ) + + slopes_marg_left = list( np.array(slopes_marginals)[cx_marg < mid_point_of_page_width] ) + slopes_marg_right = list( np.array(slopes_marginals)[cx_marg >= mid_point_of_page_width] ) + + cy_marg_left = cy_marg[cx_marg < mid_point_of_page_width] + cy_marg_right = cy_marg[cx_marg >= mid_point_of_page_width] + + ordered_left_marginals = [poly for _, poly in sorted(zip(cy_marg_left, poly_marg_left), key=lambda x: x[0])] + ordered_right_marginals = [poly for _, poly in sorted(zip(cy_marg_right, poly_marg_right), key=lambda x: x[0])] + + ordered_left_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_left, all_found_textline_polygons_marginals_left), key=lambda x: x[0])] + ordered_right_marginals_textline = [poly for _, poly in sorted(zip(cy_marg_right, all_found_textline_polygons_marginals_right), key=lambda x: x[0])] + + ordered_left_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_left, all_box_coord_marginals_left), key=lambda x: x[0])] + ordered_right_marginals_bbox = [poly for _, poly in sorted(zip(cy_marg_right, all_box_coord_marginals_right), key=lambda x: x[0])] + + ordered_left_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_left, slopes_marg_left), key=lambda x: x[0])] + ordered_right_slopes_marginals = [poly for _, poly in sorted(zip(cy_marg_right, slopes_marg_right), key=lambda x: x[0])] + + return ordered_left_marginals, ordered_right_marginals, ordered_left_marginals_textline, ordered_right_marginals_textline, ordered_left_marginals_bbox, ordered_right_marginals_bbox, ordered_left_slopes_marginals, ordered_right_slopes_marginals + def run(self, image_filename : Optional[str] = None, dir_in : Optional[str] = None, overwrite : bool = False): """ @@ -4489,12 +4546,13 @@ class Eynollah: t0 = time.time() img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) self.logger.info("Enhancing took %.1fs ", time.time() - t0) + if self.extract_only_images: text_regions_p_1, erosion_hurts, polygons_lines_xml, polygons_of_images, image_page, page_coord, cont_page = \ self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], - polygons_of_images, [], [], [], [], [], + polygons_of_images, [], [], [], [], [], [], [], [], [], cont_page, [], []) if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) @@ -4508,7 +4566,6 @@ class Eynollah: page_coord, image_page, textline_mask_tot_ea, img_bin_light, cont_page = \ 
self.run_graphics_and_columns_without_layout(textline_mask_tot_ea, img_bin_light) - ##all_found_textline_polygons =self.scale_contours_new(textline_mask_tot_ea) cnt_clean_rot_raw, hir_on_cnt_clean_rot = return_contours_of_image(textline_mask_tot_ea) @@ -4530,10 +4587,14 @@ class Eynollah: id_of_texts_tot =['region_0001'] polygons_of_images = [] - slopes_marginals = [] - polygons_of_marginals = [] - all_found_textline_polygons_marginals = [] - all_box_coord_marginals = [] + slopes_marginals_left = [] + slopes_marginals_right = [] + polygons_of_marginals_left = [] + polygons_of_marginals_right = [] + all_found_textline_polygons_marginals_left = [] + all_found_textline_polygons_marginals_right = [] + all_box_coord_marginals_left = [] + all_box_coord_marginals_right = [] polygons_lines_xml = [] contours_tables = [] conf_contours_textregions =[0] @@ -4546,8 +4607,8 @@ class Eynollah: pcgts = self.writer.build_pagexml_no_full_layout( cont_page, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, + all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines=ocr_all_textlines, conf_contours_textregion=conf_contours_textregions, skip_layout_reading_order=self.skip_layout_and_reading_order) return pcgts @@ -4595,11 +4656,10 @@ class Eynollah: #self.logger.info('cont_page %s', cont_page) #plt.imshow(table_prediction) #plt.show() - if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], [], [], [], [], [], [], + [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], [], cont_page, [], []) return pcgts @@ -4771,6 +4831,7 @@ class Eynollah: contours_only_text_parent_d_ordered = [] contours_only_text_parent_d = [] #contours_only_text_parent = [] + if not len(contours_only_text_parent): # stop early empty_marginals = [[]] * len(polygons_of_marginals) @@ -4778,13 +4839,13 @@ class Eynollah: pcgts = self.writer.build_pagexml_full_layout( [], [], page_coord, [], [], [], [], [], [], polygons_of_images, contours_tables, [], - polygons_of_marginals, empty_marginals, empty_marginals, [], [], [], + polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, empty_marginals, empty_marginals, [], [], [], [], cont_page, polygons_lines_xml) else: pcgts = self.writer.build_pagexml_no_full_layout( [], page_coord, [], [], [], [], polygons_of_images, - polygons_of_marginals, empty_marginals, empty_marginals, [], [], + polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, empty_marginals, empty_marginals, [], [], [], cont_page, polygons_lines_xml, contours_tables) return pcgts @@ -4877,8 +4938,11 @@ class Eynollah: num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2( all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - - #print("text region early 6 in %.1fs", time.time() - t0) + + mid_point_of_page_width = text_regions_p.shape[1] / 2. 
+ polygons_of_marginals_left, polygons_of_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes_marginals_left, slopes_marginals_right = self.separate_marginals_to_left_and_right_and_order_from_top_to_down(polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes_marginals, mid_point_of_page_width) + + #print(len(polygons_of_marginals), len(ordered_left_marginals), len(ordered_right_marginals), 'marginals ordred') if self.full_layout: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = self.return_list_of_contours_with_desired_order( @@ -4961,7 +5025,6 @@ class Eynollah: tror = time.time() order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( contours_only_text_parent, contours_only_text_parent_h, text_regions_p) - print('time spend for mb ro', time.time()-tror) else: if np.abs(slope_deskew) < SLOPE_THRESHOLD: order_text_new, id_of_texts_tot = self.do_order_of_regions( @@ -4978,10 +5041,15 @@ class Eynollah: else: ocr_all_textlines = None - if all_found_textline_polygons_marginals and len(all_found_textline_polygons_marginals)>0: - ocr_all_textlines_marginals = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: + ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_left, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: - ocr_all_textlines_marginals = None + ocr_all_textlines_marginals_left = None + + if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: + ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_right, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + else: + ocr_all_textlines_marginals_right = None if all_found_textline_polygons_h and len(all_found_textline_polygons)>0: ocr_all_textlines_h = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_h, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) @@ -4994,15 +5062,16 @@ class Eynollah: ocr_all_textlines_drop = None else: ocr_all_textlines = None - ocr_all_textlines_marginals = None + ocr_all_textlines_marginals_left = None + ocr_all_textlines_marginals_right = None ocr_all_textlines_h = None ocr_all_textlines_drop = None pcgts = self.writer.build_pagexml_full_layout( contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, - polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, - cont_page, polygons_lines_xml, ocr_all_textlines, ocr_all_textlines_h, ocr_all_textlines_marginals, ocr_all_textlines_drop, conf_contours_textregions, conf_contours_textregions_h) + polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals_left, polygons_of_marginals_right, + 
all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_lines_xml, ocr_all_textlines, ocr_all_textlines_h, ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, ocr_all_textlines_drop, conf_contours_textregions, conf_contours_textregions_h) return pcgts contours_only_text_parent_h = None @@ -5077,19 +5146,24 @@ class Eynollah: gc.collect() if len(all_found_textline_polygons)>0: ocr_all_textlines = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) - if all_found_textline_polygons_marginals and len(all_found_textline_polygons_marginals)>0: - ocr_all_textlines_marginals = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + + if all_found_textline_polygons_marginals_left and len(all_found_textline_polygons_marginals_left)>0: + ocr_all_textlines_marginals_left = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_left, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) + + if all_found_textline_polygons_marginals_right and len(all_found_textline_polygons_marginals_right)>0: + ocr_all_textlines_marginals_right = return_rnn_cnn_ocr_of_given_textlines(image_page, all_found_textline_polygons_marginals_right, self.prediction_model, self.b_s_ocr, self.num_to_char, self.textline_light, self.curved_line) else: ocr_all_textlines = None - ocr_all_textlines_marginals = None + ocr_all_textlines_marginals_left = None + ocr_all_textlines_marginals_right = None self.logger.info("detection of reading order took %.1fs", time.time() - t_order) pcgts = self.writer.build_pagexml_no_full_layout( txt_con_org, page_coord, order_text_new, id_of_texts_tot, - all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, - all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, - cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, ocr_all_textlines_marginals, conf_contours_textregions) + all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, + all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, + cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, ocr_all_textlines_marginals_left, ocr_all_textlines_marginals_right, conf_contours_textregions) return pcgts @@ -5145,7 +5219,7 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250805" + self.model_ocr_dir = dir_models + "/model_step_45000_ocr"#"/model_eynollah_ocr_cnnrnn_20250805"# model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5156,7 +5230,7 @@ class Eynollah_ocr: else: self.b_s = int(batch_size) - with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: + with open(os.path.join(self.model_ocr_dir, "characters_20250707_all_lang.txt"),"r") as config_file: characters = json.load(config_file) 
AUTOTUNE = tf.data.AUTOTUNE diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py index 7625a90..c03d831 100644 --- a/src/eynollah/mb_ro_on_layout.py +++ b/src/eynollah/mb_ro_on_layout.py @@ -64,7 +64,7 @@ class machine_based_reading_order_on_layout: self.executor = ProcessPoolExecutor(max_workers=cpu_count(), timeout=1200) atexit.register(self.executor.shutdown) self.dir_models = dir_models - self.model_reading_order_dir = dir_models + "/model_step_5100000_mb_ro"#"/model_ens_reading_order_machine_based" + self.model_reading_order_dir = dir_models + "/model_eynollah_reading_order_20250824"#"/model_ens_reading_order_machine_based" try: for device in tf.config.list_physical_devices('GPU'): @@ -942,10 +942,18 @@ class machine_based_reading_order_on_layout: x_len = text_regions_p.shape[1] img_poly = np.zeros((y_len,x_len), dtype='uint8') - img_poly[text_regions_p[:,:]==1] = 1 - img_poly[text_regions_p[:,:]==2] = 2 - img_poly[text_regions_p[:,:]==3] = 4 - img_poly[text_regions_p[:,:]==6] = 5 + ###img_poly[text_regions_p[:,:]==1] = 1 + ###img_poly[text_regions_p[:,:]==2] = 2 + ###img_poly[text_regions_p[:,:]==3] = 4 + ###img_poly[text_regions_p[:,:]==6] = 5 + + ##img_poly[text_regions_p[:,:]==1] = 1 + ##img_poly[text_regions_p[:,:]==2] = 2 + ##img_poly[text_regions_p[:,:]==3] = 3 + ##img_poly[text_regions_p[:,:]==4] = 4 + ##img_poly[text_regions_p[:,:]==5] = 5 + + img_poly = np.copy(text_regions_p) img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') if contours_only_text_parent_h: diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 1e9162a..d974650 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -384,57 +384,63 @@ def return_rnn_cnn_ocr_of_given_textlines(image, all_found_textline_polygons, pr for indexing, ind_poly_first in enumerate(all_found_textline_polygons): #ocr_textline_in_textregion = [] - for indexing2, ind_poly in enumerate(ind_poly_first): + if len(ind_poly_first)==0: cropped_lines_region_indexer.append(indexer_text_region) - if not (textline_light or curved_line): - ind_poly = copy.deepcopy(ind_poly) - box_ind = all_box_coord[indexing] + cropped_lines_meging_indexing.append(0) + img_fin = np.ones((image_height, image_width, 3))*1 + cropped_lines.append(img_fin) - ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) - #print(ind_poly_copy) - ind_poly[ind_poly<0] = 0 - x, y, w, h = cv2.boundingRect(ind_poly) - - w_scaled = w * image_height/float(h) + else: + for indexing2, ind_poly in enumerate(ind_poly_first): + cropped_lines_region_indexer.append(indexer_text_region) + if not (textline_light or curved_line): + ind_poly = copy.deepcopy(ind_poly) + box_ind = all_box_coord[indexing] - mask_poly = np.zeros(image.shape) - - img_poly_on_img = np.copy(image) - - mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) - - - - mask_poly = mask_poly[y:y+h, x:x+w, :] - img_crop = img_poly_on_img[y:y+h, x:x+w, :] - - img_crop[mask_poly==0] = 255 - - if w_scaled < 640:#1.5*image_width: - img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) - cropped_lines.append(img_fin) - cropped_lines_meging_indexing.append(0) - else: - splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None) + ind_poly = return_textline_contour_with_added_box_coordinate(ind_poly, box_ind) + #print(ind_poly_copy) + ind_poly[ind_poly<0] = 0 + x, y, w, h = cv2.boundingRect(ind_poly) - if splited_images: - img_fin = 
preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) - cropped_lines.append(img_fin) - cropped_lines_meging_indexing.append(1) - - img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) - - cropped_lines.append(img_fin) - cropped_lines_meging_indexing.append(-1) - - else: + w_scaled = w * image_height/float(h) + + mask_poly = np.zeros(image.shape) + + img_poly_on_img = np.copy(image) + + mask_poly = cv2.fillPoly(mask_poly, pts=[ind_poly], color=(1, 1, 1)) + + + + mask_poly = mask_poly[y:y+h, x:x+w, :] + img_crop = img_poly_on_img[y:y+h, x:x+w, :] + + img_crop[mask_poly==0] = 255 + + if w_scaled < 640:#1.5*image_width: img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) cropped_lines.append(img_fin) cropped_lines_meging_indexing.append(0) + else: + splited_images, splited_images_bin = return_textlines_split_if_needed(img_crop, None) + + if splited_images: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(1) + + img_fin = preprocess_and_resize_image_for_ocrcnn_model(splited_images[1], image_height, image_width) + + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(-1) + + else: + img_fin = preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width) + cropped_lines.append(img_fin) + cropped_lines_meging_indexing.append(0) indexer_text_region+=1 - extracted_texts = [] n_iterations = math.ceil(len(cropped_lines) / b_s_ocr) diff --git a/src/eynollah/utils/xml.py b/src/eynollah/utils/xml.py index bd95702..13420df 100644 --- a/src/eynollah/utils/xml.py +++ b/src/eynollah/utils/xml.py @@ -46,16 +46,22 @@ def create_page_xml(imageFilename, height, width): )) return pcgts -def xml_reading_order(page, order_of_texts, id_of_marginalia): +def xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right): region_order = ReadingOrderType() og = OrderedGroupType(id="ro357564684568544579089") page.set_ReadingOrder(region_order) region_order.set_OrderedGroup(og) region_counter = EynollahIdCounter() + + for id_marginal in id_of_marginalia_left: + og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) + region_counter.inc('region') + for idx_textregion, _ in enumerate(order_of_texts): og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=region_counter.region_id(order_of_texts[idx_textregion] + 1))) region_counter.inc('region') - for id_marginal in id_of_marginalia: + + for id_marginal in id_of_marginalia_right: og.add_RegionRefIndexed(RegionRefIndexedType(index=str(region_counter.get('region')), regionRef=id_marginal)) region_counter.inc('region') diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 085ee6f..2f9caf3 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -170,7 +170,7 @@ class EynollahXmlWriter(): with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines=None, 
ocr_all_textlines_marginals=None, conf_contours_textregion=None, skip_layout_reading_order=False): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals_left, found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, conf_contours_textregion=None, skip_layout_reading_order=False): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -181,8 +181,9 @@ class EynollahXmlWriter(): counter = EynollahIdCounter() if len(found_polygons_text_region) > 0: _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia = [_counter_marginals.next_region_id for _ in found_polygons_marginals] - xml_reading_order(page, order_of_texts, id_of_marginalia) + id_of_marginalia_left = [_counter_marginals.next_region_id for _ in found_polygons_marginals_left] + id_of_marginalia_right = [_counter_marginals.next_region_id for _ in found_polygons_marginals_right] + xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) for mm in range(len(found_polygons_text_region)): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', @@ -195,17 +196,29 @@ class EynollahXmlWriter(): else: ocr_textlines = None self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter, ocr_textlines) - - for mm in range(len(found_polygons_marginals)): + + for mm in range(len(found_polygons_marginals_left)): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_left[mm], page_coord))) page.add_TextRegion(marginal) - if ocr_all_textlines_marginals: - ocr_textlines = ocr_all_textlines_marginals[mm] + if ocr_all_textlines_marginals_left: + ocr_textlines = ocr_all_textlines_marginals_left[mm] else: ocr_textlines = None - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter, ocr_textlines) + #print(ocr_textlines, mm, len(all_found_textline_polygons_marginals_left[mm]) ) + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) + + for mm in range(len(found_polygons_marginals_right)): + marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_right[mm], page_coord))) + page.add_TextRegion(marginal) + if ocr_all_textlines_marginals_right: + ocr_textlines = ocr_all_textlines_marginals_right[mm] + else: + ocr_textlines = None + + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) for mm in range(len(found_polygons_text_region_img)): img_region = 
ImageRegionType(id=counter.next_region_id, Coords=CoordsType()) @@ -249,7 +262,7 @@ class EynollahXmlWriter(): return pcgts - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals_left,found_polygons_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, all_box_coord_marginals_left, all_box_coord_marginals_right, slopes, slopes_h, slopes_marginals_left, slopes_marginals_right, cont_page, polygons_lines_to_be_written_in_xml, ocr_all_textlines=None, ocr_all_textlines_h=None, ocr_all_textlines_marginals_left=None, ocr_all_textlines_marginals_right=None, ocr_all_textlines_drop=None, conf_contours_textregion=None, conf_contours_textregion_h=None): self.logger.debug('enter build_pagexml_full_layout') # create the file structure @@ -259,8 +272,9 @@ class EynollahXmlWriter(): counter = EynollahIdCounter() _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) - id_of_marginalia = [_counter_marginals.next_region_id for _ in found_polygons_marginals] - xml_reading_order(page, order_of_texts, id_of_marginalia) + id_of_marginalia_left = [_counter_marginals.next_region_id for _ in found_polygons_marginals_left] + id_of_marginalia_right = [_counter_marginals.next_region_id for _ in found_polygons_marginals_right] + xml_reading_order(page, order_of_texts, id_of_marginalia_left, id_of_marginalia_right) for mm in range(len(found_polygons_text_region)): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', @@ -285,15 +299,25 @@ class EynollahXmlWriter(): ocr_textlines = None self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter, ocr_textlines) - for mm in range(len(found_polygons_marginals)): + for mm in range(len(found_polygons_marginals_left)): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_left[mm], page_coord))) page.add_TextRegion(marginal) - if ocr_all_textlines_marginals: - ocr_textlines = ocr_all_textlines_marginals[mm] + if ocr_all_textlines_marginals_left: + ocr_textlines = ocr_all_textlines_marginals_left[mm] else: ocr_textlines = None - self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter, ocr_textlines) + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_left, mm, page_coord, 
all_box_coord_marginals_left, slopes_marginals_left, counter, ocr_textlines) + + for mm in range(len(found_polygons_marginals_right)): + marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals_right[mm], page_coord))) + page.add_TextRegion(marginal) + if ocr_all_textlines_marginals_right: + ocr_textlines = ocr_all_textlines_marginals_right[mm] + else: + ocr_textlines = None + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals_right, mm, page_coord, all_box_coord_marginals_right, slopes_marginals_right, counter, ocr_textlines) for mm in range(len(found_polygons_drop_capitals)): dropcapital = TextRegionType(id=counter.next_region_id, type_='drop-capital', From fdcae8dd6e35c15e7e627be9bcd0a9a940b8b316 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 28 Aug 2025 11:30:59 +0200 Subject: [PATCH 40/40] eynollah ocr: support using either a specific model name or a models directory (default model) --- src/eynollah/cli.py | 18 +++++++++--------- src/eynollah/eynollah.py | 28 +++++++++++++++++----------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 67fd57e..9dc326d 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -456,6 +456,11 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="directory of models", type=click.Path(exists=True, file_okay=False), ) +@click.option( + "--model_name", + help="Specific model file path to use for OCR", + type=click.Path(exists=True, file_okay=False), +) @click.option( "--tr_ocr", "-trocr/-notrocr", @@ -474,12 +479,6 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ is_flag=True, help="if this parameter set to true, cropped textline images will not be masked with textline contour.", ) -@click.option( - "--draw_texts_on_image", - "-dtoi/-ndtoi", - is_flag=True, - help="if this parameter set to true, the predicted texts will be displayed on an image.", -) @click.option( "--prediction_with_both_of_rgb_and_bin", "-brb/-nbrb", @@ -508,16 +507,17 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="Override log level globally to this", ) -def ocr(image, overwrite, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, min_conf_value_of_textline_text, log_level): +def ocr(image, overwrite, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, model_name, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, min_conf_value_of_textline_text, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) + + assert not model or not model_name, "model directory -m can not be set alongside specific model name --model_name" assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text -etit can not be set alongside transformer ocr -tr_ocr" assert not export_textline_images_and_text or not model, "Exporting textline and text -etit can not be set alongside model -m" assert not export_textline_images_and_text or not batch_size, "Exporting textline and text -etit can not be set alongside batch size -bs" assert not 
export_textline_images_and_text or not dir_in_bin, "Exporting textline and text -etit can not be set alongside directory of bin images -dib" assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit" - assert not export_textline_images_and_text or not draw_texts_on_image, "Exporting textline and text -etit can not be set alongside draw text on image -dtoi" assert not export_textline_images_and_text or not prediction_with_both_of_rgb_and_bin, "Exporting textline and text -etit can not be set alongside prediction with both rgb and bin -brb" assert (bool(image) ^ bool(dir_in)), "Either -i (single image) or -di (directory) must be provided, but not both." eynollah_ocr = Eynollah_ocr( @@ -528,10 +528,10 @@ def ocr(image, overwrite, dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, dir_in_bin=dir_in_bin, dir_out=out, dir_models=model, + model_name=model_name, tr_ocr=tr_ocr, export_textline_images_and_text=export_textline_images_and_text, do_not_mask_with_textline_contour=do_not_mask_with_textline_contour, - draw_texts_on_image=draw_texts_on_image, prediction_with_both_of_rgb_and_bin=prediction_with_both_of_rgb_and_bin, batch_size=batch_size, pref_of_dataset=dataset_abbrevation, diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 30e180d..ec2900f 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -5171,6 +5171,7 @@ class Eynollah_ocr: def __init__( self, dir_models, + model_name=None, dir_xmls=None, dir_in=None, image_filename=None, @@ -5181,7 +5182,6 @@ class Eynollah_ocr: batch_size=None, export_textline_images_and_text=False, do_not_mask_with_textline_contour=False, - draw_texts_on_image=False, prediction_with_both_of_rgb_and_bin=False, pref_of_dataset=None, min_conf_value_of_textline_text : Optional[float]=None, @@ -5193,10 +5193,10 @@ class Eynollah_ocr: self.dir_out = dir_out self.dir_xmls = dir_xmls self.dir_models = dir_models + self.model_name = model_name self.tr_ocr = tr_ocr self.export_textline_images_and_text = export_textline_images_and_text self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour - self.draw_texts_on_image = draw_texts_on_image self.dir_out_image_text = dir_out_image_text self.prediction_with_both_of_rgb_and_bin = prediction_with_both_of_rgb_and_bin self.pref_of_dataset = pref_of_dataset @@ -5210,7 +5210,10 @@ class Eynollah_ocr: if tr_ocr: self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" + if self.model_name: + self.model_ocr_dir = self.model_name + else: + self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) self.model_ocr.to(self.device) if not batch_size: @@ -5219,7 +5222,10 @@ class Eynollah_ocr: self.b_s = int(batch_size) else: - self.model_ocr_dir = dir_models + "/model_step_45000_ocr"#"/model_eynollah_ocr_cnnrnn_20250805"# + if self.model_name: + self.model_ocr_dir = self.model_name + else: + self.model_ocr_dir = dir_models + "/model_eynollah_ocr_cnnrnn_20250805" model_ocr = load_model(self.model_ocr_dir , compile=False) self.prediction_model = tf.keras.models.Model( @@ -5230,7 +5236,7 @@ class Eynollah_ocr: else: self.b_s = int(batch_size) - with open(os.path.join(self.model_ocr_dir, 
"characters_20250707_all_lang.txt"),"r") as config_file: + with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: characters = json.load(config_file) AUTOTUNE = tf.data.AUTOTUNE @@ -5271,7 +5277,7 @@ class Eynollah_ocr: img = cv2.imread(dir_img) - if self.draw_texts_on_image: + if self.dir_out_image_text: out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png') image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white") draw = ImageDraw.Draw(image_text) @@ -5306,7 +5312,7 @@ class Eynollah_ocr: textline_coords = np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) x,y,w,h = cv2.boundingRect(textline_coords) - if self.draw_texts_on_image: + if self.dir_out_image_text: total_bb_coordinates.append([x,y,w,h]) h2w_ratio = h/float(w) @@ -5363,7 +5369,7 @@ class Eynollah_ocr: unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) - if self.draw_texts_on_image: + if self.dir_out_image_text: font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists! font = ImageFont.truetype(font_path, 40) @@ -5463,7 +5469,7 @@ class Eynollah_ocr: dir_img_bin = os.path.join(self.dir_in_bin, file_name+'.png') img_bin = cv2.imread(dir_img_bin) - if self.draw_texts_on_image: + if self.dir_out_image_text: out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png') image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white") draw = ImageDraw.Draw(image_text) @@ -5508,7 +5514,7 @@ class Eynollah_ocr: if type_textregion=='drop-capital': angle_degrees = 0 - if self.draw_texts_on_image: + if self.dir_out_image_text: total_bb_coordinates.append([x,y,w,h]) w_scaled = w * image_height/float(h) @@ -5829,7 +5835,7 @@ class Eynollah_ocr: unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer) - if self.draw_texts_on_image: + if self.dir_out_image_text: font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists! font = ImageFont.truetype(font_path, 40)